尚硅谷Python爬虫教程小白零基础速通(含python基础+爬虫案例)
P62 urllib_ajax的代码。我在里面添加了随机休眠时间(每次请求前随机等待5到15秒),如果不想要,可以删去程序入口循环里的 time.sleep 调用。
一定要注意base_url是否拼写正确。
import urllib.parse
import urllib.request
import random
import time
def create_request(page):
    """Build the GET request for one page of the Douban top-list chart.

    Each page holds 20 entries, so page ``n`` (1-based) starts at offset
    ``(n - 1) * 20``.

    Args:
        page: 1-based page number.

    Returns:
        A ``urllib.request.Request`` for the page URL, carrying a
        browser-style ``User-Agent`` header.
    """
    # base_url already ends with '&', so the encoded query string is
    # appended to it directly.
    base_url = 'https://movie.douban.com/j/chart/top_list?type=24&interval_id=100:90&action=&'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    }
    data = {
        'start': (page - 1) * 20,
        'limit': 20,
    }
    url = base_url + urllib.parse.urlencode(data)
    return urllib.request.Request(url=url, headers=headers)
def get_content(request):
    """Send *request* and return the response body decoded as UTF-8.

    Args:
        request: A URL string or ``urllib.request.Request`` accepted by
            ``urllib.request.urlopen``.

    Returns:
        The full response body as a ``str``.
    """
    # The context manager closes the response even if reading or decoding
    # raises, so the connection is not leaked.
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')
def down_load(page, content):
    """Save one page of content to ``douban<page>.json``.

    The file is created in the current working directory, written as UTF-8,
    and overwritten if it already exists.

    Args:
        page: Page number, used only to build the file name.
        content: Text to write to the file.
    """
    with open(f'douban{page}.json', 'w', encoding='utf-8') as fp:
        fp.write(content)
# Program entry point: ask for an inclusive page range, then fetch and save
# each page in turn.
start_page = int(input('请输入起始的页码'))
end_page = int(input('请输入结束的页码'))
for page in range(start_page, end_page + 1):
    # Optional random 5-15 second pause before each request; delete this
    # line to download without delays.
    time.sleep(random.randint(5, 15))
    request = create_request(page)
    content = get_content(request)
    down_load(page, content)
P78中,selenium代码如果按照老师教的写法(见下一行),会出现 DeprecationWarning: executable_path has been deprecated, please pass in a Service object
browser = webdriver.Edge(path)
但是并不影响继续运行

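但是如果不想看到这个警告,可以这样修改(下面以Chrome为例;Edge同理,改为从 selenium.webdriver.edge.service 导入 Service,并使用 webdriver.Edge(service=s)):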
# (1) Import selenium
from selenium import webdriver
from selenium.webdriver.chrome.service import Service  # import the Service class

# (2) Create the browser object
service = Service(executable_path='chromedriver.exe')  # wrap the driver path in a Service object
browser = webdriver.Chrome(service=service)  # pass the Service object instead of the bare path

# (3) Visit the website
url = 'https://www.jd.com'
try:
    browser.get(url)
    content = browser.page_source
    print(content)
finally:
    # Close the browser and stop the driver process once the page source
    # has been printed, even if loading the page fails.
    browser.quit()

