尚硅谷Python爬虫教程小白零基础速通(含python基础+爬虫案例)

P62 urllib_ajax的代码。我在里面添加了随机休眠时间(每次请求前随机等待5~15秒),如果不想要,可以删去程序入口循环里的time.sleep调用。
一定要注意base_url是否拼写正确。
import random
import time
import urllib.parse
import urllib.request


# Build the request object
def create_request(page):
    """Build the GET request for one page of the Douban chart listing.

    Args:
        page: 1-based page number. Each page holds 20 entries, so the
            ``start`` query parameter is ``(page - 1) * 20``.

    Returns:
        A ``urllib.request.Request`` for the page URL carrying a browser
        ``User-Agent`` header.
    """
    # The trailing '&' lets the urlencoded query be appended directly.
    base_url = 'https://movie.douban.com/j/chart/top_list?type=24&interval_id=100:90&action=&'

    # Presumably needed so the site treats the request as coming from a
    # regular browser -- verify against the site's current behaviour.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
    }

    data = {
        'start': (page - 1) * 20,
        'limit': 20,
    }
    query = urllib.parse.urlencode(data)

    return urllib.request.Request(url=base_url + query, headers=headers)


# Fetch the response body
def get_content(request):
    """Send *request* and return the response body decoded as UTF-8.

    The response is used as a context manager so the underlying connection
    is closed even if reading or decoding raises.
    """
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')


# Save one page to disk
def down_load(page, content):
    """Write *content* to ``douban<page>.json`` in the working directory."""
    with open('douban' + str(page) + '.json', 'w', encoding='utf-8') as fp:
        fp.write(content)


# Program entry point
def main():
    """Prompt for a page range and download every page in it (inclusive)."""
    start_page = int(input('请输入起始的页码'))
    end_page = int(input('请输入结束的页码'))

    for page in range(start_page, end_page + 1):
        # Random 5-15 s pause before every request; delete this line if
        # the throttling is not wanted.
        time.sleep(random.randint(5, 15))
        request = create_request(page)
        content = get_content(request)
        down_load(page, content)


if __name__ == '__main__':
    main()
P78中,selenium代码如果按照老师教的写法(如下)来写,运行时会出现警告 DeprecationWarning: executable_path has been deprecated, please pass in a Service object
browser = webdriver.Edge(path)
但是并不影响继续运行

如果不想看到这个警告,可以改为传入Service对象(下面以Chrome为例):
# (1) Import selenium
from selenium import webdriver
from selenium.webdriver.chrome.service import Service  # Service wraps the driver executable

# (2) Create the browser object. Passing a Service instead of the bare
# driver path avoids the "executable_path has been deprecated" warning.
service = Service(executable_path='chromedriver.exe')
browser = webdriver.Chrome(service=service)

try:
    # (3) Visit the site and print the page source
    url = 'https://www.jd.com'
    browser.get(url)
    page_source = browser.page_source
    print(page_source)
finally:
    # Always shut the browser and its driver process down, even if the
    # page load fails; otherwise they can be left running.
    browser.quit()