# Scrape Douban movie short comments page by page (Selenium + requests + BeautifulSoup).
import time

import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
# Movie landing page opened in the browser; it also matches the Referer header below.
MOVIE_URL = "https://movie.douban.com/subject/33455421/?tag=%E7%83%AD%E9%97%A8&from=gaia"

# Locator for the link whose text contains ' 条'; clicked once to open the
# comment listing and again after every scraped page.
COMMENTS_LINK_XPATH = "//a[contains(text(),' 条')]"

# Seconds to wait for a single HTTP response before giving up.
REQUEST_TIMEOUT = 10

# Cookies sent with every requests.get() call.
# NOTE(review): these look like values copied from one browser session and
# will presumably expire; consider loading them from configuration instead of
# hard-coding them in source.
# The original literal listed '__utmc', '__utmz', '__utma' and '__utmb' twice;
# only the later value of each pair ever took effect, so only those are kept.
cookies = {
    'll': '"118254"',
    'bid': 'aXdoJiY-aR0',
    'ap_v': '0,6.0',
    '_pk_id.100001.4cf6': '302c3bdefe2a6c04.1685689908.',
    '__utmc': '223695111',
    '__utmz': '223695111.1685689908.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic',
    '__yadk_uid': 'hD9K77Vag4LPaL52WBu3tIOusW1m831I',
    '_vwo_uuid_v2': 'DA61E56E708BE65DB3A6C5A77FB5C29CB|e38615be2b4802b5e32f0f56bc7adc44',
    '__gads': 'ID=ba12cee8a1683aac-22eb64aab1e10079:T=1685689921:RT=1685690733:S=ALNI_MZlB3mZ0tgPF9c5nBhX4c6PBYxZNA',
    '__gpi': 'UID=00000c0e8f42923b:T=1685689921:RT=1685690733:S=ALNI_MaxDBxu9Ora9k5hJt6hWuENVsK1FA',
    '__utma': '223695111.960813586.1685689908.1685689908.1685693862.2',
    '__utmb': '223695111.0.10.1685693862',
    '_pk_ref.100001.4cf6': '%5B%22%22%2C%22%22%2C1685693862%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DHA_VPCeMD3sZH-y0jBKk29963IqtrUqQVP56SxjwgBVGpmM7MzaGMT8yzKr116Iw%26wd%3D%26eqid%3De0e168f600022f96000000066479962a%22%5D',
    '_pk_ses.100001.4cf6': '1',
}

# Browser-like request headers. Cookies are passed through the `cookies=`
# argument of requests.get() rather than a raw 'Cookie' header.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Referer': 'https://movie.douban.com/subject/33455421/?tag=%E7%83%AD%E9%97%A8&from=gaia',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}


def _text_of(tag):
    """Return the text content of *tag*, or '' when the element is missing."""
    return tag.get_text() if tag is not None else ''


def _title_of(tag):
    """Return the ``title`` attribute of *tag*, or None when it is missing."""
    return tag.get('title') if tag is not None else None


def _parse_comments(html):
    """Extract one ``[name, star, time, address, comment]`` row per comment.

    Every field is looked up relative to the comment's own ``comment-info``
    element instead of indexing document-wide result lists, so a comment that
    lacks a rating or a location no longer shifts the fields of the comments
    after it (or raises IndexError).

    NOTE(review): assumes the author link, rating, time and location elements
    are nested inside ``comment-info`` and that the ``short`` text follows it
    in document order -- confirm against the live page markup.
    """
    soup = bs(html, 'lxml')
    rows = []
    for info in soup.find_all(attrs={'class': {'comment-info'}}):
        name_tag = info.find('a', attrs={'class': {''}})
        rating_tag = info.find('span', attrs={'class': {'rating'}})
        time_tag = info.find(attrs={'class': {'comment-time'}})
        location_tag = info.find(attrs={'class': {'comment-location'}})
        # The comment body is not searched for inside `info`; take the first
        # 'short' element that appears after it in the document.
        short_tag = info.find_next(attrs={'class': {'short'}})
        rows.append([
            _text_of(name_tag),
            _title_of(rating_tag),
            _title_of(time_tag),
            _text_of(location_tag),
            _text_of(short_tag),
        ])
    return rows


def _save_comments(rows, page_index):
    """Append every row to ``comment<page_index>.txt``, blank-line separated.

    All parsed rows are written (the file is opened once), so a page holding
    fewer than 20 comments no longer raises IndexError.
    """
    with open('comment{}.txt'.format(page_index), 'a', encoding='utf-8') as f:
        f.writelines(str(row) + '\n\n' for row in rows)


# input() returns a string; range() needs an integer page count.
page_count = int(input('请输入爬取页面的数量:'))

driver = webdriver.Chrome()
try:
    driver.get(MOVIE_URL)
    driver.find_element(By.XPATH, COMMENTS_LINK_XPATH).click()

    for cishu in range(page_count):
        # Give the newly opened page time to load before reading its URL.
        time.sleep(3)
        driver.switch_to.window(driver.window_handles[-1])
        page_url = driver.current_url

        response = requests.get(
            page_url,
            cookies=cookies,
            headers=headers,
            timeout=REQUEST_TIMEOUT,
        )
        # Fail loudly on an HTTP error instead of parsing an error page.
        response.raise_for_status()

        x = _parse_comments(response.text)
        print(x)
        _save_comments(x, cishu)

        # NOTE(review): this reuses the locator of the initial click; confirm
        # that it really advances to the next page of comments.
        try:
            driver.find_element(By.XPATH, COMMENTS_LINK_XPATH).click()
        except NoSuchElementException:
            # No such link on the current page: nothing further to open.
            break
finally:
    # Always shut the browser down, even when scraping fails midway.
    driver.quit()