欢迎光临散文网 会员登录 & 注册

呃呃呃

2023-06-09 18:25 作者:想吸莱莱  | 我要投稿

import  requests

from bs4 import BeautifulSoup as bs

from selenium import webdriver

import time


# Scrape short comments from a Douban movie page.
#
# Selenium drives a Chrome window from page to page; for every page the
# browser lands on, the same URL is downloaded with requests and parsed with
# BeautifulSoup, and the extracted rows are appended to comment<N>.txt.

START_URL = "https://movie.douban.com/subject/33455421/?tag=%E7%83%AD%E9%97%A8&from=gaia"

# Anchor whose text contains ' 条'. It is clicked once to leave the start page
# and once more after each page has been saved.
COMMENT_LINK_XPATH = "//a[contains(text(),' 条')]"

# Seconds to wait for the HTTP download before giving up instead of hanging.
REQUEST_TIMEOUT = 10

# Session cookies sent with every download. The original literal listed
# __utmc/__utmz/__utma/__utmb twice; a dict keeps only the last value of a
# repeated key, so only those effective values are spelled out here.
COOKIES = {
    'll': '"118254"',
    'bid': 'aXdoJiY-aR0',
    'ap_v': '0,6.0',
    '_pk_id.100001.4cf6': '302c3bdefe2a6c04.1685689908.',
    '__utmc': '223695111',
    '__utmz': '223695111.1685689908.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic',
    '__yadk_uid': 'hD9K77Vag4LPaL52WBu3tIOusW1m831I',
    '_vwo_uuid_v2': 'DA61E56E708BE65DB3A6C5A77FB5C29CB|e38615be2b4802b5e32f0f56bc7adc44',
    '__gads': 'ID=ba12cee8a1683aac-22eb64aab1e10079:T=1685689921:RT=1685690733:S=ALNI_MZlB3mZ0tgPF9c5nBhX4c6PBYxZNA',
    '__gpi': 'UID=00000c0e8f42923b:T=1685689921:RT=1685690733:S=ALNI_MaxDBxu9Ora9k5hJt6hWuENVsK1FA',
    '__utma': '223695111.960813586.1685689908.1685689908.1685693862.2',
    '__utmb': '223695111.0.10.1685693862',
    '_pk_ref.100001.4cf6': '%5B%22%22%2C%22%22%2C1685693862%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DHA_VPCeMD3sZH-y0jBKk29963IqtrUqQVP56SxjwgBVGpmM7MzaGMT8yzKr116Iw%26wd%3D%26eqid%3De0e168f600022f96000000066479962a%22%5D',
    '_pk_ses.100001.4cf6': '1',
}

# Browser-like request headers. Cookies are not repeated here because they are
# passed through the ``cookies=`` argument of requests.get.
HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Referer': 'https://movie.douban.com/subject/33455421/?tag=%E7%83%AD%E9%97%A8&from=gaia',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}


def _extract_comments(soup):
    """Return one ``[name, star, time, location, text]`` list per comment.

    Each field is collected page-wide once and the resulting lists are paired
    by position, which is the pairing the script has always used. ``zip``
    stops at the shortest list, so a page on which some field occurs fewer
    times than there are comments yields fewer rows instead of raising
    ``IndexError``.

    NOTE(review): positional pairing assumes every comment carries every
    field; if one comment lacks e.g. a rating, later rows may be shifted.
    Confirm against the live page markup.
    """
    infos = soup.find_all(attrs={'class': {'comment-info'}})
    names = soup.find_all('a', attrs={'class': {''}})
    stars = soup.find_all('span', attrs={'class': {'rating'}})
    times = soup.find_all(attrs={'class': {'comment-time'}})
    locations = soup.find_all(attrs={'class': {'comment-location'}})
    texts = soup.find_all(attrs={'class': {'short'}})

    return [
        [
            name.get_text(),
            star.get('title'),
            posted.get('title'),
            location.get_text(),
            text.get_text(),
        ]
        # ``infos`` only caps the row count at the number of comment blocks.
        for _info, name, star, posted, location, text in zip(
            infos, names, stars, times, locations, texts
        )
    ]


def _save_comments(rows, page_index):
    """Append every row to ``comment<page_index>.txt``, blank-line separated."""
    # Open the file once and write however many rows the page really had; the
    # earlier fixed ``range(20)`` crashed on pages with fewer than 20 comments.
    with open('comment{}.txt'.format(page_index), 'a', encoding='utf-8') as f:
        for row in rows:
            f.write(str(row))
            f.write('\n\n')


def main():
    """Ask how many pages to scrape, then scrape and save them one by one.

    Raises:
        ValueError: if the text typed at the prompt is not an integer.
    """
    # input() returns a string; range() needs an int.
    page_count = int(input('请输入爬取页面的数量:'))

    driver = webdriver.Chrome()
    try:
        driver.get(START_URL)
        # find_element("xpath", ...) is accepted by Selenium 3 and 4 alike;
        # the find_element_by_xpath shortcut no longer exists in Selenium 4.3+.
        driver.find_element("xpath", COMMENT_LINK_XPATH).click()

        for page_index in range(page_count):
            # Presumably gives the freshly opened page time to load.
            time.sleep(3)
            driver.switch_to.window(driver.window_handles[-1])

            page_url = driver.current_url
            response = requests.get(
                page_url,
                cookies=COOKIES,
                headers=HEADERS,
                timeout=REQUEST_TIMEOUT,
            )
            soup = bs(response.text, 'lxml')

            rows = _extract_comments(soup)
            print(rows)
            _save_comments(rows, page_index)

            driver.find_element("xpath", COMMENT_LINK_XPATH).click()
    finally:
        # Always shut the browser down, even when a page fails halfway.
        driver.quit()


if __name__ == "__main__":
    main()


呃呃呃的评论 (共 条)

分享到微博请遵守国家法律