
Web Scraping

2023-06-09 14:54 Author: alpha-H111

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup

# Goal: scrape every chapter title and chapter body of "Romance of the Three
# Kingdoms" from http://www.shicimingju.com/book/sanguoyanyi.html
if __name__ == "__main__":
    # Fetch the table-of-contents page
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    }
    url = 'http://www.shicimingju.com/book/sanguoyanyi.html'
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'  # the site omits a charset header and requests often mis-guesses, so force UTF-8 to avoid mojibake
    page_text = response.text

    # Parse the chapter titles and detail-page URLs out of the TOC page.
    # 1. Instantiate a BeautifulSoup object and load the page source into it
    soup = BeautifulSoup(page_text, 'lxml')
    # Each chapter is an <li> inside the .book-mulu list
    li_list = soup.select('.book-mulu > ul > li')
    fp = open('./sanguo.txt', 'w', encoding='utf-8')
    for li in li_list:
        title = li.a.string
        detail_url = 'http://www.shicimingju.com' + li.a['href']
        # Request the detail page and parse the chapter body out of it
        detail_response = requests.get(url=detail_url, headers=headers)
        detail_response.encoding = 'utf-8'
        detail_soup = BeautifulSoup(detail_response.text, 'lxml')
        div_tag = detail_soup.find('div', class_='chapter_content')
        # .text flattens the chapter body to plain text
        content = div_tag.text
        fp.write(title + ':' + content + '\n')
        print(title, 'scraped successfully!')
    fp.close()
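The BeautifulSoup calls above (select() with a CSS selector, find() by class, .string, .text, and ['href'] attribute lookup) can be tried offline; a minimal sketch against a made-up HTML snippet:

from bs4 import BeautifulSoup

html = '''
<div class="book-mulu"><ul>
  <li><a href="/book/sanguoyanyi/1.html">Chapter 1</a></li>
  <li><a href="/book/sanguoyanyi/2.html">Chapter 2</a></li>
</ul></div>
'''
soup = BeautifulSoup(html, 'lxml')
for li in soup.select('.book-mulu > ul > li'):
    # .string gives the anchor text, ['href'] the attribute value
    print(li.a.string, li.a['href'])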


A second example: scraping the ranked headline list from NetEase news (news.163.com) with requests and lxml's XPath support.

import requests
from lxml import etree

if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }
    url = 'https://news.163.com/'
    page_text = requests.get(url=url, headers=headers).text
    # print(page_text)
    tree = etree.HTML(page_text)
    # Absolute XPath copied from the browser; it is tied to the page layout
    # at the time of writing and will break if 163.com changes its markup
    li_list = tree.xpath('/html/body/div[1]/div[3]/div[2]/div[3]/div[3]/div[10]//li')
    for li in li_list:
        rank = li.xpath('./em/text()')   # xpath() always returns a list
        # print(rank)
        title = li.xpath('./a/@title')
        # print(title)
        num = li.xpath('./span/text()')[0]
        print(num)
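The relative XPath expressions in the loop (./em/text(), ./a/@title, ./span/text()) are easier to follow on a self-contained snippet; a minimal sketch with made-up markup:

from lxml import etree

html = '''
<ul>
  <li><em>1</em><a title="first story" href="/a">first</a><span>1024</span></li>
  <li><em>2</em><a title="second story" href="/b">second</a><span>512</span></li>
</ul>
'''
tree = etree.HTML(html)
for li in tree.xpath('//li'):
    rank = li.xpath('./em/text()')[0]    # text of the <em> child
    title = li.xpath('./a/@title')[0]    # value of the title attribute
    num = li.xpath('./span/text()')[0]
    print(rank, title, num)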

 

A quick tour of XPath syntax, run against a local test.html:

#!/usr/bin/env python
# -*- coding:utf-8 -*-
from lxml import etree

if __name__ == "__main__":
    # Instantiate an etree object and load the source to be parsed into it.
    # etree.parse() defaults to a strict XML parser, so an HTMLParser is
    # passed here so that ordinary (non-XHTML) HTML parses without errors.
    tree = etree.parse('test.html', etree.HTMLParser())
    # r = tree.xpath('/html/body/div')                           # absolute path from the root
    # r = tree.xpath('/html//div')                               # // skips intermediate levels
    # r = tree.xpath('//div')                                    # all <div> elements anywhere
    # r = tree.xpath('//div[@class="song"]')                     # attribute filter
    # r = tree.xpath('//div[@class="tang"]//li[5]/a/text()')[0]  # XPath indexes from 1
    # r = tree.xpath('//li[7]//text()')                          # all text under the node
    # r = tree.xpath('//div[@class="tang"]//text()')
    r = tree.xpath('//div[@class="song"]/img/@src')              # attribute value
    print(r)
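Since test.html is not included with the post, the same expressions can be exercised against an inline string via etree.HTML; a minimal sketch with made-up content:

from lxml import etree

html = '''
<html><body>
  <div class="song"><img src="http://example.com/cover.jpg"></div>
  <div class="tang"><ul>
    <li><a href="/p1">poem one</a></li>
    <li><a href="/p2">poem two</a></li>
  </ul></div>
</body></html>
'''
tree = etree.HTML(html)
print(tree.xpath('//div[@class="song"]/img/@src'))         # ['http://example.com/cover.jpg']
print(tree.xpath('//div[@class="tang"]//li[2]/a/text()'))  # ['poem two'] -- XPath indexes from 1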




Finally, regular-expression matching with the re module:

import re  # regular-expression module

pattern = r'he\w+'  # match 'he' followed by word characters; the r prefix keeps '\w' from being escaped
string = 'hello world HELLO WORLD'
match = re.findall(pattern, string, re.I)  # case-insensitive search
print(match)  # ['hello', 'HELLO']

string = '你好世界hello world HELLO WORLD'
match = re.findall(pattern, string)  # case-sensitive search
print(match)  # ['hello']


pattern = r'https://(.*?)(\d+).com/'  # a pattern using the non-greedy quantifier '?'
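The original post defines this pattern but never runs it; a minimal sketch applying it to the URL used below, showing that findall() returns one tuple per match when the pattern has multiple groups:

import re

pattern = r'https://(.*?)(\d+).com/'  # non-greedy prefix, then one or more digits
string = 'https://www.hao123.com/'
print(re.findall(pattern, string))  # [('www.hao', '123')]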


import re  # regular-expression module

pattern = r'https://(.*?)'  # non-greedy: the group matches as little as possible
string = 'https://www.hao123.com/'
match = re.findall(pattern, string)
print(match)  # [''] -- with nothing after the group, non-greedy settles for the empty string

pattern = r'https://(.*)'  # greedy: the group matches as much as possible
match = re.findall(pattern, string)
print(match)  # ['www.hao123.com/']
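In scraping, the greedy/non-greedy distinction matters once the pattern has a closing anchor; a minimal sketch (made-up HTML) extracting the text between tags:

import re

html = '<li>Chapter 1</li><li>Chapter 2</li>'
print(re.findall(r'<li>(.*?)</li>', html))  # ['Chapter 1', 'Chapter 2']
print(re.findall(r'<li>(.*)</li>', html))   # ['Chapter 1</li><li>Chapter 2'] -- greedy overshoots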



