# 爬虫 — web crawler practice scripts
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup

# Task: scrape every chapter title and chapter body of "Romance of the Three
# Kingdoms" from http://www.shicimingju.com/book/sanguoyanyi.html and save
# them to ./sanguo.txt.
if __name__ == "__main__":
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    }
    url = 'http://www.shicimingju.com/book/sanguoyanyi.html'
    # Fetch the index page. requests guesses ISO-8859-1 for this site, which
    # garbles the Chinese text, so force UTF-8 decoding explicitly.
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'
    page_text = response.text
    # Parse the index page and pull out each chapter's <li> entry.
    soup = BeautifulSoup(page_text, 'lxml')
    li_list = soup.select('.book-mulu > ul > li')
    # 'with' guarantees the output file is closed even if a request below
    # fails (the original opened the handle and never closed it).
    with open('./sanguo.txt', 'w', encoding='utf-8') as fp:
        for li in li_list:
            title = li.a.string
            # The hrefs on the index page are site-relative; prepend the host.
            detail_url = 'http://www.shicimingju.com' + li.a['href']
            # Fetch the detail page and extract the chapter body.
            detail_response = requests.get(url=detail_url, headers=headers)
            detail_response.encoding = 'utf-8'
            detail_soup = BeautifulSoup(detail_response.text, 'lxml')
            div_tag = detail_soup.find('div', class_='chapter_content')
            content = div_tag.text
            fp.write(title + ':' + content + '\n')
            print(title, '爬取成功!!!')
import requests
from lxml import etree

# Task: scrape the NetEase news ranking list and print each entry's
# comment count.
if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }
    url = 'https://news.163.com/'
    page_text = requests.get(url=url, headers=headers).text
    tree = etree.HTML(page_text)
    # NOTE(review): this absolute path is brittle — it breaks whenever the
    # site's layout changes; verify it still matches the live page.
    li_list = tree.xpath('/html/body/div[1]/div[3]/div[2]/div[3]/div[3]/div[10]//li')
    for li in li_list:
        # xpath() returns a list that is empty when the node is missing,
        # so guard before indexing instead of crashing with IndexError.
        rank = li.xpath('./em/text()')
        title = li.xpath('./a/@title')
        nums = li.xpath('./span/text()')
        if not nums:
            continue  # skip entries without a comment-count span
        num = nums[0]
        print(num)
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from lxml import etree

# XPath practice against a local test.html file.
if __name__ == "__main__":
    # Use an HTML-aware parser: etree.parse() defaults to the strict XML
    # parser, which raises XMLSyntaxError on ordinary (non-well-formed) HTML.
    tree = etree.parse('test.html', etree.HTMLParser())
    # Earlier exercises, kept for reference:
    # r = tree.xpath('/html/body/div')
    # r = tree.xpath('/html//div')
    # r = tree.xpath('//div')
    # r = tree.xpath('//div[@class="song"]')
    # r = tree.xpath('//div[@class="tang"]//li[5]/a/text()')[0]
    # r = tree.xpath('//li[7]//text()')
    # r = tree.xpath('//div[@class="tang"]//text()')
    # @src extracts the attribute value from every matching <img>.
    r = tree.xpath('//div[@class="song"]/img/@src')
    print(r)
import re  # regular-expression module

# Match any word starting with 'he'; the raw string keeps '\w' unescaped.
pattern = r'he\w+'
string = 'hello world HELLO WORLD'  # text to search
# Case-insensitive search (re.IGNORECASE is the long form of re.I).
match = re.compile(pattern, re.IGNORECASE).findall(string)
print(match)

string = '你好世界hello world HELLO WORLD'  # text to search
match = re.compile(pattern).findall(string)  # case-sensitive this time
print(match)

# Non-greedy group demo pattern ('?' makes '.*' match as little as possible).
pattern = r'https://(.*?)(\d+).com/'
import re  # regular-expression module

# Greedy vs non-greedy group comparison on the same URL.
pattern = r'https://(.*?)'  # non-greedy: the group grabs as little as possible
string = 'https://www.hao123.com/'  # text to search
match = re.compile(pattern).findall(string)
print(match)  # [''] — the non-greedy group matches the empty string

pattern = r'https://(.*)'  # greedy: the group takes everything to the end
match = re.compile(pattern).findall(string)
print(match)  # ['www.hao123.com/']