Python Web Scraping Crash Course in 8 Days (Complete Edition): hands-on cases scraping data from various websites
Notes on the individual exercises
p10 KFC homework, printing the results directly in PyCharm:
import requests

if __name__ == '__main__':
    keyword = input('Enter a location: ')
    url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
    params = {
        'cname': '',
        'pid': '',
        'keyword': keyword,
        'pageIndex': '1',
        'pageSize': '10',
    }
    header = {
        # fill in your own User-Agent here; I won't paste mine
    }
    response = requests.post(url=url, params=params, headers=header)
    dict_data = response.json()
    # Look at the JSON file: the top level is two key-value pairs. The first
    # holds the number of matching records; the details live in the second
    # one ('Table1'), so that is the one we iterate over.
    for item in dict_data['Table1']:
        print(item['provinceName'] + item['cityName'] + item['addressDetail'])
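Because the first key reports how many records matched, you can page through every store instead of stopping at the first 10. A minimal sketch, assuming the count sits under a 'Table' key with a 'rowcount' field (key names taken from one observed response; check your own JSON to confirm):

import requests

url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
header = {}  # fill in your own User-Agent

def fetch_page(keyword, page_index, page_size=10):
    params = {
        'cname': '', 'pid': '', 'keyword': keyword,
        'pageIndex': str(page_index), 'pageSize': str(page_size),
    }
    return requests.post(url=url, params=params, headers=header).json()

keyword = '北京'
first = fetch_page(keyword, 1)
total = int(first['Table'][0]['rowcount'])  # assumption: total count lives at Table[0].rowcount
pages = (total + 9) // 10  # ceiling division by the page size
for page in range(1, pages + 1):
    for item in fetch_page(keyword, page)['Table1']:
        print(item['provinceName'] + item['cityName'] + item['addressDetail'])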
p13's target site is no longer reachable, so I practiced on Douban's Top 250 instead, scraping the poster images:
import os
import time

import requests
from bs4 import BeautifulSoup

t1 = time.time()
header = {}  # fill in your own User-Agent
img_list = []
if not os.path.exists('./img'):
    # create the output folder if it does not exist yet
    os.mkdir('./img')
for start_num in range(0, 26, 25):  # only scrape the first 50 posters (two pages)
    url = f"https://movie.douban.com/top250?start={start_num}&filter="
    response = requests.get(url=url, headers=header).text
    soup = BeautifulSoup(response, "html.parser")
    all_i = soup.find_all('img')
    for img in all_i:
        img_list.append(img['src'])
    # the last <img> on each page is an unwanted QR code (two in total), drop it
    img_list.pop()
count = 0
for img_address in img_list:
    count += 1
    res_img = requests.get(url=img_address, headers=header).content
    with open(f'./img/{count}.jpg', 'wb') as file:
        file.write(res_img)
print('Total time:', time.time() - t1)
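Popping the QR code off the end works, but it breaks the moment Douban shuffles the page footer. A more robust alternative is to select only the poster images; a minimal sketch, assuming the posters sit inside div.pic containers (true for the Top 250 layout at the time of writing):

import requests
from bs4 import BeautifulSoup

header = {}  # fill in your own User-Agent
html = requests.get('https://movie.douban.com/top250?start=0&filter=', headers=header).text
soup = BeautifulSoup(html, 'html.parser')
# only the <img> tags inside the poster containers, so the QR code never gets in
posters = [img['src'] for img in soup.select('div.pic img')]
print(len(posters), posters[:3])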
p24: switching .text to .content (raw bytes) really is the right fix, and with the output file opened with encoding='utf-8' most of the mojibake goes away. Some useless data is still mixed into the result, though, and needs further analysis.
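The reason .text garbles here: when the Content-Type header carries no charset, requests falls back to ISO-8859-1, while the page body itself is UTF-8. A minimal sketch of the two usual fixes (the exercise below uses the first one):

import requests

header = {}  # fill in your own User-Agent
resp = requests.get('https://www.shicimingju.com/book/sanguoyanyi.html', headers=header)
print(resp.encoding)  # often ISO-8859-1 when the header omits a charset

# Fix 1: take the raw bytes and decode them yourself
html = resp.content.decode('utf-8')

# Fix 2: let requests re-guess the encoding from the body, then use .text
resp.encoding = resp.apparent_encoding
html = resp.text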
import time

import requests
from bs4 import BeautifulSoup

header = {}  # fill in your own User-Agent
t1 = time.time()
page_text = requests.get(url='https://www.shicimingju.com/book/sanguoyanyi.html', headers=header).content
soup = BeautifulSoup(page_text, 'lxml')
li_list = soup.select(".book-mulu > ul > li")
fp = open('./sanguo.txt', 'w', encoding='utf-8')
for li in li_list:
    title = li.a.string
    detail_url = 'https://www.shicimingju.com' + li.a['href']
    # the original passed proxies=proxy without ever defining it (a NameError);
    # add proxies=... here only if you route through one (see the sketch below)
    detail_page_text = requests.get(url=detail_url, headers=header).content
    detail_soup = BeautifulSoup(detail_page_text, 'lxml')
    div_tag = detail_soup.find('div', class_='chapter_content')
    content = div_tag.text
    fp.write(title + ':' + content + '\n')
    print(title, 'scraped successfully')
fp.close()
print('Total time:', time.time() - t1)
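If you do want the proxies argument that the original code referenced, requests takes a plain dict mapping scheme to proxy URL. A minimal sketch; the address below is a placeholder, not a working proxy:

import requests

# hypothetical local proxy; substitute one you actually control
proxy = {
    'http': 'http://127.0.0.1:7890',
    'https': 'http://127.0.0.1:7890',
}
resp = requests.get('https://www.shicimingju.com', proxies=proxy, timeout=10)
print(resp.status_code)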
p25: scraping the second-hand housing listing titles from 58.com
import time

import requests
from lxml import etree

header = {}  # fill in your own User-Agent
t1 = time.time()
response = requests.get(url='https://www.58.com/ershoufang/', headers=header, timeout=30).text
tree = etree.HTML(response)
x_tree = tree.xpath('//div[@class="cb"]//tr/td[@class="t"]/a/text()')
fp = open('./二手房标题.txt', 'w', encoding='utf-8')
for item in x_tree:
    fp.write(item + '\n')
fp.close()
print(x_tree)
# str + float cannot be concatenated with '+', so pass them as separate arguments
print('Total scraping time:', time.time() - t1)
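Sites like 58.com rate-limit scrapers, so a timeout alone may not save a run; wiring automatic retries into a session helps. A minimal sketch using requests' bundled urllib3 Retry (the backoff values are my own guess, tune them as needed):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
retry = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
session.mount('https://', HTTPAdapter(max_retries=retry))
session.mount('http://', HTTPAdapter(max_retries=retry))

header = {}  # fill in your own User-Agent
response = session.get('https://www.58.com/ershoufang/', headers=header, timeout=30)
print(response.status_code)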

