# Demo of the Python requests module.
import requests
from bs4 import BeautifulSoup
import csv
import re
import time
import json
import os.path
import random
# Scraped results, keyed by the date string passed to func(). func() stores a
# dict per date that maps 'total' and each district name to a two-element
# list [confirmed, asymptomatic].
record ={}
# Sogou WeChat search: origin used to resolve relative result links, and the
# query URL template (``{}`` is replaced by the date string).
_SOGOU_ORIGIN = 'https://weixin.sogou.com'
_SEARCH_URL = (
    'https://weixin.sogou.com/weixin?type=2&s_from=input'
    '&query={}+各区确诊病例++shanghaifabu&ie=utf8&_sug_=y&_sug_type_='
    '&w=01019900&sut=523&sst0=1652065715152&lkt=0%2C0%2C0'
)
_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 6.1; WOW64) '
    'appleWebKit/537.36 (KHTML, like Gecko) Chrome/'
    '86.0.4240.75 Safari/537.36'
)
# Number of result slots probed on the search page (ids ..._0 to ..._7).
_MAX_RESULTS = 8
# Clause separators inside a paragraph: ASCII comma, full-width comma and the
# enumeration comma. The full-width comma is included so that Chinese
# punctuation is really normalised before splitting.
_CLAUSE_SEPARATORS = re.compile(r'[,,、]')
_DIGITS = re.compile(r'\d+')


def _find_article_link(soup, date):
    """Return the absolute link of the matching search result, or None."""
    for n in range(_MAX_RESULTS):
        accounts = soup.select('#sogou_vr_11002601_account_{}'.format(n))
        titles = soup.select('#sogou_vr_11002601_title_{}'.format(n))
        if not accounts or not titles:
            # The page may list fewer results than we probe for.
            continue
        if accounts[0].text != '上海发布':
            continue
        title = titles[0]
        titletext = title.text
        if (date in titletext and '本市各区' in titletext
                and '确诊病例' in titletext and '居住地信息' in titletext):
            print(titletext)
            return _SOGOU_ORIGIN + title.get('href')
    return None


def _parse_total(paragraph):
    """Return (confirmed, asymptomatic) totals found in *paragraph*, or None."""
    for clause in _CLAUSE_SEPARATORS.split(paragraph):
        if '确诊' in clause and '无症状' in clause and '隔离' not in clause:
            numbers = _DIGITS.findall(clause)
            if len(numbers) >= 2:
                return int(numbers[0]), int(numbers[1])
    return None


def _parse_district(paragraph):
    """Return (district, confirmed, asymptomatic), or None without a district."""
    # Convert Chinese punctuation to ASCII commas before matching/splitting.
    normalized = _CLAUSE_SEPARATORS.sub(',', paragraph)
    match = re.search(r',.+区', normalized)
    if match is None:
        return None
    # Drop the leading comma, then any HTML tags caught by the greedy match.
    district = re.sub(r'<.*>', '', match.group()[1:])
    confirmed = 0
    asymptomatic = 0
    for clause in normalized.split(','):
        numbers = _DIGITS.findall(clause)
        if not numbers:
            continue
        has_confirmed = '确诊' in clause
        has_asymptomatic = '无症状' in clause
        if has_confirmed and not has_asymptomatic:
            confirmed = int(numbers[0])
        elif has_asymptomatic and not has_confirmed:
            asymptomatic = int(numbers[0])
    return district, confirmed, asymptomatic


def func(date):
    """Scrape the per-district case counts published for *date*.

    Searches Sogou's WeChat index for the '上海发布' article whose title
    contains *date*, follows the redirect page to the article itself, and
    parses every paragraph that starts with *date*. The first such paragraph
    is read as the city-wide totals, the remaining ones as one district each.

    Results are stored in the module-level ``record`` dict under
    ``record[date]``: key ``'total'`` and one key per district, each mapping
    to ``[confirmed, asymptomatic]``. Progress and results are printed.

    Args:
        date: Date text as it appears in the article, e.g. ``'5月12日'``.

    Returns:
        None.

    Raises:
        requests.RequestException: if any of the three HTTP requests fails
            (including an unusable article URL extracted from the redirect
            page).
    """
    record[date] = {}
    headers = {'user-agent': _USER_AGENT}

    search = requests.get(_SEARCH_URL.format(date), headers=headers, timeout=5)
    # Replay the cookies handed out by the search page on later requests.
    headers['cookie'] = ''.join(
        '{}={}; '.format(name, value)
        for name, value in search.cookies.get_dict().items()
    )
    soup = BeautifulSoup(search.text, features='html.parser')
    link = _find_article_link(soup, date)
    time.sleep(0.5)
    if link is None:
        print('未找到数据!')
        return

    # The redirect page assembles the real article URL in JavaScript through
    # a series of ``url += '...'`` statements; stitch the pieces together.
    redirect = requests.get(link, headers=headers, timeout=5)
    fragments = re.findall(r"url \+.*'", redirect.text)
    article_url = ''.join(
        fragment.replace('url += ', '').replace("'", '')
        for fragment in fragments
    )
    time.sleep(0.5)
    article = requests.get(article_url, headers=headers, timeout=5)

    # Each paragraph of interest runs from the date up to the next 'p>'.
    paragraphs = re.findall(
        r'{}.*?p>'.format(re.escape(date)), article.text)
    if not paragraphs:
        print('未找到数据!')
        return

    totals = _parse_total(paragraphs[0])
    if totals is not None:
        record[date]['total'] = list(totals)
        print('总计确诊:{} 无症状:{}'.format(*totals))

    for paragraph in paragraphs[1:]:
        parsed = _parse_district(paragraph)
        if parsed is None:
            # Paragraph mentions the date but names no district; skip it.
            continue
        qu, quezhen, wuzheng = parsed
        print('{} 确诊:{} 无症状:{}'.format(qu, quezhen, wuzheng))
        record[date][qu] = [quezhen, wuzheng]

    if totals is None:
        # Without city-wide totals there is nothing to cross-check against.
        print('未找到总计数据!')
        return
    per_district = [
        counts for name, counts in record[date].items() if name != 'total'
    ]
    if (sum(counts[0] for counts in per_district) != totals[0]
            or sum(counts[1] for counts in per_district) != totals[1]):
        print('统计存在偏差!')
# Scrape every day in the configured range (currently just the 12th of May).
for day in range(12, 13):
    date = '5月{}日'.format(day)
    try:
        func(date)
    except Exception as exc:
        # Best-effort: report the failing date together with its cause and
        # carry on. Unlike a bare ``except`` this still lets Ctrl-C and
        # SystemExit stop the script.
        print(date, ' error', repr(exc))
    # NOTE(review): the original indentation was lost; the pause is assumed
    # to run after every date rather than only after a failure - confirm.
    time.sleep(5)