Scraping Baidu Index


The reason: I wanted to play with data visualization and needed a data source, so I set my sights on Baidu Index. I hacked this together over a weekend and am writing it up here.
init.py
start_date = "2020-11-26"
end_date = "2022-11-25"
keyword_list = ["腾讯", "网易", "米哈游"]
header.py
cookie = ""
header = {
"Accept": "application/json, text/plain, */*",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "zh-CN,zh;q=0.9",
"Cipher-Text": "1669446300853_1669526110570_Zps8JEXzq9SKrsYVS8CTdXkWyVq/utJah5EChxPJNob8Fk+q4oEOdHsqjPHziaQdXUo3Soeq9UND9NJ7KI5474rTUgh6apQWleSKesxhLrG38d4HYhm3Z13QnTdY8SkltqngGgRMk7HZDt4ChGgwZwsbNYsvL1I9ur3MyF2msajKplNNj5Y3LMuaMS5gxyruyeErcZUV5UW9r2lxFRwMX9EnXK2ihb15TaGFqa6ByNjSoD8ixNXwP0VWTMhaYTo/8NEAi1pyKQUOhZ8BGBh8XTnS6s7Bue/cZ7O65Ai6xvs9YY/UQb9XGxkVyZ9EndTOf+Affh+MG6dbEwqFyn3gUGJnPxpQ8AnZdrhkdfZZBacjpUn+PuqRweESGv2Goi9dXFNYRfLK0/ZPM7dd75dTY+YyIEu6hYAQt3rBv9b3QVM=",
"Connection": "keep-alive",
"Cookie": f"{cookie}",
"Host": "index.baidu.com",
"Referer": "https://index.baidu.com/v2/main/index.html",
"sec-ch-ua": "\"Google Chrome\";v=\"107\", \"Chromium\";v=\"107\", \"Not=A?Brand\";v=\"24\"",
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": "\"Windows\"",
"Sec-Fetch-Dest": "empty",
"Sec-Fetch-Mode": "cors",
"Sec-Fetch-Site": "same-origin",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"
}
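Hard-coding a session cookie in source is easy to leak. A minimal alternative sketch, not part of the original script (the BAIDU_COOKIE variable name is my own assumption): read it from an environment variable instead.
import os

# Hypothetical alternative: take the cookie from the BAIDU_COOKIE
# environment variable (name is an assumption) instead of hard-coding it.
cookie = os.environ.get("BAIDU_COOKIE", "")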
get_data.py
import requests
import json
from header import header
from url_pool import url_get_data, url_uniqid
from init import keyword_list, start_date, end_date


def get_data():
    data_list = []
    for keyword in keyword_list:
        word = f'[[{{"name":"{keyword}","wordType":1}}],[]]'
        params = {
            "area": "0",
            "word": word,
            "startDate": start_date,
            "endDate": end_date
        }
        # fetch the (still encrypted) index data
        res = requests.get(url=url_get_data, params=params, headers=header)
        print(res.text)
        print("-------------")
        data_json = json.loads(res.text)
        data = data_json["data"]["userIndexes"][0]["all"]["data"]
        print(data)
        # fetch the decryption key matching this response's uniqid
        uniqid = data_json["data"]["uniqid"]
        print("uniqid:", uniqid)
        res = requests.get(url=url_uniqid + uniqid, headers=header)
        key = json.loads(res.text)["data"]
        print("key:", key)
        data_list.append({"data": data, "key": key})
    return data_list
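For orientation, the parts of the SearchApi response that get_data() touches look roughly like this. This is a sketch inferred purely from the keys accessed above, with placeholder values and every other field omitted.
example_response = {
    "data": {
        "userIndexes": [                      # one entry per requested keyword
            {"all": {"data": "<encrypted string>"}}
        ],
        "uniqid": "<uniqid>",                 # pass to /Interface/ptbk to get the key
    }
}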
do_data.py
from get_data import get_data


# Decryption: the first half of the key lists the cipher characters,
# the second half the plaintext characters they map to.
def decryption(keys, data):
    dec_dict = {}
    for j in range(len(keys) // 2):
        dec_dict[keys[j]] = keys[len(keys) // 2 + j]
    dec_data = ''
    for k in range(len(data)):
        dec_data += dec_dict[data[k]]
    return dec_data


# Decrypt each keyword's data and collect it as [[v1, v2, ...], [...], ...]
def do_data():
    data_list = get_data()
    dec_data_list = []
    for one_data in data_list:
        dec_one_data = decryption(keys=one_data["key"], data=one_data["data"])
        # the decrypted string is a comma-separated list of daily values
        dec_data_list.append(dec_one_data.split(","))
    return dec_data_list
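A quick worked example of what decryption() does, using a made-up key: the first half of the key lists cipher characters, and the second half the plaintext characters (digits, commas, and so on) they decode to.
from do_data import decryption

# Made-up key: "abcdef" are cipher characters, "0123,." their plaintext,
# so a->0, b->1, c->2, d->3, e->",", f->".".
key = "abcdef0123,."
print(decryption(keys=key, data="abecd"))  # prints "01,23"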
url_pool.py
url_get_data = "https://index.baidu.com/api/SearchApi/index?"
url_uniqid = "https://index.baidu.com/Interface/ptbk?uniqid="
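For illustration only (this snippet is not used by the scraper): requests percent-encodes the params from get_data.py onto url_get_data, so the final request URL can be previewed like this.
from urllib.parse import urlencode

params = {
    "area": "0",
    "word": '[[{"name":"腾讯","wordType":1}],[]]',
    "startDate": "2020-11-26",
    "endDate": "2022-11-25",
}
# e.g. https://index.baidu.com/api/SearchApi/index?area=0&word=%5B%5B%7B%22name%22...
print(url_get_data + urlencode(params))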
main.py
from init import keyword_list, end_date, start_date
from do_data import do_data
import datetime

if __name__ == '__main__':
    date_start = datetime.datetime.strptime(start_date, '%Y-%m-%d')
    date_end = datetime.datetime.strptime(end_date, '%Y-%m-%d')
    dec_data_list = do_data()
    # write the header row: an empty cell, then one column per date
    with open("merge.csv", "a+", encoding="utf-8") as fp:
        fp.write(",")
        while date_start <= date_end:
            fp.write(date_start.strftime("%Y-%m-%d") + ",")
            date_start += datetime.timedelta(days=1)
        fp.write("\n")
    # write one row of daily values per keyword
    with open("merge.csv", "a+", encoding="utf-8") as fp:
        for cnt, keyword in enumerate(keyword_list):
            fp.write(keyword + ",")
            for value in dec_data_list[cnt]:
                fp.write(value + ",")
            fp.write("\n")
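Since the whole point was data visualization, here is a minimal sketch (assuming pandas is installed; not part of the original script) of loading merge.csv back for plotting.
import pandas as pd

# Keywords form the row index, dates the columns; each line's trailing
# comma yields one all-empty column, which dropna(axis=1) removes.
df = pd.read_csv("merge.csv", index_col=0).dropna(axis=1, how="all")
print(df.T.head())  # transposed: dates as rows, one column per keyword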
For more, check out https://github.com/faithererer and leave a little star * v *