Scraping graduate admissions data from Yanzhaowang (yz.chsi.com.cn)
Code from Zhihu: https://zhuanlan.zhihu.com/p/541811879
import pandas as pd
import requests as rqs
from lxml import html
import json
# Workaround for environments where `from lxml import etree` fails to import;
# lxml.html re-exports etree, so this alias behaves identically.
etree = html.etree
# Request headers (a desktop Chrome User-Agent, so the site serves normal pages)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
}
getSs = 'https://yz.chsi.com.cn/zsml/pages/getSs.jsp'  # province list endpoint
getMl = 'https://yz.chsi.com.cn/zsml/pages/getMl.jsp'  # degree-category list endpoint
provinces = rqs.get(getSs, headers=headers).content.decode("utf-8")
categories = rqs.get(getMl, headers=headers).content.decode("utf-8")
# Each endpoint returns a JSON array of {"mc": <display name>, "dm": <code>}
# objects; render them as "name(code)" so the user can pick a code.
formatted_getMl = []
for item in json.loads(categories):
    formatted_item = f"{item['mc']}({item['dm']})"
    formatted_getMl.append(formatted_item)
print(formatted_getMl)
mldm = input('Enter degree-category code (digits for academic degrees, "zyxw" for professional degrees, "q" to skip): ')
print('________________________________________')
if mldm == 'q':
    getZy = 'https://yz.chsi.com.cn/zsml/pages/getZy.jsp'
else:
    getZy = f'https://yz.chsi.com.cn/zsml/pages/getZy.jsp?mldm={mldm}'
getZy = rqs.get(getZy, headers=headers).content.decode("utf-8")
formatted_getZy = []
for item in json.loads(getZy):
    formatted_item = f"{item['mc']}({item['dm']})"
    formatted_getZy.append(formatted_item)
print(formatted_getZy)
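# The "name(code)" formatting above appears twice; a shared helper (a sketch,
# not in the original script) would collapse both loops:
def format_options(raw_json):
    """Render a JSON array of {"mc": ..., "dm": ...} objects as ["name(code)", ...]."""
    return [f"{item['mc']}({item['dm']})" for item in json.loads(raw_json)]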
yjxkdm = input('Enter discipline code (academic) or professional-field code (professional) (digits, required): ')
print('________________________________________')
# List the majors under the chosen discipline code
zymc = f'https://yz.chsi.com.cn/zsml/code/zy.do?q={yjxkdm}'
print(rqs.get(zymc, headers=headers).content.decode("utf-8"))
zymc = input('Enter major name (Chinese characters, "q" to skip): ')
ssdm = input('Enter province name (Chinese characters, "q" to skip): ')
dwmc = input('Enter school name (Chinese characters, "q" to skip): ')
print('________________________________________')
# Look up a code (dm) by its display name (mc); returns '' when not found
def find_dm_by_mc(data, mc):
    for item in json.loads(data):
        if item["mc"] == mc:
            return item["dm"]
    return ""
# Normalize the inputs: 'q' means "leave this filter blank". The province name
# is converted to its code first (an unmatched name also yields '').
ssdm = "" if ssdm == 'q' else find_dm_by_mc(provinces, ssdm)
if dwmc == 'q':
    dwmc = ''
if mldm == 'q':
    mldm = ''
if zymc == 'q':
    zymc = ''
params = {
    "ssdm": ssdm,      # province code
    "dwmc": dwmc,      # school name
    "mldm": mldm,      # degree-category code ('zyxw' for professional degrees)
    "mlmc": "",
    "yjxkdm": yjxkdm,  # discipline / professional-field code
    "zymc": zymc,      # major name
    "xxfs": 1,         # study mode: 1 = full-time, 2 = part-time
    "pageno": 1
}
# URL template for the search query
url = "https://yz.chsi.com.cn/zsml/queryAction.do?ssdm={ssdm}&dwmc={dwmc}&mldm={mldm}&mlmc={mlmc}&yjxkdm={yjxkdm}&zymc={zymc}&xxfs={xxfs}&pageno={pageno}"
# List that accumulates one dict per scraped record
table = []
def getSearchData(params: dict):
    """
    Fetch one search-result page.
    @params: dict of values for the URL template
    @return: decoded HTML string
    """
    return rqs.get(url.format(**params), headers=headers).content.decode("utf-8")
def getARecord(url, schoolInfo: dict):
    """
    Scrape one detail page and append a record to `table`.
    @url: relative URL of the detail page
    @schoolInfo: school-level flags passed down from the listing page,
                 since the detail page does not repeat them
    Column names are kept in Chinese to match the site's terminology.
    """
    data_xpath = etree.HTML(rqs.get("https://yz.chsi.com.cn" + url, headers=headers).content.decode("utf-8"))
    table.append({
        "学校": data_xpath.xpath("//table/tbody/tr[1]/td[2]/text()")[0],    # school
        "研究生院": schoolInfo["gso"],   # has a graduate school
        "自主划线": schoolInfo["ao"],    # sets its own score line
        "博士点": schoolInfo["phd"],     # has a PhD program
        "考试方式": data_xpath.xpath("//table/tbody/tr[1]/td[4]/text()")[0],  # exam type
        "院系所": data_xpath.xpath("//table/tbody/tr[2]/td[2]/text()")[0],    # department
        "专业": data_xpath.xpath("//table/tbody/tr[2]/td[4]/text()")[0],      # major
        "研究方向": data_xpath.xpath("//table/tbody/tr[3]/td[4]/text()")[0],  # research direction
        "拟招人数": data_xpath.xpath("//table/tbody/tr[4]/td[4]/text()")[0],  # planned enrollment
        "政治": data_xpath.xpath('//div[@class="zsml-result"]/table/tbody/tr/td[1]/text()')[0],        # politics
        "政治详细": data_xpath.xpath('//div[@class="zsml-result"]/table/tbody/tr/td[1]/span/text()')[0],
        "英语": data_xpath.xpath('//div[@class="zsml-result"]/table/tbody/tr/td[2]/text()')[0],        # foreign language
        "英语详细": data_xpath.xpath('//div[@class="zsml-result"]/table/tbody/tr/td[2]/span/text()')[0],
        "业务课一": data_xpath.xpath('//div[@class="zsml-result"]/table/tbody/tr/td[3]/text()')[0],    # subject 1
        "业务课一详细": data_xpath.xpath('//div[@class="zsml-result"]/table/tbody/tr/td[3]/span/text()')[0],
        "业务课二": data_xpath.xpath('//div[@class="zsml-result"]/table/tbody/tr/td[4]/text()')[0],    # subject 2
        "业务课二详细": data_xpath.xpath('//div[@class="zsml-result"]/table/tbody/tr/td[4]/span/text()')[0]
    })
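# Note: the [0] indexing above raises IndexError whenever a field is missing
# from a detail page. A defensive accessor (a sketch, not part of the original
# script) could replace it:
def first_or_blank(nodes):
    """Return the first XPath match, or '' when the result list is empty."""
    return nodes[0] if nodes else ""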
def getASchoolData(url, schoolInfo: dict):
    """
    Scrape all of one school's records by calling getARecord on each detail link.
    @url: relative URL of the school's major-listing page
    @schoolInfo: school-level flags (graduate school / own score line / PhD),
                 passed through because later pages no longer show them
    """
    data_xpath = etree.HTML(rqs.get("https://yz.chsi.com.cn" + url, headers=headers).content.decode("utf-8"))
    result = data_xpath.xpath("//table/tbody/tr/td[8]/a/@href")  # "view details" links
    for i in result:
        getARecord(i, schoolInfo)
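# The string concatenation "https://yz.chsi.com.cn" + url assumes every href is
# root-relative; the standard library's urllib.parse.urljoin would handle both
# relative and absolute links (an optional hardening, not in the original):
#   from urllib.parse import urljoin
#   urljoin("https://yz.chsi.com.cn", url)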
def getAPageData(data):
    """
    Extract every school listed on one result page.
    @data: HTML of the current result page
    @return: [school names, school-page URLs, graduate-school cells,
              own-score-line cells, PhD-program cells]
    """
    data_xpath = etree.HTML(data)
    school_names = data_xpath.xpath('//*[@id="form3"]/a/text()')  # school names
    print(school_names)
    mid_urls = data_xpath.xpath('//*[@id="form3"]/a/@href')  # links to each school's result page
    graduate_school_opt = data_xpath.xpath("/html/body//table/tbody/tr/td[3]")  # has a graduate school
    autonomous_opt = data_xpath.xpath("/html/body//table/tbody/tr/td[4]")  # sets its own score line
    PhD_point_opt = data_xpath.xpath("/html/body//table/tbody/tr/td[5]")  # has a PhD program (td[5] assumed to be the next flag column)
    return [school_names, mid_urls, graduate_school_opt, autonomous_opt, PhD_point_opt]
def analysisLoop(data):
    """
    Driver: walk every result page, visit each school, then each detail page.
    @data: HTML of the first result page
    """
    global params
    data_xpath = etree.HTML(data)
    max_page_num = data_xpath.xpath("/html/body//div[4]/ul/li/a/text()")[-1]  # last page number in the pager
    for k in range(1, int(max_page_num) + 1):
        params["pageno"] = k
        apage = getAPageData(rqs.get(url.format(**params), headers=headers).content.decode("utf-8"))
        for s in range(len(apage[1])):
            schoolInfo = {}  # this school's flags: graduate school / own score line / PhD program
            for i in range(2, 5):
                # a cell containing an <i> icon means "yes" for that flag
                if len(apage[i][s].xpath("./i")) != 0:
                    schoolInfo[["gso", "ao", "phd"][i - 2]] = 1
                else:
                    schoolInfo[["gso", "ao", "phd"][i - 2]] = 0
            getASchoolData(apage[1][s], schoolInfo)
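# This loop fires requests in rapid succession; adding a short pause between
# pages (e.g. `import time` at the top and `time.sleep(0.5)` inside the loop)
# would be a courteous option the original script omits.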
data = getSearchData(params)  # fetch the first result page
analysisLoop(data)            # crawl everything starting from that page
df = pd.DataFrame(table)      # convert the accumulated records to a DataFrame
df.to_csv("output.csv", encoding="gbk", index=False)  # write out as CSV
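# Caveat: gbk cannot encode every character that may appear in school or major
# names; if to_csv raises UnicodeEncodeError, "utf-8-sig" is a safer encoding
# and still opens cleanly in Excel:
#   df.to_csv("output.csv", encoding="utf-8-sig", index=False)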