import requests
import os
from lxml import etree
# Assignment: paginate through and crawl the free resume templates on 站长素材 (sc.chinaz.com)
if __name__ == '__main__':
    headers = {
        # placeholder: replace with a real browser User-Agent string before running
        'User-Agent': '===='
    }
    url = 'https://sc.chinaz.com/jianli/free_{}.html'
    # e.g. https://sc.chinaz.com/jianli/free_2.html
    for pageNum in range(1, 3):
        # fetch the templates on pages 1 and 2 (widen the range to crawl more pages)
        if pageNum == 1:
            new_url = 'https://sc.chinaz.com/jianli/free.html'
        else:
            new_url = url.format(pageNum)
        # fetch one listing page of resumes
        response = requests.get(url=new_url, headers=headers)
        page_text = response.text
        # instantiate an etree object
        tree = etree.HTML(page_text)
        # create the folder that will hold the resume templates
        if not os.path.exists('./jianliLibs'):
            os.mkdir('./jianliLibs')
        # locate all <a> tags; each one points to a template's detail page
        a_list = tree.xpath('//div[@id="container"]//p/a')
        # iterate over the <a> tags: get each template's name and URL, then download it
        for a in a_list:
            # detail page of the template download
            down_url = a.xpath('./@href')[0]
            detail_data = requests.get(url=down_url, headers=headers)
            detail_data.encoding = 'utf-8'  # fix garbled characters (mojibake)
            detail_text = detail_data.text
            # extract the template's download link and name
            download_tree = etree.HTML(detail_text)
            # template download link
            target_url = download_tree.xpath('//ul[@class="clearfix"]/li[1]/a/@href')[0]
            # template name
            target_name = download_tree.xpath('//div[@class="ppt_tit clearfix"]/h1/text()')[0] + '.rar'
            # print(target_name)
            # download the resume archive
            jianLi = requests.get(url=target_url, headers=headers).content
            jianLi_path = 'jianliLibs/' + target_name
            with open(jianLi_path, 'wb') as fp:
                fp.write(jianLi)
                print(target_name, 'downloaded successfully!')
        print('Finished downloading the resumes on page {}!!!'.format(pageNum))
        print('=========================================')
        print('\n')
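
# Optional sketch, not part of the original homework: the h1 text used as target_name may
# contain characters such as '/' or '?' that are invalid in file names on some systems.
# A helper like safe_filename (a hypothetical name) could be applied to target_name before
# building jianLi_path.
import re

def safe_filename(name):
    # replace characters that common file systems reject with an underscore
    return re.sub(r'[\\/:*?"<>|]', '_', name)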