欢迎光临散文网 会员登陆 & 注册

Python超强爬虫8天速成(完整版)爬取各种网站数据实战案例

2023-02-18 00:07 作者:末日希晓  | 我要投稿
import os
import re

import requests
from lxml import etree

# 作业:分页爬取站长素材的免费简历模板
if __name__ == '__main__':
    headers = {
        'User-Agent': '===='
    }

    url = 'https://sc.chinaz.com/jianli/free_{}.html'
    # https://sc.chinaz.com/jianli/free_2.html
    for pageNum in range(1, 3):
        # 获取1-10页的模板
        if pageNum == 1:
            new_url = 'https://sc.chinaz.com/jianli/free.html'
        else:
            new_url = url.format(pageNum)

        # 获取某一页简历
        response = requests.get(url=new_url, headers=headers)
        page_text = response.text

        # 实例化etree对象
        tree = etree.HTML(page_text)

        # 创建文件夹保存简历模板
        if not os.path.exists('./jianliLibs'):
            os.mkdir('./jianliLibs')

        # 定位到所有a标签,a标签下有模板地址
        a_list = tree.xpath('//div[@id="container"]//p/a')

        # 遍历a标签,获取模板名称、模板地址,并下载模板
        for a in a_list:
            # 获取模板下载详情页面
            down_url = a.xpath('./@href')[0]
            detail_data = requests.get(url=down_url, headers=headers)
            detail_data.encoding = 'utf-8'  # 乱码解决
            detail_text = detail_data.text

            # 获取下载模板的连接和名称
            download_tree = etree.HTML(detail_text)
            # 模板下载链接
            target_url = download_tree.xpath('//ul[@class="clearfix"]/li[1]/a/@href')[0]
            # 模板名字
            target_name = download_tree.xpath('//div[@class="ppt_tit clearfix"]/h1/text()')[0] + '.rar'

            # print(target_name)
            # 简历下载
            jianLi = requests.get(url=target_url, headers=headers).content
            jianLi_path = 'jianliLibs/' + target_name
            with open(jianLi_path, 'wb') as fp:
                fp.write(jianLi)
                print(target_name, '下载成功!')

        print('第{}页简历完成下载!!!'.format(pageNum))
        print('=========================================')
        print('\n')


Python超强爬虫8天速成(完整版)爬取各种网站数据实战案例的评论

分享到微博请遵守国家法律