# 基于爬虫框架 Scrapy 的 bilibili 爬虫
# 蜘蛛程序
import scrapy
from scrapy import Selector
from ..items import GetItem # 导入
class BiliSpider(scrapy.Spider):
    """Spider that scrapes video cards from the bilibili.com home page.

    For each video card found on the landing page it yields a ``GetItem``
    with the title, uploader, publish-date text and video link.
    """

    name = 'bili'
    allowed_domains = ['bilibili.com']
    # BUG FIX: the original line contained a stray semicolon inside the list
    # literal ('https://www.bilibili.com/';]) which is a syntax error.
    start_urls = ['https://www.bilibili.com/']

    def parse(self, response):
        """Parse the home page and yield one item per video card.

        :param response: the downloaded page for a URL in ``start_urls``
        :return: generator of ``GetItem`` instances
        """
        sel = Selector(response)
        # Absolute XPath to the grid of video cards. NOTE(review): this is
        # brittle — it breaks whenever bilibili changes its page layout; a
        # class-based CSS selector would be more robust. TODO: confirm the
        # current layout still matches.
        list_items = sel.xpath('/html/body/div[2]/div[2]/main/div[2]/div/div[1]/div')
        for list_item in list_items:
            spider_item = GetItem()
            # Use .get() instead of .extract(): each field is single-valued,
            # and .extract() would wrap every value in a one-element list
            # (which then serializes badly in the CSV export).
            spider_item['title'] = list_item.css('h3::attr(title)').get()       # video title
            spider_item['author'] = list_item.css('span.bili-video-card__info--author::text').get()  # uploader
            spider_item['time'] = list_item.css('span.bili-video-card__info--date::text').get()      # publish date
            spider_item['link'] = list_item.css('h3 > a::attr(href)').get()     # video URL
            yield spider_item

# items文件
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class GetItem(scrapy.Item):
    """Item container for one scraped bilibili video card.

    Fields are populated by ``BiliSpider.parse`` from CSS selectors on the
    home-page video cards.
    """
    title = scrapy.Field()   # video title (h3 "title" attribute)
    author = scrapy.Field()  # uploader name text
    time = scrapy.Field()    # publish-date text as shown on the card
    link = scrapy.Field()    # href of the video card's link

# 配置
# 在 settings.py 文件中启用 Cookies(COOKIES_ENABLED),并添加请求头(如 User-Agent)

# 命令行启动,保存为csv文件
scrapy crawl bili -o bili.csv

