欢迎光临散文网 会员登陆 & 注册

获得b站弹幕并分析生成词云-以露米烟火之下为例

2023-02-04 15:27 作者:芜湖小量化  | 我要投稿

#  分析网页源代码,查找 "pages":[{"cid":860252148,"   此处获取视频弹幕cid代码

例:<d p="57.01800,1,25,16777215,1665738755,0,579ea34d,1163059996662086144,10">烟火 fy 烟火</d>

url = 'https://comment.bilibili.com/{}.xml'    #  {}填充cid

# 使用的库

import requests

from lxml import etree

from imageio.v2 import imread

import jieba

import wordcloud

#  爬取保存弹幕信息网页  请求标头见浏览器

def crawler(url='', headers=None, out_path='烟火之下.xml', timeout=10):
    """Download the danmaku XML document and save it to disk.

    Args:
        url: Address of the danmaku XML to fetch. The default is an empty
            placeholder and must be filled in (or passed) before running.
        headers: Request headers to send. When omitted, a template with
            blank placeholder values is used; copy the real values from
            the browser's developer tools.
        out_path: File the response text is written to, encoded as UTF-8.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        ValueError: If ``url`` is empty.
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    if not url:
        raise ValueError('crawler() needs the danmaku XML url to download')

    if headers is None:
        headers = {
            'authority': 'www.bilibili.com',
            'accept': '',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
            'cookie': '',
            'user-agent': '',
        }

    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url=url, headers=headers, timeout=timeout)
    response.encoding = 'utf-8'
    print(response.status_code)
    # Fail here rather than saving an error page as if it were danmaku XML.
    response.raise_for_status()

    with open(out_path, 'w', encoding='utf-8') as fw:
        fw.write(response.text)

# 解析xml文本转化txt

def xml_parse(xml_path='烟火之下.xml', txt_path='弹幕.txt'):
    """Extract the danmaku text from the saved XML into a plain-text file.

    Only the ``<d>`` children of the root element are read, and each
    comment is written on its own line so neighbouring comments do not
    run together.

    Args:
        xml_path: Path of the XML file to parse.
        txt_path: Path of the UTF-8 text file to write.
    """
    root = etree.parse(xml_path).getroot()
    print('i:', root.tag)

    # Restrict to <d> elements instead of taking every child of the root,
    # and skip empty elements whose .text is None (writelines would raise).
    comments = [d.text for d in root.findall('d') if d.text]

    with open(txt_path, 'w', encoding='utf-8') as f:
        f.writelines(text + '\n' for text in comments)

# 生成词云  此处请自行准备mask图片,本实践图片为一只大鸟。

def word_cloud_pic():
    """Render a word cloud image from the extracted danmaku text.

    Reads '弹幕.txt', segments it with jieba, and writes the rendered
    cloud to '弹幕.png'. The image '1.png' is passed as the mask and
    'simkai.ttf' as the font.
    """
    stopwords = {'chun', 'a'}
    mask = imread('1.png')

    with open('弹幕.txt', 'r', encoding='utf-8') as f_2:
        f_dan = f_2.read()

    # Join the tokens with spaces: WordCloud does its own tokenising, so
    # gluing them back with '' would discard jieba's word boundaries.
    # Whitespace-only tokens carry no word and are dropped.
    tokens = jieba.lcut(f_dan)
    txt = ' '.join(token for token in tokens if token.strip())

    wo = wordcloud.WordCloud(
        width=800,
        height=600,
        background_color='white',
        font_path="simkai.ttf",
        stopwords=stopwords,
        max_font_size=32,
        min_font_size=4,
        font_step=1,
        max_words=100,
        mask=mask,
    )
    wo.generate(txt)
    wo.to_file('弹幕.png')

# 运行程序

if __name__ == '__main__':
    # Run the pipeline stages in order: download, extract text, render.
    for stage in (crawler, xml_parse, word_cloud_pic):
        stage()

# 私货  关注露米Lumi_Official喵,谢谢了喵!!!

效果图展示


获得b站弹幕并分析生成词云-以露米烟火之下为例的评论 (共 条)

分享到微博请遵守国家法律