# 获得B站弹幕并分析生成词云 - 以露米《烟火之下》为例
# 分析网页源代码 pages":[{"cid":860252148, ...}] 此处获取视频弹幕cid代码
# 例: <d p="57.01800,1,25,16777215,1665738755,0,579ea34d,1163059996662086144,10">烟火 fy 烟火</d>
# url = https://comment.bilibili.com/{}.xml  ({} 填充cid)

# 使用的库
import requests
from lxml import etree
from imageio.v2 import imread
import jieba
import wordcloud

# Fetch and save the danmaku XML page; request headers: see your browser.
def crawler(cid='860252148', outfile='烟火之下.xml'):
    """Download the Bilibili danmaku (bullet-comment) XML and save it to disk.

    Args:
        cid: the video's comment id, taken from the page source
            (the ``"cid"`` field in ``pages`` — see the file header).
        outfile: path the raw XML response is written to (UTF-8).
    """
    # Danmaku XML endpoint; the path segment is the video's cid.
    url = f'https://comment.bilibili.com/{cid}.xml'
    # Fill in cookie / user-agent from your own browser session as needed.
    headers = {
        'authority': 'www.bilibili.com',
        'accept': '',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'cookie': '',
        'user-agent': ''
    }
    response = requests.get(url=url, headers=headers)
    # The endpoint serves UTF-8 XML; force the decode before reading .text.
    response.encoding = 'utf-8'
    print(response.status_code)
    with open(outfile, 'w', encoding='utf-8') as fw:
        fw.write(response.text)

# Parse the danmaku XML and convert it to a plain-text file.
def xml_parse(src='烟火之下.xml', dst='弹幕.txt'):
    """Extract each danmaku's text from the saved XML and write one per line.

    Args:
        src: path of the danmaku XML saved by ``crawler()``.
        dst: path of the UTF-8 text file to write, one danmaku per line.
    """
    tree = etree.parse(src)
    root = tree.getroot()
    print('i:', root.tag)
    # Iterate the root element directly: getchildren() is deprecated/removed.
    # Skip elements with no text (d.text is None) — writelines() would raise
    # TypeError on None — and add '\n' so entries don't run together.
    lines = [d.text + '\n' for d in root if d.text]
    with open(dst, 'w', encoding='utf-8') as f:
        f.writelines(lines)

# Generate the word cloud. Prepare a mask image yourself; this run used a big bird.
def word_cloud_pic():
    """Segment the danmaku text with jieba and render a word-cloud image.

    Reads '弹幕.txt' (produced by xml_parse), uses '1.png' as the shape mask
    and 'simkai.ttf' as the font, and writes the result to '弹幕.png'.
    """
    stopwords = {'chun', 'a'}
    mask = imread('1.png')
    with open('弹幕.txt', 'r', encoding='utf-8') as f_2:
        f_dan = f_2.read()
    # Join the jieba tokens with spaces: WordCloud.generate() splits its input
    # on whitespace, so joining with '' would glue the tokens back together
    # and undo the segmentation entirely.
    txt = ' '.join(jieba.lcut(f_dan))
    wo = wordcloud.WordCloud(
        width=800, height=600, background_color='white',
        font_path='simkai.ttf', stopwords=stopwords,
        max_font_size=32, min_font_size=4, font_step=1,
        max_words=100, mask=mask)
    wo.generate(txt)
    wo.to_file('弹幕.png')

# Run the pipeline: download -> parse -> render.
if __name__ == '__main__':
    crawler()
    xml_parse()
    word_cloud_pic()

# 私货 关注露米Lumi_Official喵,谢谢了喵!!!

