# 百度文库提取(仅作学习交流) — Baidu Wenku text extraction (for learning and exchange only)
import requests,re
from lxml import etree
from docx import Document
def get_detail(url):
    """Download a Baidu Wenku page and pull out its title and text fragments.

    Args:
        url: Address of the document page to fetch.

    Returns:
        A ``(title, details)`` tuple. ``title`` is the text captured from the
        page's ``<title>`` tag in front of the ``_百度文库`` suffix, and
        ``details`` is the list of text nodes found under the reader section,
        with entries that are exactly one space removed.

    Raises:
        IndexError: If the title pattern or the reader-section pattern does
            not occur in the downloaded page.
        UnicodeDecodeError: If the response body cannot be decoded as GBK.
        requests.RequestException: If the request fails or times out.
    """
    # NOTE(review): presumably the crawler user agent is what makes the site
    # return the document text in the HTML — confirm against the live site.
    header = {'User-agent': 'Googlebot'}
    # Without a timeout requests waits forever on an unresponsive server.
    response = requests.get(url, headers=header, timeout=30).content.decode('gbk')

    title_ze = r'<title>(.+?)_百度文库</title>'
    div_ze = r'<div class="bd doc-reader">(.+?)<div class="aside">'
    # re.S lets '.' match newlines, so a match may span several lines.
    title = re.findall(title_ze, response, re.S)[0]
    div = re.findall(div_ze, response, re.S)[0]

    # Re-parse just the reader fragment and collect every text node below a div.
    div = etree.HTML(div)
    details = div.xpath('//div//text()')
    # Drop entries that are exactly one space; all other text is kept as-is.
    details = [text for text in details if text != ' ']

    data = title, details
    print(data)
    return data
def get_word(data, output_path='C:/Users/Desktop/******.docx'):
    """Write an extracted document to a .docx file.

    Args:
        data: Sequence whose first item is the document title and whose
            second item is an iterable of text fragments, as returned by
            ``get_detail``.
        output_path: Where the .docx file is saved. The default is the path
            that was previously hard-coded in this function.
            NOTE(review): that default looks like a masked placeholder
            ('******'); callers should normally pass a real file path.

    Returns:
        None. The document is written to ``output_path``.
    """
    document = Document()
    document.add_heading(data[0])
    for detail in data[1]:
        # Each extracted text fragment becomes its own paragraph.
        document.add_paragraph(detail)
    document.save(output_path)
if __name__ == '__main__':
    # Script entry point: fetch one sample document page and write it out
    # as a Word file.
    source_url = "https://wenku.baidu.com/view/779f8c48cd22bcd126fff705cc17552706225e6d.html"
    get_word(get_detail(source_url))

