# Forum (论坛) post scraper
import requests,os,re
from lxml import etree
class Spider:
    """Scrape thread data from the forum at ``self.__site`` into a text file.

    For every listing page of board ``fid=2`` (pages 1..``__maxPage``) the
    spider collects the thread ids, downloads each thread page, and appends
    one tab-separated line per thread to ``data.txt`` in the current working
    directory: ``postID, user, score, level, title, content``.
    """

    # Browser-like User-Agent sent with every request.
    __HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"
    }
    # Seconds to wait for the server; without a timeout requests can block forever.
    __TIMEOUT = 10
    # Captures the value of the ``tid`` query parameter (must be followed by ``&``).
    __TID_PATTERN = re.compile(r".*tid=(.*?)\&.*")

    def __init__(self):
        self.__site = "http://39.106.228.59"
        self.__file = f"{os.getcwd()}/data.txt"
        # Number of listing pages to crawl, starting from page 1.
        self.__maxPage = 1

    def __fetch(self, url):
        """Download *url* and return the parsed HTML tree.

        Returns ``None`` when the request fails (network error, timeout) or
        the server answers with anything other than HTTP 200, so callers can
        skip the page instead of aborting the crawl.
        """
        try:
            response = requests.get(
                url, headers=self.__HEADERS, timeout=self.__TIMEOUT
            )
        except requests.RequestException:
            return None
        if response.status_code != 200:
            return None
        return etree.HTML(response.text)

    def __getPostID(self, pageURL):
        """Return the thread ids linked from the listing page *pageURL*.

        Links whose href carries no parsable ``tid`` are ignored. An empty
        list is returned when the page cannot be fetched.
        """
        html = self.__fetch(pageURL)
        if html is None:
            return []
        hrefs = html.xpath('//a[@class="s xst"]/@href')
        parsed = (self.__parseID(href) for href in hrefs)
        # Drop None / empty ids so no bogus thread URL is requested later.
        return [post_id for post_id in parsed if post_id]

    def __getPostData(self, postID):
        """Return one tab-separated, newline-terminated record for *postID*.

        Returns ``None`` when the thread page cannot be fetched or does not
        contain the expected elements.
        """
        url = f"{self.__site}/forum.php?mod=viewthread&tid={postID}&extra=page%3D1"
        html = self.__fetch(url)
        if html is None:
            return None
        try:
            xis = html.xpath('//a[@class="xi2"]/text()')
            # Positions 3 and 4 of the "xi2" links are taken as author and
            # score — presumably fixed by this forum's page layout; verify.
            user = xis[3]
            score = xis[4]
            # NOTE(review): the element ids below are built from the thread
            # id; confirm the forum does not key them by a separate post id.
            level = html.xpath(
                f'//div[@id="favatar{postID}"]//p[1]/em[1]/a[1]/text()'
            )[0]
            title = html.xpath('//span[@id="thread_subject"]/text()')[0]
            content = html.xpath(
                f'//td[@id="postmessage_{postID}"]/text()'
            )[0].strip()
        except IndexError:
            # Page layout differs from what the XPath queries expect.
            return None
        return f"{postID}\t{user}\t{score}\t{level}\t{title}\t{content}\n"

    def __parseID(self, url):
        """Extract the ``tid`` query value from *url*, or ``None`` if absent."""
        match = self.__TID_PATTERN.search(url)
        if match is None:
            return None
        return match.group(1)

    def __makePageURL(self):
        """Return the listing-page URLs for pages 1..``__maxPage``."""
        base = self.__site + "/forum.php?mod=forumdisplay&fid=2&page={}"
        return [base.format(num) for num in range(1, self.__maxPage + 1)]

    def run(self):
        """Crawl every listing page and append each thread record to the file.

        Threads that cannot be fetched or parsed are skipped. The output
        file is closed even if an unexpected error interrupts the crawl.
        """
        urls = self.__makePageURL()
        with open(self.__file, "a+", encoding="utf8") as out:
            for url in urls:
                for post_id in self.__getPostID(url):
                    data = self.__getPostData(post_id)
                    if data is not None:
                        out.write(data)
if __name__ == "__main__":
    # Script entry point: build the spider and crawl with its default settings.
    Spider().run()