
The Ceiling of Python Courses: Python Basics + Python Web Scraping + Python Data Analysis

2023-07-06 16:23 | Author: 可爱不是可受

Following along with the previous few videos, I built the Douban Top 250 web scraper. I made a few small tweaks of my own, but most of the code is still the instructor's original.

from bs4 import BeautifulSoup  # HTML parsing
import re  # regular expressions for extracting fields
import urllib.request, urllib.error  # fetching pages and handling request errors
import xlwt  # writing the results to an .xls spreadsheet
import sqlite3  # imported but not used in this script

def main():
  baseurl = "https://movie.douban.com/top250?start="
  Datalist = getData(baseurl)
  savepath = ".\\Data.xls"
  saveData(Datalist, savepath)
  #askUrl("https://movie.douban.com/top250?start=")


# Regular expressions describing the patterns to extract from each item
findlink = re.compile(r'<a href="(.*?)">')  # link to the movie's detail page
findImgSrc = re.compile(r'<img.*src="(.*?)"', re.S)  # poster image; re.S lets . match newlines
findTitle = re.compile(r'<span class="title">(.*)</span>')  # movie title
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')  # rating
findJudge = re.compile(r'<span>(\d*)人评价</span>')  # number of ratings
findInq = re.compile(r'<span class="inq">(.*)</span>')  # one-line summary
findBd = re.compile(r'<p class="">(.*?)</p>', re.S)  # related details
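
As a quick check of what these patterns capture, they can be run against a hand-written snippet shaped like a Douban list item (the snippet below is illustrative, not real page source):

sample = '''<div class="item">
<a href="https://movie.douban.com/subject/1292052/">
<span class="title">肖申克的救赎</span>
<span class="rating_num" property="v:average">9.7</span>
<span>2000000人评价</span>
</div>'''
print(re.findall(findlink, sample))   # ['https://movie.douban.com/subject/1292052/']
print(re.findall(findTitle, sample))  # ['肖申克的救赎']
print(re.findall(findJudge, sample))  # ['2000000']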

# Crawl all the pages and parse each movie entry
def getData(baseurl):
  Datalist = []
  for i in range(0, 10):  # 10 pages, 25 movies per page
    url = baseurl + str(i*25)
    html = askUrl(url)
    soup = BeautifulSoup(html, "html.parser")
    for item in soup.find_all('div', class_="item"):
      #print(item)  # test that the filtering works
      #break
      data = []  # holds all the information for one movie
      item = str(item)
      # detail-page link; [0] keeps only the first match
      link = re.findall(findlink, item)[0]
      data.append(link)
      ImgSrc = re.findall(findImgSrc, item)[0]
      data.append(ImgSrc)
      titles = re.findall(findTitle, item)
      if len(titles) == 2:  # the item has both a Chinese and a foreign title
        ctitle = titles[0]
        data.append(ctitle)
        otitle = titles[1].replace("/", "").replace('\xa0', "")  # strip the " / " separator
        data.append(otitle)
      else:
        data.append(titles[0])
        data.append(" ")  # placeholder when there is no foreign title
      Ratings = re.findall(findRating, item)[0]
      data.append(Ratings)
      Judge = re.findall(findJudge, item)[0]
      data.append(Judge)
      Inq = re.findall(findInq, item)
      if len(Inq) != 0:
        inq = Inq[0].replace("。", " ")  # drop the trailing full stop
        data.append(inq)
      else:
        data.append(" ")  # placeholder when there is no summary
      Bd = re.findall(findBd, item)[0]
      Bd = re.sub(r'<br(\s+)?/>(\s+)?', " ", Bd)  # remove <br/> tags
      Bd = Bd.replace('\n', " ")
      Bd = Bd.replace('\xa0', "")
      data.append(Bd.strip())
      Datalist.append(data)
  return Datalist
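
Each row in Datalist therefore holds eight fields, in the same order as the spreadsheet columns written by saveData below. For a quick sanity check (assuming the page layout has not changed), something like this could be run:

rows = getData("https://movie.douban.com/top250?start=")
print(len(rows))  # expect 250
print(rows[0])    # the eight fields of the top-ranked movie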


# Save the scraped data to an .xls workbook
def saveData(datalist, savepath):
  print("save....")
  book = xlwt.Workbook(encoding="utf-8")
  sheet = book.add_sheet('豆瓣电影Top250', cell_overwrite_ok=True)
  # columns: detail link, poster URL, Chinese title, foreign title, rating, rating count, summary, details
  col = ("电影链接", "封面图片链接", "电影中文名", "电影外文名", "评分", "评价数", "概况", "相关信息")
  for i in range(0, 8):
    sheet.write(0, i, col[i])  # header row
  for i in range(0, 250):
    print("第%d条" % i)  # progress: "entry %d"
    data = datalist[i]
    for j in range(0, 8):
      sheet.write(i+1, j, data[j])
  book.save(savepath)
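
xlwt can only produce the legacy .xls format. If an .xlsx file is wanted instead, the save step could be rewritten with openpyxl (a separate install); a minimal sketch, with saveDataXlsx as my own name for the helper:

from openpyxl import Workbook

def saveDataXlsx(datalist, savepath):
  wb = Workbook()
  ws = wb.active
  ws.title = '豆瓣电影Top250'
  # header row, then one row per movie; each row already has the eight fields in order
  ws.append(("电影链接", "封面图片链接", "电影中文名", "电影外文名", "评分", "评价数", "概况", "相关信息"))
  for row in datalist:
    ws.append(row)
  wb.save(savepath)  # e.g. ".\\Data.xlsx"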


def askUrl(url):
  # pose as a regular browser so the site does not reject the request
  head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
  }
  request = urllib.request.Request(url, headers=head)
  html = ""
  try:
    response = urllib.request.urlopen(request)
    html = response.read().decode("utf-8")
    #print(html)
  except urllib.error.URLError as e:
    if hasattr(e, "code"):
      print(e.code)
    if hasattr(e, "reason"):
      print(e.reason)
  return html
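
The same fetch can also be written with the third-party requests library, which handles much of this boilerplate; a sketch under the assumption that requests is installed (askUrlRequests is my own name, not from the course):

import requests

def askUrlRequests(url):
  head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"}
  try:
    response = requests.get(url, headers=head, timeout=10)
    response.raise_for_status()  # raise on HTTP 4xx/5xx
    response.encoding = "utf-8"
    return response.text
  except requests.RequestException as e:
    print(e)
    return ""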

if __name__ == "__main__":
  main()
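
The sqlite3 import at the top is never used in this script; presumably it is for a later lesson that stores the results in a database instead of a spreadsheet. A rough sketch of what that could look like (the table name and schema are my own guesses, not the instructor's):

def saveData2DB(datalist, dbpath):
  conn = sqlite3.connect(dbpath)
  cur = conn.cursor()
  # one TEXT column per field, in the same order as the spreadsheet columns
  cur.execute('''CREATE TABLE IF NOT EXISTS movie250 (
    link TEXT, img_src TEXT, cname TEXT, oname TEXT,
    score TEXT, rated TEXT, introduction TEXT, info TEXT)''')
  cur.executemany("INSERT INTO movie250 VALUES (?,?,?,?,?,?,?,?)", datalist)
  conn.commit()
  conn.close()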

