欢迎光临散文网 会员登陆 & 注册

商品——采集和分析

2023-03-09 19:41 作者:一个普通的优菈厨  | 我要投稿

import requests,os
from lxml import etree

class Spider:
   def __init__(self):
       self.__site="http://39.106.48.176/"
       self.__file=f"{os.getcwd()}/goods.txt"
       self.__maxID=1277
       pass
   
   def __getData(self):
       header={
           "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"
       }
       
       base=self.__site "/index.php?s=/index/goods/index/id/{}.html"
       
       file=open(self.__file,"a ",encoding="utf8")

       for num in range(1,self.__maxID 1):
           response=requests.get(base.format(num),headers=header)
           if response.status_code==200:
               if '资源不存在或已被删除' not in response.text:
                   html=etree.HTML(response.text)

                   titles=html.xpath('/html/body/div[4]/div[2]/div[2]/div[1]/h1')
                   title=titles[0].text.strip()

                   prices=html.xpath('//b[@class="goods-price"]')
                   price=prices[0].text

                   counts=html.xpath('//span[@class="tm-count"]/text()')
                   view=counts[1]

                   sale=counts[0]

                   stocks=html.xpath('//span[@class="stock"]/text()')
                   stock=stocks[0]

                   data=f"{num},{title},{price},{view},{sale},{stock}\n"

                   file.write(data)
           else:
               continue
       file.close()

   def run(self):
       self.__getData()

if __name__=="__main__":
   spider=Spider()
   spider.run()



——————————————————————————————————————

create database if not exists shopxo;

use shopxo;

create table if not exists goods (

    id int,

    title string,

    price double,

    views int,

    sales int,

    stock int

) row format delimited

fields terminated by ','

;


load data local inpath '/root/college020/goods.txt' overwrite into table goods;


select * from goods limit 10;


insert overwrite local directory '/root/college023/'

row format delimited fields terminated by '\t'

select * from goods

where price is null

;


create table goods1 as

select * from goods

where title not like '%连衣裙%' or title not like '%女士%' and

price is not null

;


insert overwrite local directory '/root/college024/'

row format delimited fields terminated by '\t'

select count(*) from goods1;


insert overwrite local directory '/root/college025/'

row format delimited fields terminated by '\t'

select distinct title,price from goods1

order by price desc

limit 3

;


insert overwrite local directory '/root/college026/'

row format delimited fields terminated by '\t'

select split(title," ")[0] as brand,count(*) as brand_count from goods1

group by split(title," ")[0]

order by brand_count desc

limit 10

;


insert overwrite local directory '/root/college027/'

row format delimited fields terminated by '\t'

select perona,count(*) as p_count from goods1

lateral view explode(split(title," "))brand as perona

where title rlike ""

group by perona

order by p_count desc

limit 6

;


商品——采集和分析的评论 (共 条)

分享到微博请遵守国家法律