商品——采集和分析
import requests,os
from lxml import etree
class Spider:
def __init__(self):
self.__site="http://39.106.48.176/"
self.__file=f"{os.getcwd()}/goods.txt"
self.__maxID=1277
pass
def __getData(self):
header={
"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"
}
base=self.__site "/index.php?s=/index/goods/index/id/{}.html"
file=open(self.__file,"a ",encoding="utf8")
for num in range(1,self.__maxID 1):
response=requests.get(base.format(num),headers=header)
if response.status_code==200:
if '资源不存在或已被删除' not in response.text:
html=etree.HTML(response.text)
titles=html.xpath('/html/body/div[4]/div[2]/div[2]/div[1]/h1')
title=titles[0].text.strip()
prices=html.xpath('//b[@class="goods-price"]')
price=prices[0].text
counts=html.xpath('//span[@class="tm-count"]/text()')
view=counts[1]
sale=counts[0]
stocks=html.xpath('//span[@class="stock"]/text()')
stock=stocks[0]
data=f"{num},{title},{price},{view},{sale},{stock}\n"
file.write(data)
else:
continue
file.close()
def run(self):
self.__getData()
if __name__=="__main__":
spider=Spider()
spider.run()
——————————————————————————————————————
create database if not exists shopxo;
use shopxo;
create table if not exists goods (
id int,
title string,
price double,
views int,
sales int,
stock int
) row format delimited
fields terminated by ','
;
load data local inpath '/root/college020/goods.txt' overwrite into table goods;
select * from goods limit 10;
insert overwrite local directory '/root/college023/'
row format delimited fields terminated by '\t'
select * from goods
where price is null
;
create table goods1 as
select * from goods
where title not like '%连衣裙%' or title not like '%女士%' and
price is not null
;
insert overwrite local directory '/root/college024/'
row format delimited fields terminated by '\t'
select count(*) from goods1;
insert overwrite local directory '/root/college025/'
row format delimited fields terminated by '\t'
select distinct title,price from goods1
order by price desc
limit 3
;
insert overwrite local directory '/root/college026/'
row format delimited fields terminated by '\t'
select split(title," ")[0] as brand,count(*) as brand_count from goods1
group by split(title," ")[0]
order by brand_count desc
limit 10
;
insert overwrite local directory '/root/college027/'
row format delimited fields terminated by '\t'
select perona,count(*) as p_count from goods1
lateral view explode(split(title," "))brand as perona
where title rlike ""
group by perona
order by p_count desc
limit 6
;