获得京东商品信息并清洗为excel表格

2023-02-02 14:04 作者:芜湖小量化 0人读过 | 我要投稿

# 在京东进行网页搜索并分析 url 后发现：url 中的 page={} 与页数之间的关系为 page = 2i+1（i 从 0 开始计数），即第 1、2、3 页分别对应 page=1、3、5

通过元素检查发现商品的信息都储存在xpath路径//*[@id="J_goodsList"]/ul中

# 使用的库

from pyquery import PyQuery
import requests
import time
import xlwt
from urllib.parse import quote

# 抓取单个页面的数据；请求标头的具体取值详见浏览器的元素检查（开发者工具）
def craw_one_page(url, timeout=10):
    """Download one page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script forever.

    Returns:
        The response body decoded as UTF-8 when the server answers with
        HTTP 200. None for any other status code, for a network failure, or
        when the body is not valid UTF-8.
    """
    # Placeholder header values: fill them in from the request headers shown
    # by the browser's developer tools.
    cookie = quote(' ')
    headers = {
        'authority': 'search.jd.com',
        'accept': '',
        'accept-language': '',
        'cookie': cookie,
        'referer': 'https://www.jd.com/',
        'user-agent': '',
    }

    try:
        response = requests.get(url=url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Connection errors, timeouts, invalid URLs and similar failures.
        return None

    if response.status_code != 200:
        return None

    try:
        # Decode the raw bytes explicitly instead of relying on the encoding
        # that requests guesses for response.text.
        return response.content.decode('utf-8')
    except UnicodeDecodeError:
        return None

# 解析单个页面的数据；class 属性中以空格分隔的多个类名，在选择器里可以用 . 连接起来定位

def analysis_one_page(html):
    """Yield one record per product item found in a search-result page.

    Args:
        html: Page markup as a string. None or an empty string produces no
            records instead of raising inside the parser.

    Yields:
        A dict with the keys '商品' (product name), '价格' (price) and
        '卖家' (seller). The product name may be None when neither the
        <em> text nor the link's title attribute is available.
    """
    if not html:
        return

    doc = PyQuery(html)
    # class="gl-warp clearfix" is matched by chaining both class names with '.'
    li_list = doc('.gl-warp.clearfix')('.gl-item')

    for li in li_list.items():
        # Guard the [0] lookup: an item without an <em> node must fall through
        # to the title-attribute fallback instead of raising IndexError.
        em_nodes = li('div > div.p-name.p-name-type-2 > a > em')
        product = em_nodes[0].text if em_nodes else None

        if product is None:
            product = li(' div > div.p-name.p-name-type-2 > a').attr('title')

        price = li('div > div.p-price > strong > i').text()
        sale = li('div > div.p-shop > span > a').text()
        yield {
            '商品': product,
            '价格': price,
            '卖家': sale,
        }

# 运行函数进行数据处理

# (sheet name, keyword) pairs for the per-brand sheets, in sheet order.
# Keywords are lower-case because they are compared against the lower-cased
# product name; an upper-case keyword such as 'Apple' could never match.
_BRAND_SHEETS = (
    ('华为', '华为'),
    ('苹果', 'apple'),
    ('小米', '小米'),
    ('三星', '三星'),
    ('oppo', 'oppo'),
)


def _write_row(sheet, row, mobile_info):
    """Write one product record into `sheet` at `row`; column 0 is the rank."""
    sheet.write(row, 0, str(row))
    sheet.write(row, 1, mobile_info['商品'])
    sheet.write(row, 2, mobile_info['价格'])
    sheet.write(row, 3, mobile_info['卖家'])


# Script entry point: crawl the search-result pages, print every record and
# export them to an .xls workbook with one overall sheet plus one per brand.
if __name__ == '__main__':
    search_url = 'https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&wq=%E6%89%8B%E6%9C%BA&pvid=084ce205e8df47f48433198db7d00f35&page={}&s=56&click=0'
    # The page parameter takes odd values only: 1, 3, 5, 7, 9, 11.
    urls = [search_url.format(page) for page in range(1, 13, 2)]
    header = ['排名', '产品', '价格', '卖家']  # column titles of every sheet

    book = xlwt.Workbook(encoding='utf-8')
    sheet_all = book.add_sheet('所有手机排名')
    brand_sheets = {
        keyword: book.add_sheet(sheet_name)
        for sheet_name, keyword in _BRAND_SHEETS
    }

    # Row 0 of every sheet holds the column titles.
    for sheet in [sheet_all] + list(brand_sheets.values()):
        for col, title in enumerate(header):
            sheet.write(0, col, title)

    # Next free row per sheet; row numbers double as the rank column.
    next_row = 1
    brand_next_row = {keyword: 1 for keyword in brand_sheets}

    for url in urls:
        html = craw_one_page(url)
        if html is None:
            # A failed download must not abort the whole export.
            print('Failed to fetch {}, skipping this page'.format(url))
        else:
            for mobile_info in analysis_one_page(html):
                print(mobile_info)
                _write_row(sheet_all, next_row, mobile_info)
                next_row += 1

                # The product name can be None when the page lacks both the
                # <em> text and the title attribute.
                product_name = (mobile_info['商品'] or '').lower()
                for keyword, sheet in brand_sheets.items():
                    if keyword in product_name:
                        _write_row(sheet, brand_next_row[keyword], mobile_info)
                        brand_next_row[keyword] += 1

        # NOTE(review): the source's indentation was lost, so the intended
        # scope of this pause is unclear; it is applied once per page here.
        time.sleep(0.3)

    book.save('mobile_phone_2.xls')

# 私货欢迎关注天选国V　雫るる_Offical！谢谢了喵！！！

标签：

获得京东商品信息并清洗为excel表格

获得京东商品信息并清洗为excel表格的评论 (共条)

你可能也喜欢这些文章

最新发布的文章

获得京东商品信息并清洗为excel表格

本文作者的其他文章

获得京东商品信息并清洗为excel表格的评论 (共 条)

你可能也喜欢这些文章

最新发布的文章

获得京东商品信息并清洗为excel表格的评论 (共条)