# 元气桌面单页面爬取 (Yuanqi Desktop single-page scraper)
# 糖醋慕雨的快乐时光
import requests
import json
from openpyxl import load_workbook
from openpyxl import Workbook
import os
# Directory that contains this script.
current_directory = os.path.dirname(os.path.abspath(__file__))
# Absolute path of the "元气" folder next to the script.
# NOTE(review): this value is only computed here; the folder that is actually
# created below is relative to the current working directory, matching the
# relative paths used later for the downloads.
douyin_folder = os.path.join(current_directory, "元气")
# Create the download folder if it is missing (no-op when it already exists).
os.makedirs("元气", exist_ok=True)

# Open the result workbook, creating an empty one on first run.
# NOTE(review): openpyxl's tutorial warns that a macro-free workbook saved
# under a ".xlsm" name may be rejected by Excel -- confirm whether ".xlsx"
# was intended.
file_path_xlsm = "yuan.xlsm"
if not os.path.exists(file_path_xlsm):
    print('not exists')
    wb = Workbook()
    wb.save(file_path_xlsm)
else:
    print('exists')
    wb = load_workbook(file_path_xlsm)

# Use the active sheet and (re)write the header row on every run.
ws = wb.active
header_titles = (
    "作者名",              # author name
    "作品名",              # work title
    "作品图片链接-封面",    # cover image link
    "作品视频链接",         # video link
    "作者主页链接",         # author home page link
    "4k视频链接",          # 4k video link
    "作品上传时间",         # upload time
)
for column_index, title in enumerate(header_titles, start=1):
    ws.cell(row=1, column=column_index, value=title)


def _extract_wallpaper_id(raw):
    """Return the integer wallpaper id contained in the user's input.

    Accepts a full URL carrying a ``dwid`` query parameter (other parameters
    and fragments are ignored), any ``key=value`` style string (the value
    after the first ``=`` is used, as the script originally did), or a bare
    numeric id.

    Raises:
        ValueError: if no integer id can be extracted.
    """
    from urllib.parse import parse_qs, urlparse

    text = raw.strip()
    query = parse_qs(urlparse(text).query)
    if 'dwid' in query:
        candidate = query['dwid'][0]
    elif '=' in text:
        # Legacy behaviour: take what follows the first '=', but stop at the
        # next parameter so "a=1&b=2" still yields "1".
        candidate = text.split('=', 1)[1].split('&', 1)[0]
    else:
        candidate = text
    try:
        return int(candidate)
    except ValueError:
        raise ValueError('无法从输入中解析壁纸ID: {!r}'.format(raw)) from None


dwid = _extract_wallpaper_id(input('请输入url地址:'))

# JSON payload sent to the detail endpoint.
detail_params = {
    'wid': dwid,
    'common': {'player_version': 0},
}
# Browser-like request headers.
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-language': 'zh-CN,zh;q=0.9',
    'accept-encoding': 'gzip, deflate, br',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
}
detail_url = 'https://pcwallpaper.zhhainiao.com/wallpaper/live/detail'
# Fetch the body of the detail endpoint.
def scrape_Main(url, headers, detail_params, timeout=10):
    """POST ``detail_params`` as JSON to ``url`` and return the response text.

    Args:
        url: Endpoint to post to.
        headers: HTTP headers sent with the request.
        detail_params: Object serialised as the JSON request body.
        timeout: Seconds passed to ``requests.post`` so an unresponsive
            server cannot block the script forever.

    Returns:
        The response body as text when the status code is 200, otherwise
        ``None`` (a failure message is printed in that case).
    """
    # The request itself must be inside the ``try``: that is the call that
    # raises ``requests.RequestException`` on connection problems.
    try:
        response = requests.post(
            url, headers=headers, json=detail_params, timeout=timeout
        )
    except requests.RequestException:
        print('请求失败')
        return None

    if response.status_code != 200:
        print('请求失败: HTTP {}'.format(response.status_code))
        return None

    # Decode using the encoding detected from the body content.
    response.encoding = response.apparent_encoding
    return response.text
# Fetch the detail JSON. Stop early when the request failed: passing None to
# json.loads would only produce an unrelated TypeError.
html = scrape_Main(detail_url, headers, detail_params)
if html is None:
    raise SystemExit('请求失败: 未获取到作品详情')
data = json.loads(html)['data']

preview_video = data['preview_video']  # video link
update_time = data['update_time']  # update / upload time
preview_jpg = data['preview_jpg'].split('?')[0]  # image link without query string
# Author home page link, built from the author's uid.
author = 'https://wp.cheetahfun.com/personal/author?author_uid=' + str(data['author_uid'])
video_4k = data['video_4k']  # presumably the 4k video (original author could not open it)
author_name = data['author_name']  # author name
wname = data['wname']  # work title

# Append one row below the last used row; column order matches the header.
new_row = ws.max_row + 1
row_values = (
    author_name,
    wname,
    preview_jpg,
    preview_video,
    author,
    video_4k,
    update_time,
)
for column_index, cell_value in enumerate(row_values, start=1):
    ws.cell(row=new_row, column=column_index, value=cell_value)

# Write a plain-text summary of this work (overwritten on every run).
with open('yuan.txt', 'w', encoding='utf-8') as f:
    f.write("作者名:{}\n作品名:{}\n作者主页:{}\n作品视频:{}\n图片链接:{}\n作品上传时间:{}\n".format(author_name, wname, author,preview_video,preview_jpg,update_time))
print(' {}: 数据已保存在yuan.txt文件下'.format(wname))

# Freeze the header row and save the workbook.
ws.freeze_panes = 'A2'
wb.save(file_path_xlsm)

# Build the per-work download folder "元气/<title>".
import re

# Replace characters that are not allowed in file names so that a title such
# as "a/b" cannot break the path.
safe_name = re.sub(r'[\\/:*?"<>|]', '_', str(wname)).strip() or 'untitled'
# os.path.join keeps the path valid on every platform (the hard-coded "\\"
# separators only worked on Windows).
douyin_folder = os.path.join("元气", safe_name)
os.makedirs(douyin_folder, exist_ok=True)


def _download_file(url, dest_path):
    """Download ``url`` to ``dest_path``; return True on success.

    HTTP errors and connection failures are reported and nothing is written,
    so an error page is never saved under a media file name.
    """
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        print('下载失败: {} ({})'.format(url, exc))
        return False
    with open(dest_path, mode='wb') as out:
        out.write(response.content)
    return True


video_saved = _download_file(
    preview_video, os.path.join(douyin_folder, safe_name + '.mp4')
)
if video_saved:
    print(wname + ': 视频下载完成')

image_saved = _download_file(
    preview_jpg, os.path.join(douyin_folder, safe_name + '.jpg')
)
if image_saved:
    print(wname + ': 图片下载完成')

if video_saved and image_saved:
    print('作品: {} 保存成功'.format(wname))

import pandas as pd

# Remove duplicated rows: read the sheet, drop exact duplicates, write back.
df = pd.read_excel(file_path_xlsm)
df_deduplicated = df.drop_duplicates()
df_deduplicated.to_excel(file_path_xlsm, index=False)

# The file was just rewritten by pandas, so reopen it and freeze the header
# row again before the final save.
wb = load_workbook(file_path_xlsm)
ws = wb.active
ws.freeze_panes = 'A2'
wb.save(file_path_xlsm)
print('数据去重成功')
# Test URL: https://wp.cheetahfun.com/personal/wallpaper?dwid=180048