网页爬取程序1.0源码
本次使用的是 Python 语言。
源码:
import tkinter as tk
from tkinter import ttk
import requests
import urllib3
import time
import random
# Silence urllib3's InsecureRequestWarning; this script issues its request with verify=False.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Event handler for the start button.
def crawl_website():
    """Download the page at the entered URL and save its HTML to a file.

    Reads the URL and the target filename from the ``url_entry`` and
    ``filename_entry`` widgets, fetches the page with SSL certificate
    verification disabled, and writes the decoded text to disk as UTF-8.
    Progress and every failure are reported through ``update_status_bar``;
    nothing is raised to the Tk callback machinery for the handled cases.

    The filename gets a ``.html`` suffix when it does not already end with
    one (case-insensitive).
    """
    url = url_entry.get().strip()
    filename = filename_entry.get().strip()

    # Reject blank input up front instead of failing deep inside requests
    # or silently writing a file literally named ".html".
    if not url:
        update_status_bar("请输入网址.")
        return
    if not filename:
        update_status_bar("请输入保存文件名.")
        return

    # Append the default suffix when the name does not already carry it.
    if not filename.lower().endswith('.html'):
        filename += '.html'

    update_status_bar("开始爬取网页...")

    try:
        # Certificate verification stays off, as in the original script.
        # The timeout keeps a stalled server from hanging the GUI forever.
        response = requests.get(url, verify=False, timeout=10)
    except requests.RequestException as exc:
        update_status_bar(f"网页爬取失败: {exc}")
        return

    # requests falls back to ISO-8859-1 for text responses whose headers
    # carry no charset, which garbles non-Latin pages; use the encoding
    # detected from the body in that case.
    if (response.encoding or '').lower() == 'iso-8859-1':
        response.encoding = response.apparent_encoding
    html_content = response.text

    # Only now has the page actually been fetched.
    update_status_bar("完成页面爬取.")

    try:
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(html_content)
    except OSError as exc:
        update_status_bar(f"文件保存失败: {exc}")
        return

    update_status_bar(f"网页爬取成功并保存到文件: {filename}")
# Build the main application window.
window = tk.Tk()
window.title("网站源代码爬取器")
window.geometry("400x300")

# Spacing shared by every cell of the input form.
_CELL_PAD = {"padx": 10, "pady": 10}

# Row 0: URL prompt and its entry box.
tk.Label(window, text="网址:").grid(row=0, column=0, **_CELL_PAD)
url_entry = tk.Entry(window, width=30)
url_entry.grid(row=0, column=1, **_CELL_PAD)

# Row 1: output filename prompt and its entry box.
tk.Label(window, text="保存文件名:").grid(row=1, column=0, **_CELL_PAD)
filename_entry = tk.Entry(window, width=30)
filename_entry.grid(row=1, column=1, **_CELL_PAD)

# Row 2: the start button, spanning both columns, wired to crawl_website.
start_button = ttk.Button(window, text="开始", command=crawl_website)
start_button.grid(row=2, column=0, columnspan=2, **_CELL_PAD)

# Row 3: status bar label, stretched horizontally, fed by a StringVar.
status_var = tk.StringVar()
status_bar = ttk.Label(window, textvariable=status_var, anchor="w")
status_bar.grid(row=3, column=0, columnspan=2, sticky="we")
# Status bar refresh helper.
def update_status_bar(text):
    """Display *text* in the status bar and refresh the widget right away.

    The message is written to ``status_var`` (the label's text variable),
    then ``status_bar.update()`` is called so the new text is processed
    without waiting for the calling handler to return to the event loop.
    """
    status_var.set(text)
    status_bar.update()
# Hand control to the Tk event loop so the window starts handling events.
window.mainloop()
如有问题,请私信UP主。

