I'm new to Python. Using resources found online as a reference, I wrote a crawler that scrapes Baidu's hot-search topics every 2 hours (topic type, title, image URL, topic URL, rank, heat index, topic summary, etc.) and stores the data in a database, so it can be used later for statistics, data analysis, and so on...
The directory layout borrows Java's controller / service / dao convention (no flames, please); the implementation is as follows:
Directory structure:
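The original screenshot of the project tree is not reproduced here; judging from the import paths used in the code below, the layout is roughly:

```text
reptile/
├── Config.py                     # database and crawl settings
├── controller/                   # entry point, wraps the config and starts the crawl
├── service/
│   └── WebHandleService.py       # fetches and parses the Baidu hot-search page
└── dao/
    ├── BaiduHotMainDao.py        # writes/queries the baidu_hot_main table
    └── BaiduHotContentDao.py     # writes the baidu_hot_content table
```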
1. Database design: two tables, `baidu_hot_main` (one row per crawl run) and `baidu_hot_content` (one row per hot-search entry, linked back to its run through `main_id`).

```sql
CREATE TABLE `baidu_hot_main` (
  `id` varchar(36) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL COMMENT 'primary key',
  `address` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT 'crawled address',
  `position` varchar(100) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT 'position (CSS selector)',
  `times` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT 'crawl count',
  `header` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT 'request headers',
  `describe` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT 'description',
  `date` datetime DEFAULT NULL COMMENT 'date',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;

CREATE TABLE `baidu_hot_content` (
  `id` varchar(36) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL COMMENT 'primary key',
  `main_id` varchar(36) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT 'link to baidu_hot_main',
  `top` int DEFAULT NULL COMMENT 'rank',
  `title` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT 'title',
  `href` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT 'content URL',
  `content` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT 'content',
  `type` varchar(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT 'type',
  `img` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT 'image URL',
  `index` varchar(25) DEFAULT NULL COMMENT 'heat index',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
```
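Since the whole point of persisting the data is later statistics and analysis (see the intro), here is a minimal sketch of the kind of query the two tables support. The join on `main_id` and the column names come straight from the DDL above; the connection settings reuse the config module shown in the next section.

```python
import pymysql
from reptile import Config

# Sketch of an analysis query: how often each topic type appears in each crawl run.
db = pymysql.connect(host=Config.host, port=Config.port, user=Config.user,
                     password=Config.password, database=Config.database)
try:
    with db.cursor() as cursor:
        cursor.execute("""
            select m.`date`, c.`type`, count(*) as cnt
            from baidu_hot_content c
            join baidu_hot_main m on m.id = c.main_id
            group by m.`date`, c.`type`
            order by m.`date` desc, cnt desc
        """)
        for row in cursor.fetchall():
            print(row)
finally:
    db.close()
```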
2. Configuration file: the config module (reptile/Config.py) holds the database connection settings and the crawl settings.

```python
# Database settings
host = "127.0.0.1"        # database host
port = 3306               # port
user = "root"             # database user
password = "root"         # database password
database = "pachong"      # database to connect to

# Page / crawl settings
url1 = ''                 # URL of the page to crawl (left blank in the original post)
headers1 = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'}
cookies1 = ''
params1 = ""
contentA1 = "#sanRoot > main > div.container.right-container_2EFJr > div > div:nth-child(2) > div "
```

url1: the address of the site to crawl.
headers1: carries a browser User-Agent so the request looks like it comes from a real browser, which helps avoid anti-crawling measures.
contentA1: the location (CSS selector) of the HTML to scrape; the exact value has to be looked up in the browser's developer console.
Getting contentA1: right-click the page and open the developer console, find the div you want to scrape in the Elements panel, right-click it, choose Copy, and take the "Copy selector" value.
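A quick way to confirm the copied selector actually matches something is to run a one-off check before wiring everything together. This is only a sanity-check sketch that reuses url1, headers1 and contentA1 from the config (it assumes url1 has been filled in with the hot-search page address):

```python
import requests
from bs4 import BeautifulSoup

from reptile.Config import url1, headers1, contentA1

# Fetch the page with the browser-like headers and count how many nodes the selector matches.
response = requests.get(url1, headers=headers1)
response.encoding = 'utf-8'
soup = BeautifulSoup(response.text, 'html.parser')
items = soup.select(contentA1)
print("selector matched", len(items), "items")  # expect one node per hot-search entry
```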
3. controller: wraps the page settings in a class (so they can be overridden by inheritance later) and exposes a single entry function that hands the config to the service layer.

```python
from reptile.Config import url1, contentA1, headers1, cookies1, params1
from reptile.service.WebHandleService import runRepitleMain

# Wrap the settings in a class so they are easy to inherit and override
class urlConfig():
    # page settings
    url = url1
    contentA = contentA1
    headers = headers1
    cookies = cookies1
    params = params1

def runRepitle():
    runRepitleMain(urlConfig)
```
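For a quick manual test, one option is to add a standard entry-point guard at the bottom of the controller file so the module can be run directly; this is just a usage sketch, not part of the original post:

```python
# at the bottom of the controller module: allow a one-off manual run
if __name__ == '__main__':
    runRepitle()
```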
4. service: fetches the hot-search page, parses each entry with BeautifulSoup, records one row in the main table for this crawl run, and batch-inserts the parsed entries into the content table.

```python
import time
import uuid

import requests
from bs4 import BeautifulSoup

from reptile.dao.BaiduHotContentDao import setBatchContent, setContent
from reptile.dao.BaiduHotMainDao import setMain, getTimes

# Fetch the page and crawl the Baidu hot-search list
def runRepitleMain(urlConfig):
    response = requests.get(urlConfig.url, headers=urlConfig.headers)
    response.encoding = 'utf-8'
    # build the parser
    soup = BeautifulSoup(response.text, 'html.parser')
    id1 = str(uuid.uuid1())  # use a string id so pymysql can escape it directly
    date = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    setMainTab(urlConfig.url, urlConfig.contentA, urlConfig.headers, id1, date)
    a = soup.select(urlConfig.contentA)
    # rows to insert
    data = []
    for i in range(0, len(a)):
        # (id, main_id, top, title, href, content, type, img, index)
        # primary key
        id2 = str(uuid.uuid1())
        # rank of the hot topic
        top0 = a[i].find("a").find("div").text.strip()
        # title
        title = a[i].find(name="div", attrs={"class": "content_1YWBm"}).find("a").text.strip()
        # topic tag (type)
        type = a[i].find(name="div", attrs={"class": "hot-tag_1G080"}).text.strip()
        # topic link
        href = a[i].find("a").get("href")
        # image link
        img = a[i].find("a").find("img").get("src")
        # heat index
        index = a[i].find(name="div", attrs={"class": "hot-index_1Bl1a"}).text.strip()
        # topic summary
        content = a[i].find(name="div", attrs={"class": "large_nSuFU"}).text.replace("查看更多>", '').strip()
        # fall back to "0" when no rank text was found
        top = top0 if len(top0) != 0 else "0"
        print("爬取第" + top + "条数据,内容:title-" + title + ";type-" + type)
        # collect the row for the batch insert
        data.append((id2, id1, top, title, href, content, type, img, index))
    # batch insert
    setBatchContent(data)
    return "爬取成功"

# Record this crawl run in the main table
def setMainTab(url, contentA, headers, id1, date):
    times = getTimes(url, "百度热搜爬取内容") + 1
    print("共计:" + str(times) + "次")
    setMain(id1, str(url), str(contentA), str(times), str(headers), "百度热搜爬取内容", date)
```

5. dao: the data-access layer that stores the crawled data. BaiduHotContentDao.py writes the content table, BaiduHotMainDao.py records and counts the crawl runs in the main table.

```python
# BaiduHotContentDao.py
import pymysql

from reptile import Config

sql_insert_baidu_hot_content = '''insert into baidu_hot_content
    (id,main_id,top,title,href,content,type,img)
    values(%s,%s,%s,%s,%s,%s,%s,%s)'''
sql_insert_baidu_hot_content_batch = '''insert into baidu_hot_content
    (id,main_id,top,title,href,content,type,img,`index`)
    values(%s,%s,%s,%s,%s,%s,%s,%s,%s)'''

# insert a single row
def setContent(id, main_id, top, title, href, content, type, img):
    # connect to the database
    db = pymysql.connect(
        host=Config.host,          # host
        port=Config.port,          # port
        user=Config.user,          # database user
        password=Config.password,  # database password
        database=Config.database   # database name
    )
    cursor = db.cursor()
    try:
        # run the insert
        cursor.execute(sql_insert_baidu_hot_content,
                       (id, main_id, top, title, href, content, type, img))
        # commit the transaction
        db.commit()
        print('插入成功')
    except Exception as e:
        print(e)
        # roll back on error
        db.rollback()
        print('插入失败')
    finally:
        # close the connection
        db.close()
    return 1

# batch insert
def setBatchContent(dataList):
    # connect to the database
    db = pymysql.connect(
        host=Config.host,
        port=Config.port,
        user=Config.user,
        password=Config.password,
        database=Config.database
    )
    cursor = db.cursor()
    try:
        # run the batch insert
        cursor.executemany(sql_insert_baidu_hot_content_batch, dataList)
        # commit the transaction
        db.commit()
        print('插入成功')
    except Exception as e:
        print(e)
        # roll back on error
        db.rollback()
        print('插入失败')
    finally:
        # close the connection
        db.close()
    return 1
```

```python
# BaiduHotMainDao.py
import pymysql

from reptile import Config

sql_insert_baidu_hot_main = '''insert into baidu_hot_main
    (id,address,`position`,times,header,`date`,`describe`)
    values(%s,%s,%s,%s,%s,%s,%s)'''
sql_getTimes_baidu_hot_main = "select id from baidu_hot_main where address=%s and `describe`=%s"

# count how many times this page has already been crawled
def getTimes(address, describe):
    times = 0
    # connect to the database
    db = pymysql.connect(
        host=Config.host,
        port=Config.port,
        user=Config.user,
        password=Config.password,
        database=Config.database
    )
    try:
        cursor = db.cursor()
        # execute() returns the number of matching rows, i.e. the number of earlier runs
        times = cursor.execute(sql_getTimes_baidu_hot_main, (address, describe))
        print('describe=' + describe + ';;;;address=' + address)
        print('查询次数成功')
    except Exception as e:
        print(e)
        print('查询次数失败')
    finally:
        # close the connection
        db.close()
    return times

# record one crawl run in the main table
def setMain(id, address, position, times, header, describe, date):
    # connect to the database
    db = pymysql.connect(
        host=Config.host,
        port=Config.port,
        user=Config.user,
        password=Config.password,
        database=Config.database
    )
    try:
        cursor = db.cursor()
        # run the insert
        cursor.execute(sql_insert_baidu_hot_main,
                       (id, address, position, times, header, date, describe))
        # commit the transaction
        db.commit()
        print('插入成功')
    except Exception as e:
        print(e)
        # roll back on error
        db.rollback()
        print('插入失败')
    finally:
        # close the connection
        db.close()
    return 1
```

A later update will cover how to set up the scheduled task and how to package and deploy the project...
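Until that update, here is a minimal scheduling sketch using only the standard library: it simply calls the controller's runRepitle() every 2 hours in a loop. The import path of the controller module is an assumption (the post doesn't name the file), and libraries such as schedule or APScheduler would be the more robust choice for a real deployment.

```python
import time
import traceback

# assumed controller module path; adjust to wherever runRepitle is actually defined
from reptile.controller.BaiduHotController import runRepitle

# naive scheduler: crawl once every 2 hours, keep going even if one run fails
if __name__ == '__main__':
    while True:
        try:
            runRepitle()
        except Exception:
            traceback.print_exc()   # log the failure and wait for the next round
        time.sleep(2 * 60 * 60)     # 2 hours
```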