This commit is contained in:
@ -1,5 +1,5 @@
|
|||||||
import json
|
import json
|
||||||
from time import sleep
|
import time
|
||||||
from DrissionPage import Chromium, ChromiumOptions
|
from DrissionPage import Chromium, ChromiumOptions
|
||||||
from config.database import SessionLocal
|
from config.database import SessionLocal
|
||||||
from models.source_content import SourceContent
|
from models.source_content import SourceContent
|
||||||
@ -14,8 +14,6 @@ class DoubanGroupSeek:
|
|||||||
|
|
||||||
def seek(self):
|
def seek(self):
|
||||||
db = SessionLocal()
|
db = SessionLocal()
|
||||||
# 获取最近100条数据的URL列表,用于过滤掉已存在的URL,避免重复爬取和存储
|
|
||||||
recent_contents = db.query(SourceContent).order_by(SourceContent.id.desc()).limit(100).all()
|
|
||||||
|
|
||||||
group_url = f'https://www.douban.com/group/{self.group_id}'
|
group_url = f'https://www.douban.com/group/{self.group_id}'
|
||||||
tab = self.browser.new_tab()
|
tab = self.browser.new_tab()
|
||||||
@ -28,20 +26,29 @@ class DoubanGroupSeek:
|
|||||||
for ele_tr in ele_trs:
|
for ele_tr in ele_trs:
|
||||||
topic_title = ele_tr.ele('tag:a').text
|
topic_title = ele_tr.ele('tag:a').text
|
||||||
topic_url = ele_tr.ele('tag:a').attr('href')
|
topic_url = ele_tr.ele('tag:a').attr('href')
|
||||||
topics.append((topic_title, topic_url))
|
update_time = ele_tr.ele('.time').text
|
||||||
|
topics.append((topic_title, topic_url, update_time))
|
||||||
|
|
||||||
# 过滤掉已存在的URL
|
# 去掉两个置顶的帖子,根据title包含“置顶”来判断,因为置顶的帖子一般情况是组规则和公告
|
||||||
existing_urls = set(content.link for content in recent_contents)
|
topics = [(title, url, update_time) for title, url, update_time in topics if "置顶" not in title]
|
||||||
topics = [(title, url) for title, url in topics if url not in existing_urls]
|
|
||||||
|
# 根据更新时间过滤,update_time格式为“10-18 12:34”,只保留24小时内的帖子
|
||||||
|
time_str = time.strftime("%m-%d %H:%M", time.localtime(time.time() - 24 * 3600))
|
||||||
|
topics = [(title, url, update_time) for title, url, update_time in topics if update_time >= time_str]
|
||||||
|
|
||||||
# 打印要爬取的主题列表
|
# 打印要爬取的主题列表
|
||||||
logger.info(f"Found {len(topics)} new topics to crawl:")
|
logger.info(f"Found {len(topics)} potential new topics to crawl:")
|
||||||
for topic_title, topic_url in topics:
|
|
||||||
logger.info(f"标题:{topic_title} 链接:{topic_url}\n")
|
|
||||||
|
|
||||||
results = []
|
results = []
|
||||||
for topic_title, topic_url in topics:
|
for topic_title, topic_url, update_time in topics:
|
||||||
logger.info(f"fetch 标题:{topic_title} 链接:{topic_url}\n")
|
# 检索数据库,根据topic_url查询是否已存在
|
||||||
|
existing_content = db.query(SourceContent.id).filter(SourceContent.link == topic_url).first()
|
||||||
|
if existing_content:
|
||||||
|
# logger.info(f"Topic already exists in database, skipping: {topic_title}:{topic_url}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
logger.info(f"fetch 标题:{topic_title} 链接:{topic_url} 更新时间:{update_time}\n")
|
||||||
|
|
||||||
tab.get(topic_url)
|
tab.get(topic_url)
|
||||||
tab.wait(30) # 等待页面加载完成,时间可根据实际情况调整
|
tab.wait(30) # 等待页面加载完成,时间可根据实际情况调整
|
||||||
try:
|
try:
|
||||||
|
|||||||
Binary file not shown.
Reference in New Issue
Block a user