diff --git a/seek/douban_com/douban_group_seek.py b/seek/douban_com/douban_group_seek.py index 780b891..41c67c3 100644 --- a/seek/douban_com/douban_group_seek.py +++ b/seek/douban_com/douban_group_seek.py @@ -1,5 +1,5 @@ import json -from time import sleep +import time from DrissionPage import Chromium, ChromiumOptions from config.database import SessionLocal from models.source_content import SourceContent @@ -14,8 +14,6 @@ class DoubanGroupSeek: def seek(self): db = SessionLocal() - # 获取最近100条数据的URL列表,用于过滤掉已存在的URL,避免重复爬取和存储 - recent_contents = db.query(SourceContent).order_by(SourceContent.id.desc()).limit(100).all() group_url = f'https://www.douban.com/group/{self.group_id}' tab = self.browser.new_tab() @@ -28,20 +26,29 @@ class DoubanGroupSeek: for ele_tr in ele_trs: topic_title = ele_tr.ele('tag:a').text topic_url = ele_tr.ele('tag:a').attr('href') - topics.append((topic_title, topic_url)) + update_time = ele_tr.ele('.time').text + topics.append((topic_title, topic_url, update_time)) - # 过滤掉已存在的URL - existing_urls = set(content.link for content in recent_contents) - topics = [(title, url) for title, url in topics if url not in existing_urls] + # 去掉两个置顶的帖子,根据title包含“置顶”来判断,因为置顶的帖子一般情况是组规则和公告 + topics = [(title, url, update_time) for title, url, update_time in topics if "置顶" not in title] + + # 根据更新时间过滤,update_time格式为“10-18 12:34”,只保留24小时内的帖子 + time_str = time.strftime("%m-%d %H:%M", time.localtime(time.time() - 24 * 3600)) + topics = [(title, url, update_time) for title, url, update_time in topics if update_time >= time_str] # 打印要爬取的主题列表 - logger.info(f"Found {len(topics)} new topics to crawl:") - for topic_title, topic_url in topics: - logger.info(f"标题:{topic_title} 链接:{topic_url}\n") + logger.info(f"Found {len(topics)} potential new topics to crawl:") results = [] - for topic_title, topic_url in topics: - logger.info(f"fetch 标题:{topic_title} 链接:{topic_url}\n") + for topic_title, topic_url, update_time in topics: + # 检索数据库,根据topic_url查询是否已存在 + existing_content = db.query(SourceContent.id).filter(SourceContent.link == topic_url).first() + if existing_content: + # logger.info(f"Topic already exists in database, skipping: {topic_title}:{topic_url}") + continue + + logger.info(f"fetch 标题:{topic_title} 链接:{topic_url} 更新时间:{update_time}\n") + tab.get(topic_url) tab.wait(30) # 等待页面加载完成,时间可根据实际情况调整 try: diff --git a/task/__pycache__/manager_task.cpython-312.pyc b/task/__pycache__/manager_task.cpython-312.pyc index 4cae05a..519ad2b 100644 Binary files a/task/__pycache__/manager_task.cpython-312.pyc and b/task/__pycache__/manager_task.cpython-312.pyc differ