调整豆瓣小组帖子获取逻辑

2026-02-20 18:51:35 +08:00
parent 43a3fdcded
commit ac6c881763
2 changed files with 19 additions and 12 deletions
--- a/seek/douban_com/douban_group_seek.py
+++ b/seek/douban_com/douban_group_seek.py
@@ -1,5 +1,5 @@
 import json
-from time import sleep
+import time
 from DrissionPage import Chromium, ChromiumOptions
 from config.database import SessionLocal
 from models.source_content import SourceContent
@@ -14,8 +14,6 @@ class DoubanGroupSeek:
    def seek(self):
        db = SessionLocal()
        # 获取最近100条数据的URL列表，用于过滤掉已存在的URL，避免重复爬取和存储
        recent_contents = db.query(SourceContent).order_by(SourceContent.id.desc()).limit(100).all()
        group_url = f'https://www.douban.com/group/{self.group_id}'
        tab = self.browser.new_tab()
@@ -28,20 +26,29 @@ class DoubanGroupSeek:
        for ele_tr in ele_trs:
            topic_title = ele_tr.ele('tag:a').text
            topic_url = ele_tr.ele('tag:a').attr('href')
-            topics.append((topic_title, topic_url))
+            update_time = ele_tr.ele('.time').text
+            topics.append((topic_title, topic_url, update_time))
-        # 过滤掉已存在的URL
+        # 去掉两个置顶的帖子，根据title包含“置顶”来判断，因为置顶的帖子一般情况是组规则和公告
-        existing_urls = set(content.link for content in recent_contents)
+        topics = [(title, url, update_time) for title, url, update_time in topics if "置顶" not in title]
-        topics = [(title, url) for title, url in topics if url not in existing_urls]
+        
+        # 根据更新时间过滤，update_time格式为“10-18 12:34”，只保留24小时内的帖子
+        time_str = time.strftime("%m-%d %H:%M", time.localtime(time.time() - 24 * 3600))
+        topics = [(title, url, update_time) for title, url, update_time in topics if update_time >= time_str]
        # 打印要爬取的主题列表
-        logger.info(f"Found {len(topics)} new topics to crawl:")
+        logger.info(f"Found {len(topics)} potential new topics to crawl:")
        for topic_title, topic_url in topics:
            logger.info(f"标题：{topic_title} 链接：{topic_url}\n")
        results = []
-        for topic_title, topic_url in topics:
+        for topic_title, topic_url, update_time in topics:
-            logger.info(f"fetch 标题：{topic_title} 链接：{topic_url}\n")
+            # 检索数据库，根据topic_url查询是否已存在
+            existing_content = db.query(SourceContent.id).filter(SourceContent.link == topic_url).first()
+            if existing_content:
+                # logger.info(f"Topic already exists in database, skipping: {topic_title}:{topic_url}")
+                continue
+            logger.info(f"fetch 标题：{topic_title} 链接：{topic_url} 更新时间：{update_time}\n")
            tab.get(topic_url)
            tab.wait(30)  # 等待页面加载完成，时间可根据实际情况调整
            try:
--- a/task/__pycache__/manager_task.cpython-312.pyc
+++ b/task/__pycache__/manager_task.cpython-312.pyc