peter/seek/douban_com/douban_group_seek.py

import json
import time
from DrissionPage import Chromium, ChromiumOptions
from psycopg import OperationalError
from config.database import SessionLocal
from models.source_content import SourceContent
from utils import logger


class DoubanGroupSeek:
    def __init__(self, group_id):
        co = ChromiumOptions().set_local_port(9333)
        self.browser = Chromium(addr_or_opts=co)
        self.group_id = group_id

    def fetch_with_retry(self, session, model, link, max_retries=3, base_delay=1):
        """
        带重试的数据库查询函数
        :param session: SQLAlchemy 会话
        :param model: 模型类
        :param link: 要查询的链接
        :param max_retries: 最大重试次数
        :param base_delay: 初始延迟时间（秒）
        :return: 查询结果或 None
        """
        for attempt in range(1, max_retries + 1):
            try:
                # 执行查询
                result = session.query(model.id).filter(model.link == link).first()
                return result  # 成功则返回结果
            except OperationalError as e:
                # 打印错误信息（可选）
                logger.error(f"数据库查询失败 (尝试 {attempt}/{max_retries}): {e}")
                # 回滚当前会话，避免事务挂起影响后续操作
                session.rollback()
                if attempt == max_retries:
                    # 最后一次重试失败，抛出异常或记录错误后返回 None
                    logger.info(f"已达到最大重试次数，放弃查询链接: {link}")
                    raise  # 或者返回 None，根据业务决定
                # 计算等待时间：指数退避
                sleep_time = base_delay * (2 ** (attempt - 1))
                time.sleep(sleep_time)
            except Exception as e:
                # 如果还想捕获其他数据库错误，可以统一处理
                logger.error(f"数据库错误: {e}")
                session.rollback()
                raise  # 非临时性错误，直接抛出
        return None  # 正常情况下不会执行到这里

    def seek(self):
        db = SessionLocal()

        group_url = f'https://www.douban.com/group/{self.group_id}'
        tab = self.browser.new_tab()
        tab.get(group_url)

        title = tab.title
        ele_table = tab.ele('tag:table@class=olt')
        ele_trs = ele_table.eles('tag:tr@!class=th')
        topics = []
        for ele_tr in ele_trs:
            topic_title = ele_tr.ele('tag:a').text
            topic_url = ele_tr.ele('tag:a').attr('href')
            update_time = ele_tr.ele('.time').text
            topics.append((topic_title, topic_url, update_time))

        # 去掉两个置顶的帖子，根据title包含“置顶”来判断，因为置顶的帖子一般情况是组规则和公告
        topics = [(title, url, update_time) for title, url, update_time in topics if "置顶" not in title]

        # 根据更新时间过滤，update_time格式为“10-18 12:34”，只保留24小时内的帖子
        time_str = time.strftime("%m-%d %H:%M", time.localtime(time.time() - 24 * 3600))
        topics = [(title, url, update_time) for title, url, update_time in topics if update_time >= time_str]

        # 打印要爬取的主题列表
        logger.info(f"Found {len(topics)} potential new topics to crawl:")

        results = []
        for topic_title, topic_url, update_time in topics:
            # 检索数据库，根据topic_url查询是否已存在
            # existing_content = db.query(SourceContent.id).filter(SourceContent.link == topic_url).first()
            existing_content = self.fetch_with_retry(db, SourceContent, topic_url)
            if existing_content:
                # logger.info(f"Topic already exists in database, skipping: {topic_title}:{topic_url}")
                continue

            logger.info(f"fetch 标题：{topic_title} 链接：{topic_url} 更新时间：{update_time}\n")

            tab.get(topic_url)
            tab.wait(30)  # 等待页面加载完成，时间可根据实际情况调整
            try:
                title = tab.title
                ele_article = tab.ele('.article')

                # 获取帖子内容、发布时间、IP地址位置、作者等信息
                ele_topic_content = ele_article.ele('#topic-content')
                ele_topic_doc = ele_topic_content.ele('.topic-doc')
                content = ele_topic_doc.ele('.topic-content').text
                post_time = ele_topic_doc.ele('.create-time').text
                ip_location = ele_topic_doc.ele('.ip-location').text
                author = ele_topic_doc.ele('.from').text

                # 获取评论列表
                comments = []
                # 评论不一定存在，需先判断
                try:
                    ele_comments = ele_article.ele('#comments')
                    ele_comments_list = ele_comments.eles('tag:li')
                    for ele_comment in ele_comments_list:
                        comment_content = ele_comment.ele('.reply-content').text
                        comment_time = ele_comment.ele('.pubtime').text
                        comment_author = ele_comment.ele('tag:h4').child().text
                        comments.append({
                            "comment_content": comment_content,
                            "comment_time": comment_time,
                            "comment_author": comment_author
                        })
                except Exception as e:
                    logger.warning(f"No comments found for topic {topic_title}:{topic_url}: {str(e)}")

                results.append((topic_url, json.dumps({
                        "title": title,
                        "content": content,
                        "post_time": post_time,
                        "ip_location": ip_location,
                        "author": author,
                        "comments": comments
                    }, ensure_ascii=False)))
            except Exception as e:
                logger.error(f"Error processing topic {topic_title}:{topic_url}: {str(e)}")
                continue

        # 存入数据库
        for topic_url, data in results:
            source_content = SourceContent(
                link=topic_url,
                platform='douban',
                content=data
            )
            db.add(source_content)
        db.commit()
        tab.close()