# peter/seek/douban_com/douban_group_seek.py

import json
import time
from datetime import datetime, timedelta

from DrissionPage import Chromium, ChromiumOptions

from config.database import SessionLocal
from models.source_content import SourceContent
from utils import logger


class DoubanGroupSeek:
    """Crawl recently updated topics of one Douban group into the database.

    The crawler drives a Chromium instance through DrissionPage (local port
    9333), reads the group's topic table, fetches every recent topic that is
    not stored yet and saves each one as a ``SourceContent`` row whose
    ``content`` is a JSON document.
    """

    # Seconds to pause after opening a topic page before scraping it.
    PAGE_WAIT_SECONDS = 30
    # Only topics updated within this many hours are crawled.
    MAX_AGE_HOURS = 24
    # Topics whose title contains this marker are pinned posts (usually group
    # rules and announcements) and are skipped.
    PINNED_MARKER = "置顶"

    def __init__(self, group_id):
        """Attach to the browser and remember which group to crawl.

        Args:
            group_id: Identifier used to build the group URL
                ``https://www.douban.com/group/<group_id>``.
        """
        co = ChromiumOptions().set_local_port(9333)
        self.browser = Chromium(addr_or_opts=co)
        self.group_id = group_id

    @staticmethod
    def _parse_update_time(text, now):
        """Convert a topic-list timestamp into a ``datetime``.

        Two shapes are understood: a timestamp carrying its own year
        (``"2023-10-18"`` / ``"2023-10-18 12:34"``) and the year-less
        ``"10-18 12:34"`` form. For the year-less form the year is inferred
        from ``now``; a result lying more than a day in the future is taken to
        belong to the previous year, which keeps the New Year boundary correct.

        Args:
            text: Timestamp text as shown in the topic list.
            now: Reference ``datetime`` used to infer a missing year.

        Returns:
            The parsed ``datetime``, or ``None`` when ``text`` matches no
            known format.
        """
        text = text.strip()

        # NOTE(review): the year-prefixed forms are presumably what the site
        # shows for topics from earlier years - confirm against the live page.
        for full_format in ("%Y-%m-%d %H:%M", "%Y-%m-%d"):
            try:
                return datetime.strptime(text, full_format)
            except ValueError:
                continue

        # Prepend the year before parsing so that "02-29" is judged against a
        # real year instead of strptime's default year 1900.
        try:
            parsed = datetime.strptime(f"{now.year}-{text}", "%Y-%m-%d %H:%M")
        except ValueError:
            return None

        if parsed > now + timedelta(days=1):
            try:
                parsed = parsed.replace(year=now.year - 1)
            except ValueError:
                # Feb 29 does not exist in the previous year.
                return None
        return parsed

    @classmethod
    def _select_recent_topics(cls, topics, now):
        """Drop pinned topics and topics older than ``MAX_AGE_HOURS``.

        Args:
            topics: Iterable of ``(title, url, update_time_text)`` tuples.
            now: Reference ``datetime`` for the age cutoff.

        Returns:
            A list with the tuples that survive both filters, in input order.
            Topics whose timestamp cannot be parsed are dropped.
        """
        # Listing timestamps have minute precision, so truncate the cutoff to
        # the minute to keep the boundary inclusive.
        cutoff = (now - timedelta(hours=cls.MAX_AGE_HOURS)).replace(second=0, microsecond=0)

        selected = []
        for topic in topics:
            topic_title, _topic_url, update_time = topic
            if cls.PINNED_MARKER in topic_title:
                continue
            updated_at = cls._parse_update_time(update_time, now)
            if updated_at is not None and updated_at >= cutoff:
                selected.append(topic)
        return selected

    def _list_topics(self, tab):
        """Open the group page and return ``(title, url, update_time)`` per table row."""
        tab.get(f'https://www.douban.com/group/{self.group_id}')

        ele_table = tab.ele('tag:table@class=olt')
        topics = []
        # Rows with class "th" are excluded by the selector.
        for ele_tr in ele_table.eles('tag:tr@!class=th'):
            ele_link = ele_tr.ele('tag:a')
            topics.append((ele_link.text, ele_link.attr('href'), ele_tr.ele('.time').text))
        return topics

    @staticmethod
    def _scrape_comments(ele_article, topic_title, topic_url):
        """Collect the comments under a topic on a best-effort basis.

        A topic does not necessarily have comments, so any failure is logged
        as a warning and whatever was collected so far is returned.
        """
        comments = []
        try:
            ele_comments = ele_article.ele('#comments')
            for ele_comment in ele_comments.eles('tag:li'):
                comments.append({
                    "comment_content": ele_comment.ele('.reply-content').text,
                    "comment_time": ele_comment.ele('.pubtime').text,
                    "comment_author": ele_comment.ele('tag:h4').child().text,
                })
        except Exception as e:
            logger.warning(f"No comments found for topic {topic_title}:{topic_url}: {str(e)}")
        return comments

    def _scrape_topic(self, tab, topic_title, topic_url):
        """Scrape the topic page currently loaded in ``tab`` into a JSON string.

        The document holds the page title, the post body, its creation time,
        IP location and author, plus the list of comments.
        """
        title = tab.title
        ele_article = tab.ele('.article')

        ele_topic_content = ele_article.ele('#topic-content')
        ele_topic_doc = ele_topic_content.ele('.topic-doc')
        content = ele_topic_doc.ele('.topic-content').text
        post_time = ele_topic_doc.ele('.create-time').text
        ip_location = ele_topic_doc.ele('.ip-location').text
        author = ele_topic_doc.ele('.from').text

        comments = self._scrape_comments(ele_article, topic_title, topic_url)

        return json.dumps({
            "title": title,
            "content": content,
            "post_time": post_time,
            "ip_location": ip_location,
            "author": author,
            "comments": comments
        }, ensure_ascii=False)

    def seek(self):
        """Crawl the group's recent, not-yet-stored topics and persist them.

        Topics are fetched one at a time; a topic that fails to scrape is
        logged and skipped. All scraped topics are written in one commit at
        the end. The browser tab and the database session are always
        released, and the session is rolled back if anything fails.
        """
        db = SessionLocal()
        tab = self.browser.new_tab()
        try:
            topics = self._select_recent_topics(self._list_topics(tab), datetime.now())

            logger.info(f"Found {len(topics)} potential new topics to crawl:")

            results = []
            for topic_title, topic_url, update_time in topics:
                # Skip topics whose link is already stored.
                existing_content = db.query(SourceContent.id).filter(SourceContent.link == topic_url).first()
                if existing_content:
                    continue

                logger.info(f"fetch 标题：{topic_title} 链接：{topic_url} 更新时间：{update_time}\n")

                tab.get(topic_url)
                # Give the page time to finish loading before scraping.
                tab.wait(self.PAGE_WAIT_SECONDS)
                try:
                    data = self._scrape_topic(tab, topic_title, topic_url)
                except Exception as e:
                    logger.error(f"Error processing topic {topic_title}:{topic_url}: {str(e)}")
                    continue
                results.append((topic_url, data))

            # Persist everything that was scraped in a single transaction.
            for topic_url, data in results:
                db.add(SourceContent(
                    link=topic_url,
                    platform='douban',
                    content=data
                ))
            db.commit()
        except Exception:
            db.rollback()
            raise
        finally:
            try:
                tab.close()
            finally:
                db.close()