# peter/seek/douban_com/douban_group_seek.py

import json
from time import sleep
from DrissionPage import Chromium, ChromiumOptions
from config.database import SessionLocal
from models.source_content import SourceContent
from utils import logger


class DoubanGroupSeek:
    """Crawl new topics of one Douban group and store them as SourceContent rows.

    The crawler drives a Chromium instance through DrissionPage: it reads the
    topic table on the group page, skips links that already appear among the
    most recently stored rows, opens every remaining topic, and saves the
    extracted post and comments as a JSON string.
    """

    def __init__(self, group_id):
        """Attach to Chromium on local port 9333 and remember the target group.

        Args:
            group_id: Value interpolated into the group URL
                ``https://www.douban.com/group/{group_id}``.
        """
        co = ChromiumOptions().set_local_port(9333)
        self.browser = Chromium(addr_or_opts=co)
        self.group_id = group_id

    def seek(self):
        """Crawl topics that are not stored yet and persist them in one commit.

        Topics that fail to parse are logged and skipped. Errors raised while
        reading the group page, navigating to a topic, or committing propagate
        to the caller; in every case the browser tab and the database session
        are released before this method exits.
        """
        db = SessionLocal()
        tab = None
        try:
            # Only the 100 newest rows are consulted for de-duplication, so an
            # older link can be crawled and stored again.
            recent_contents = (
                db.query(SourceContent)
                .order_by(SourceContent.id.desc())
                .limit(100)
                .all()
            )
            existing_urls = {content.link for content in recent_contents}

            tab = self.browser.new_tab()
            topics = [
                (topic_title, topic_url)
                for topic_title, topic_url in self._list_topics(tab)
                if topic_url not in existing_urls
            ]

            # Log the full work list up front.
            logger.info(f"Found {len(topics)} new topics to crawl:")
            for topic_title, topic_url in topics:
                logger.info(f"标题：{topic_title} 链接：{topic_url}\n")

            results = []
            for topic_title, topic_url in topics:
                logger.info(f"fetch 标题：{topic_title} 链接：{topic_url}\n")
                tab.get(topic_url)
                # Fixed pause after navigation so the page can finish loading;
                # tune the duration to the real page behaviour.
                tab.wait(30)
                try:
                    payload = self._parse_topic(tab, topic_title, topic_url)
                except Exception as e:
                    # One broken topic must not abort the whole crawl.
                    logger.error(f"Error processing topic {topic_title}:{topic_url}: {str(e)}")
                    continue
                results.append((topic_url, payload))

            # Persist everything that was crawled successfully.
            for topic_url, data in results:
                db.add(SourceContent(
                    link=topic_url,
                    platform='douban',
                    content=data
                ))
            db.commit()
        finally:
            # Release both resources on success and on failure alike.
            # NOTE(review): assumes SessionLocal yields a SQLAlchemy-style
            # session whose close() also discards an uncommitted transaction.
            try:
                if tab is not None:
                    tab.close()
            finally:
                db.close()

    def _list_topics(self, tab):
        """Open the group page in ``tab`` and return its ``(title, url)`` pairs."""
        tab.get(f'https://www.douban.com/group/{self.group_id}')
        ele_table = tab.ele('tag:table@class=olt')
        topics = []
        # Rows carrying class "th" are excluded by the locator.
        for ele_tr in ele_table.eles('tag:tr@!class=th'):
            ele_link = ele_tr.ele('tag:a')
            topics.append((ele_link.text, ele_link.attr('href')))
        return topics

    def _parse_topic(self, tab, topic_title, topic_url):
        """Serialise the topic currently loaded in ``tab`` to a JSON string.

        ``topic_title`` and ``topic_url`` are only used for log messages.
        Any lookup error on the main post propagates to the caller.
        """
        title = tab.title
        ele_article = tab.ele('.article')

        # Post body, creation time, IP location and author.
        ele_topic_content = ele_article.ele('#topic-content')
        ele_topic_doc = ele_topic_content.ele('.topic-doc')
        content = ele_topic_doc.ele('.topic-content').text
        post_time = ele_topic_doc.ele('.create-time').text
        ip_location = ele_topic_doc.ele('.ip-location').text
        author = ele_topic_doc.ele('.from').text

        comments = self._parse_comments(ele_article, topic_title, topic_url)

        return json.dumps({
            "title": title,
            "content": content,
            "post_time": post_time,
            "ip_location": ip_location,
            "author": author,
            "comments": comments
        }, ensure_ascii=False)

    def _parse_comments(self, ele_article, topic_title, topic_url):
        """Return the comments under ``ele_article`` as a list of dicts.

        A topic may have no comment section at all, so any failure is logged
        as a warning; the comments gathered before the failure are returned.
        """
        comments = []
        try:
            ele_comments = ele_article.ele('#comments')
            for ele_comment in ele_comments.eles('tag:li'):
                comment_content = ele_comment.ele('.reply-content').text
                comment_time = ele_comment.ele('.pubtime').text
                comment_author = ele_comment.ele('tag:h4').child().text
                comments.append({
                    "comment_content": comment_content,
                    "comment_time": comment_time,
                    "comment_author": comment_author
                })
        except Exception as e:
            logger.warning(f"No comments found for topic {topic_title}:{topic_url}: {str(e)}")
        return comments