import peter

2025-11-12 20:42:16 +08:00
commit 8c1a740f0b
147 changed files with 2763 additions and 0 deletions
--- a/seek/zhihu_com/zhihu.py
+++ b/seek/zhihu_com/zhihu.py
@ -0,0 +1,173 @@
+import re
+
+from DrissionPage import Chromium
+from DrissionPage import ChromiumOptions
+from DrissionPage.errors import ElementNotFoundError
+
+from database.database import get_session
+from database.thotcontent.crud import create_contents_top3_if_url_not_exists
+from database.thotcontent.model import THotContent
+from database.thottopic.crud import create_topics_if_url_not_exists, update_hot_topic
+from database.thottopic.model import THotTopic
+from log.log_manager import logger
+
+def get_content_from_meta(metas, itemprop):
+    """Look up the ``content`` attribute of a ``<meta>`` tag by its ``itemprop``.
+
+    Args:
+        metas: iterable of element objects exposing ``attr(name)``.
+        itemprop: the ``itemprop`` attribute value to search for.
+
+    Returns:
+        The ``content`` attribute of the last element whose ``itemprop``
+        equals ``itemprop``, or ``None`` when no element matches.
+    """
+    matches = [tag.attr('content') for tag in metas if tag.attr('itemprop') == itemprop]
+    # When several tags share the itemprop, the last one wins.
+    return matches[-1] if matches else None
+
+
+class Zhihu:
+    """Scraper for Zhihu topics and their answers, driven through DrissionPage.
+
+    Attributes:
+        browser: the ``Chromium`` object every scrape opens its tab on.
+        tab: tab of the scrape currently in progress; ``None`` between scrapes.
+    """
+
+    # Canonical question URL; whatever follows the numeric id is dropped.
+    _QUESTION_URL_RE = re.compile(r'^https://www\.zhihu\.com/question/\d+')
+    # Leading number of an upvote label, e.g. "1.2" in "赞同 1.2 万".
+    _UPVOTE_RE = re.compile(r'(\d+\.?\d*)\s*万?')
+    # Comment total with or without thousands separators ("1,234" / "1234").
+    _COMMENT_RE = re.compile(r'\d+(?:,\d{3})*')
+    # (topic attribute, meta itemprop) pairs that hold integer counters.
+    _COUNT_METAS = (
+        ('content_count', 'answerCount'),
+        ('comment_count', 'commentCount'),
+        ('follower_count', 'zhihu:followerCount'),
+    )
+    _FEED_ITEM = '.Card TopstoryItem TopstoryItem-isRecommend'
+    _UPVOTE_BUTTON = '.Button VoteButton VoteButton--up FEfUrdfMIKpQDJDqkjte'
+    _COMMENT_BUTTON = '.Button ContentItem-action FEfUrdfMIKpQDJDqkjte Button--plain Button--withIcon Button--withLabel fEPKGkUK5jyc4fUuT0QP B46v1Ak6Gj5sL2JTS4PY RuuQ6TOh2cRzJr6WlyQp'
+
+    def __init__(self):
+        self.browser = Chromium()
+        self.tab = None
+
+    def _close_tab(self):
+        """Close the current tab, if any, and forget it.
+
+        ``self.tab`` is cleared before closing so that a later scrape whose
+        ``new_tab()`` fails never tries to close an already-closed tab.
+        """
+        tab, self.tab = self.tab, None
+        if tab:
+            tab.close()
+
+    @classmethod
+    def _parse_upvote_count(cls, text):
+        """Return the upvote total written in ``text``; 0 when it has no number.
+
+        A label containing "万" is read in units of ten thousand.
+        """
+        text = text or ''
+        match = cls._UPVOTE_RE.search(text)
+        if not match:
+            return 0
+        number = float(match.group(1))
+        if '万' in text:
+            # round() rather than int(): the float product may land just
+            # below the intended integer and int() would truncate it.
+            return round(number * 10000)
+        return int(number)
+
+    @classmethod
+    def _parse_comment_count(cls, text):
+        """Return the comment total written in ``text``; 0 when it has no number."""
+        match = cls._COMMENT_RE.search(text or '')
+        return int(match.group(0).replace(',', '')) if match else 0
+
+    @classmethod
+    def _apply_question_metas(cls, topic, metas):
+        """Copy the question's meta-tag values onto ``topic``.
+
+        A counter whose meta tag is missing or not an integer is logged and
+        left untouched instead of aborting the whole scrape.
+        """
+        for attr_name, itemprop in cls._COUNT_METAS:
+            raw = get_content_from_meta(metas, itemprop)
+            try:
+                setattr(topic, attr_name, int(raw))
+            except (TypeError, ValueError):
+                logger.error(f"话题属性缺失或无效：{itemprop}")
+        topic.keywords = get_content_from_meta(metas, 'keywords')
+        topic.date_created = get_content_from_meta(metas, 'dateCreated')
+        topic.date_modified = get_content_from_meta(metas, 'dateModified')
+
+    def get_topics(self):
+        """Collect question topics from the Zhihu front page.
+
+        Opens https://www.zhihu.com in a new tab, reads every card matching
+        the feed-item selector and keeps those whose title link is a question
+        URL, stored in its canonical ``https://www.zhihu.com/question/<id>``
+        form.
+
+        Returns:
+            A list of ``THotTopic`` objects with ``source``, ``topic`` and
+            ``url`` set. Errors are logged, not raised, so the list may be
+            empty or partial.
+        """
+        topics_result = []
+        try:
+            self.tab = self.browser.new_tab()
+            self.tab.get('https://www.zhihu.com')
+
+            # Wait for the feed cards to be rendered.
+            self.tab.wait.ele_displayed(self._FEED_ITEM)
+
+            hot_items = self.tab.ele('.Topstory-content').eles(self._FEED_ITEM)
+
+            for item in hot_items:
+                try:
+                    # Title and link live on the same <a> inside the <h2>.
+                    title_link = item.ele('tag:h2').ele('tag:a')
+                    # A card without an href must not abort the whole loop.
+                    matched = self._QUESTION_URL_RE.match(title_link.link or '')
+                    if not matched:
+                        # Not a question (article, video, ...): skip it.
+                        continue
+                    topic = THotTopic()
+                    topic.source = '知乎'
+                    topic.topic = title_link.text
+                    topic.url = matched.group(0)
+                    topics_result.append(topic)
+                except ElementNotFoundError as e:
+                    logger.error(f"元素缺失：{str(e)}")
+                except ValueError as e:
+                    logger.error(f"热度值转换失败：{str(e)}")
+
+        except ElementNotFoundError as e:
+            logger.error(f"热榜容器元素未找到：{str(e)}")
+        except Exception as e:
+            logger.error(f"获取热榜数据异常：{str(e)}")
+        finally:
+            self._close_tab()
+        return topics_result
+
+    def get_content(self, topic: THotTopic, db):
+        """Scrape a question page: update ``topic`` and collect its answers.
+
+        Args:
+            topic: the topic to visit; ``topic.url`` is opened and
+                ``topic.id`` is copied onto every collected answer. Its
+                counters, keywords, dates and description are filled in and
+                persisted through ``update_hot_topic``.
+            db: database session handed to ``update_hot_topic``.
+
+        Returns:
+            A list of ``THotContent`` objects, one per ``.List-item`` that
+            could be parsed. Errors are logged, not raised, so the list may
+            be empty or partial.
+        """
+        contents_result = []
+        try:
+            self.tab = self.browser.new_tab()
+            self.tab.get(topic.url)
+
+            # Ten scroll passes to trigger lazy loading of further answers;
+            # this bounds the time spent, it does not guarantee that every
+            # answer has been loaded.
+            for _ in range(10):
+                self.tab.wait.ele_displayed('.List-item')
+                self.tab.wait(3)
+                self.tab.scroll.to_bottom()
+                self.tab.wait(1)
+                self.tab.scroll.up(100)
+
+            # Question attributes are read from the first nine <meta> tags
+            # of the .QuestionPage element.
+            question_page = self.tab.ele('.QuestionPage')
+            metas = question_page.eles('tag:meta')[0:9]
+            self._apply_question_metas(topic, metas)
+            try:
+                topic.topic_description = question_page.ele('.RichText ztext css-ob6uua').text
+            except ElementNotFoundError:
+                # The description is optional; keep going without it.
+                logger.error("元素缺失：不存在topic_description")
+            update_hot_topic(db, topic)
+
+            content_items = self.tab.ele('.Question-mainColumn').eles('.List-item')
+
+            for item in content_items:
+                try:
+                    content = THotContent()
+                    content.topic_id = topic.id
+                    content.url = item.ele('.ContentItem-time').ele('tag:a').link
+                    content.content_upvote_count = self._parse_upvote_count(
+                        item.ele(self._UPVOTE_BUTTON).text)
+                    content.content_comment_count = self._parse_comment_count(
+                        item.ele(self._COMMENT_BUTTON).text)
+                    content.content = item.ele('.RichContent-inner').text
+                    contents_result.append(content)
+                except ElementNotFoundError as e:
+                    logger.error(f"元素缺失：{str(e)}")
+                except ValueError as e:
+                    logger.error(f"热度值转换失败：{str(e)}")
+
+        except ElementNotFoundError as e:
+            logger.error(f"话题页面元素未找到：{str(e)}")
+        except Exception as e:
+            logger.error(f"获取话题内容异常：{str(e)}")
+        finally:
+            self._close_tab()
+        return contents_result
+
+
+
+def get_topics() -> list:
+    """Create a fresh ``Zhihu`` scraper and return what its ``get_topics()`` yields."""
+    return Zhihu().get_topics()
+
+def gather_task():
+    """Task entry point: scrape topics, store them, then scrape their answers.
+
+    Inside one database session, the scraped topics are passed to
+    ``create_topics_if_url_not_exists``; for every topic that call returns,
+    the question page is scraped and the answers are passed to
+    ``create_contents_top3_if_url_not_exists``.
+    """
+    with get_session() as db:
+        scraper = Zhihu()
+        new_topics = create_topics_if_url_not_exists(db, scraper.get_topics())
+        for new_topic in new_topics:
+            logger.info(f"采集到话题：{new_topic}")
+            answers = scraper.get_content(new_topic, db)
+            create_contents_top3_if_url_not_exists(db, answers)
+
+
+if __name__ == "__main__":
+    # Manual smoke test: run one full gather pass and log its start and end.
+    logger.info("知乎采集测试")
+    gather_task()
+    logger.info("测试完成")
+