"""Collect Zhihu feed topics and the answers under each question page."""
import re

from DrissionPage import Chromium
from DrissionPage import ChromiumOptions
from DrissionPage.errors import ElementNotFoundError

from database.database import get_session
from database.thotcontent.crud import create_contents_top3_if_url_not_exists
from database.thotcontent.model import THotContent
from database.thottopic.crud import create_topics_if_url_not_exists, update_hot_topic
from database.thottopic.model import THotTopic
from log.log_manager import logger

# Canonical question URL; whatever follows the numeric id is dropped.
_QUESTION_URL_RE = re.compile(r'^https://www\.zhihu\.com/question/\d+')

# A count as rendered on the page: digits with optional thousands separators
# and an optional decimal part, optionally followed by the "万" (x10,000) unit.
_COUNT_RE = re.compile(r'(\d[\d,]*(?:\.\d+)?)\s*(万)?')

# Element locators, kept verbatim from the original implementation.
# NOTE(review): several of these contain generated-looking class names
# (e.g. "css-ob6uua") that presumably change when Zhihu redeploys - confirm.
_FEED_ITEM_LOCATOR = '.Card TopstoryItem TopstoryItem-isRecommend'
_DESCRIPTION_LOCATOR = '.RichText ztext css-ob6uua'
_UPVOTE_BUTTON_LOCATOR = '.Button VoteButton VoteButton--up FEfUrdfMIKpQDJDqkjte'
_COMMENT_BUTTON_LOCATOR = (
    '.Button ContentItem-action FEfUrdfMIKpQDJDqkjte Button--plain '
    'Button--withIcon Button--withLabel fEPKGkUK5jyc4fUuT0QP '
    'B46v1Ak6Gj5sL2JTS4PY RuuQ6TOh2cRzJr6WlyQp'
)


def get_content_from_meta(metas, itemprop):
    """Return the ``content`` attribute of the meta tag with the given itemprop.

    Args:
        metas: Iterable of elements exposing ``attr(name)``.
        itemprop: Value of the ``itemprop`` attribute to look for.

    Returns:
        The ``content`` attribute of the last matching element, or ``None``
        when no element matches.
    """
    content = None
    for meta in metas:
        # No early exit: when several tags match, the last one wins.
        if meta.attr('itemprop') == itemprop:
            content = meta.attr('content')
    return content


def _parse_count(text):
    """Parse a displayed count such as "赞同 1.2 万" or "1,234 条评论".

    Returns:
        The count as an int, or 0 when ``text`` contains no number.
    """
    match = _COUNT_RE.search(text or '')
    if match is None:
        return 0
    value = float(match.group(1).replace(',', ''))
    if match.group(2):
        value *= 10000
    # Round instead of truncating: a float product can land marginally below
    # the exact integer, and int() would then drop a whole unit.
    return round(value)


class Zhihu:
    """Zhihu scraper that drives a browser through DrissionPage."""

    def __init__(self):
        self.browser = Chromium()
        # Tab used by the call currently in progress; None between calls.
        self.tab = None

    def _close_tab(self):
        """Close the current tab, if any, and forget it."""
        # Clear the attribute first so a tab is never closed twice, even if
        # close() raises or a later new_tab() call fails.
        tab, self.tab = self.tab, None
        if tab is not None:
            tab.close()

    def get_topics(self):
        """Collect question topics from the Zhihu home page feed.

        Returns:
            A list of ``THotTopic`` objects with ``source``, ``topic`` and
            ``url`` set. Items whose link is not a question URL are skipped.
            Failures are logged, and whatever was collected so far is returned.
        """
        topics_result = []
        try:
            self.tab = self.browser.new_tab()
            self.tab.get('https://www.zhihu.com')
            # Wait for the feed items to be displayed before reading them.
            self.tab.wait.ele_displayed(_FEED_ITEM_LOCATOR)
            hot_items = self.tab.ele('.Topstory-content').eles(_FEED_ITEM_LOCATOR)
            for item in hot_items:
                try:
                    anchor = item.ele('tag:h2').ele('tag:a')
                    # `link` may be empty for an anchor without href; treat it
                    # as a non-question item instead of aborting the whole loop.
                    match = _QUESTION_URL_RE.match(anchor.link or '')
                    if match is None:
                        continue
                    topic = THotTopic()
                    topic.source = '知乎'
                    topic.topic = anchor.text
                    topic.url = match.group(0)
                    topics_result.append(topic)
                except ElementNotFoundError as e:
                    logger.error(f"元素缺失:{str(e)}")
                except ValueError as e:
                    logger.error(f"热度值转换失败:{str(e)}")
        except ElementNotFoundError as e:
            logger.error(f"热榜容器元素未找到:{str(e)}")
        except Exception as e:
            logger.error(f"获取热榜数据异常:{str(e)}")
        finally:
            self._close_tab()
        return topics_result

    def get_content(self, topic: THotTopic, db):
        """Collect the answers listed on a question page.

        Also fills the topic's metadata from the page and persists it through
        ``update_hot_topic``.

        Args:
            topic: Topic whose ``url`` is opened; it is mutated in place.
            db: Database session handed to ``update_hot_topic``.

        Returns:
            A list of ``THotContent`` objects, one per answer that could be
            parsed. Failures are logged, and whatever was collected so far is
            returned.
        """
        contents_result = []
        try:
            self.tab = self.browser.new_tab()
            self.tab.get(topic.url)
            self._load_more_answers()
            self._update_topic_from_page(topic, db)
            content_items = self.tab.ele('.Question-mainColumn').eles('.List-item')
            for item in content_items:
                try:
                    contents_result.append(self._build_content(item, topic.id))
                except ElementNotFoundError as e:
                    logger.error(f"元素缺失:{str(e)}")
                except ValueError as e:
                    logger.error(f"热度值转换失败:{str(e)}")
        except ElementNotFoundError as e:
            logger.error(f"话题页面元素未找到:{str(e)}")
        except Exception as e:
            logger.error(f"获取话题内容异常:{str(e)}")
        finally:
            self._close_tab()
        return contents_result

    def _load_more_answers(self, rounds=10):
        """Scroll the page repeatedly so further answers get loaded."""
        for _ in range(rounds):
            self.tab.wait.ele_displayed('.List-item')
            self.tab.wait(3)
            self.tab.scroll.to_bottom()
            self.tab.wait(1)
            # NOTE(review): the small upward scroll presumably re-triggers the
            # page's lazy loading - confirm.
            self.tab.scroll.up(100)

    def _update_topic_from_page(self, topic, db):
        """Copy question metadata from the page onto ``topic`` and save it."""
        question_page = self.tab.ele('.QuestionPage')
        # Only the first nine meta tags of the question page are consulted.
        metas = question_page.eles('tag:meta')[0:9]
        answer_count = get_content_from_meta(metas, 'answerCount')
        comment_count = get_content_from_meta(metas, 'commentCount')
        keywords = get_content_from_meta(metas, 'keywords')
        date_created = get_content_from_meta(metas, 'dateCreated')
        date_modified = get_content_from_meta(metas, 'dateModified')
        follower_count = get_content_from_meta(metas, 'zhihu:followerCount')

        # A missing count is None here and makes int() raise TypeError; that
        # propagates to the caller's generic handler, as in the original code.
        topic.content_count = int(answer_count)
        topic.comment_count = int(comment_count)
        topic.follower_count = int(follower_count)
        topic.keywords = keywords
        topic.date_created = date_created
        topic.date_modified = date_modified
        try:
            topic.topic_description = question_page.ele(_DESCRIPTION_LOCATOR).text
        except ElementNotFoundError:
            # A missing description is logged and the update still proceeds.
            logger.error("元素缺失:不存在topic_description")
        update_hot_topic(db, topic)

    def _build_content(self, item, topic_id):
        """Build a ``THotContent`` from one answer list item."""
        content = THotContent()
        content.topic_id = topic_id
        content.url = item.ele('.ContentItem-time').ele('tag:a').link
        content.content_upvote_count = _parse_count(
            item.ele(_UPVOTE_BUTTON_LOCATOR).text)
        content.content_comment_count = _parse_count(
            item.ele(_COMMENT_BUTTON_LOCATOR).text)
        content.content = item.ele('.RichContent-inner').text
        return content


def get_topics() -> list:
    """Return the topics collected by a fresh ``Zhihu`` instance."""
    zhihu = Zhihu()
    topics = zhihu.get_topics()
    return topics


def gather_task():
    """Task entry point: collect topics, store them, then store their answers."""
    with get_session() as db:
        zhihu = Zhihu()
        topics = zhihu.get_topics()
        # Answers are fetched only for the topics this call returns.
        inserted_topics = create_topics_if_url_not_exists(db, topics)
        for topic in inserted_topics:
            logger.info(f"采集到话题:{topic}")
            contents = zhihu.get_content(topic, db)
            create_contents_top3_if_url_not_exists(db, contents)


if __name__ == '__main__':
    # Manual smoke test: run one full collection pass.
    logger.info('知乎采集测试')
    gather_task()
    logger.info('测试完成')