from DrissionPage import Chromium
from DrissionPage import ChromiumOptions
from DrissionPage.errors import ElementNotFoundError

from log.log_manager import logger


def get_content_from_meta(metas, itemprop):
    """Return the ``content`` attribute of the meta tag whose ``itemprop`` matches.

    Args:
        metas: Iterable of element objects exposing ``attr(name)``.
        itemprop: Value of the ``itemprop`` attribute to look for.

    Returns:
        The ``content`` attribute of the matching element, or ``None`` when no
        element matches. If several elements match, the last one wins.
    """
    content = None
    for meta in metas:
        if meta.attr('itemprop') == itemprop:
            content = meta.attr('content')
    return content


class ZhihuHot:
    """Scrape the Zhihu hot list and the answers of individual questions."""

    HOT_URL = 'https://www.zhihu.com/hot'

    # Number of scroll-to-bottom rounds used to trigger lazy loading of answers.
    SCROLL_ROUNDS = 10
    # Answers shorter/longer than these bounds (in characters) are skipped.
    MIN_ANSWER_CHARS = 100
    MAX_ANSWER_CHARS = 1000
    # Collection stops once the accumulated answer length exceeds this value.
    MAX_TOTAL_CHARS = 5000

    # (result key, meta itemprop) pairs read from the question page.
    _META_FIELDS = (
        ('title', 'name'),
        ('answer_count', 'answerCount'),
        ('comment_count', 'commentCount'),
        ('keywords', 'keywords'),
        ('date_created', 'dateCreated'),
        ('date_modified', 'dateModified'),
        ('follower_count', 'zhihu:followerCount'),
    )

    def __init__(self):
        """Create the browser object used by all scraping methods."""
        self.browser = Chromium()

    def get_topic_url_list(self) -> list:
        """Collect the link of every entry on the Zhihu hot list.

        Returns:
            A list with the link of each hot-list item that could be read.
            Errors are logged, never raised; on failure the list contains
            whatever was collected before the error (possibly nothing).
        """
        _topic_url_list = []
        _tab = None
        try:
            _tab = self.browser.new_tab()
            # Open the hot-list page and wait until its entries are displayed.
            _tab.get(self.HOT_URL)
            _tab.wait.ele_displayed('.HotItem')

            hot_items = _tab.ele('.HotList-list').eles('.HotItem')
            for item in hot_items:
                try:
                    _topic_url_list.append(item.ele('tag:a').link)
                except ElementNotFoundError as e:
                    logger.error(f"元素缺失:{e}")
                except ValueError as e:
                    logger.error(f"热度值转换失败:{e}")
        except ElementNotFoundError as e:
            logger.error(f"热榜容器元素未找到:{e}")
        except Exception as e:
            # Best-effort scraper: log and return what was gathered so far.
            logger.error(f"获取热榜数据异常:{e}")
        finally:
            if _tab is not None:
                _tab.close()
        return _topic_url_list

    def get_content(self, url) -> dict:
        """Scrape a question page: its metadata, description and answers.

        Args:
            url: Address of the Zhihu question page.

        Returns:
            A dict with the keys ``title``, ``topic_description``, ``keywords``,
            ``url``, ``contents``, ``date_created``, ``date_modified``,
            ``follower_count``, ``answer_count`` and ``comment_count``.
            Errors are logged, never raised; fields that could not be read
            keep their defaults (``None``, ``""`` for the description, ``[]``
            for the contents).
        """
        # Every field gets a default up front, so a failure part-way through
        # still returns a well-formed dict and never leaks values scraped
        # from a previously processed question.
        result = {
            'title': None,
            'topic_description': "",
            'keywords': None,
            'url': url,
            'contents': [],
            'date_created': None,
            'date_modified': None,
            'follower_count': None,
            'answer_count': None,
            'comment_count': None,
        }
        _tab = None
        try:
            _tab = self.browser.new_tab()
            _tab.get(url)
            self._scroll_to_load_answers(_tab)

            # The question attributes are read from the first nine meta tags
            # of the QuestionPage element.
            question_page = _tab.ele('.QuestionPage')
            metas = question_page.eles('tag:meta')[0:9]
            for key, itemprop in self._META_FIELDS:
                result[key] = get_content_from_meta(metas, itemprop)

            result['topic_description'] = self._read_topic_description(question_page)

            content_items = _tab.ele('.Question-mainColumn').eles('.List-item')
            self._collect_answers(content_items, result['contents'])
        except ElementNotFoundError as e:
            logger.error(f"问题页面元素未找到:{e}")
        except Exception as e:
            # Best-effort scraper: log and return what was gathered so far.
            logger.error(f"获取话题内容异常:{e}")
        finally:
            if _tab is not None:
                _tab.close()
        return result

    def _scroll_to_load_answers(self, tab):
        """Scroll to the bottom repeatedly so more answers get loaded."""
        for _ in range(self.SCROLL_ROUNDS):
            tab.wait.ele_displayed('.List-item')
            tab.wait(3)
            tab.scroll.to_bottom()
            tab.wait(1)
            # Nudge back up a little after reaching the bottom.
            tab.scroll.up(100)

    @staticmethod
    def _read_topic_description(question_page):
        """Return the question description text, or ``""`` when it is absent."""
        try:
            # Expand a collapsed description first, if the "more" button exists.
            unfold_button = question_page.ele('.^Button QuestionRichText-more')
            if unfold_button:
                unfold_button.click()
            return question_page.ele('.^QuestionRichText').text
        except ElementNotFoundError:
            logger.error("元素缺失:不存在topic_description")
            return ""

    def _collect_answers(self, content_items, contents):
        """Append answers of acceptable length to *contents*, in place.

        Appending in place keeps already collected answers available to the
        caller even if a later item raises an unexpected error.
        """
        total_characters = 0
        for item in content_items:
            try:
                content = item.ele('.RichContent-inner').text
                content_len = len(content)
                if content_len > self.MAX_ANSWER_CHARS or content_len < self.MIN_ANSWER_CHARS:
                    logger.error(f"skip本条内容,内容长度:{content_len}")
                    continue
                # The budget is checked before adding, so the answer that
                # crosses the limit is still kept and the next one stops the loop.
                if total_characters > self.MAX_TOTAL_CHARS:
                    logger.error("contents_result长度超过5000,跳出循环")
                    break
                total_characters += content_len
                contents.append(content)
                logger.info(f"contents_result长度:{len(contents)}")
            except ElementNotFoundError as e:
                logger.error(f"元素缺失:{e}")
            except ValueError as e:
                logger.error(f"内容解析失败:{e}")


if __name__ == '__main__':
    # Manual smoke test: scrape a single question and print the result.
    logger.info('知乎采集测试')
    zhihu_hot = ZhihuHot()
    result = zhihu_hot.get_content('https://www.zhihu.com/question/14351228309')
    print(len(result))
    print(result)
    # To try the hot list instead:
    # topic_url_list = zhihu_hot.get_topic_url_list()
    # print(topic_url_list)
    logger.info('测试完成')