Files
peter/seek/zhihu_com/zhihu_hot.py
konjacpotato 8c1a740f0b import peter
2025-11-12 20:42:16 +08:00

157 lines
6.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from DrissionPage import Chromium
from DrissionPage import ChromiumOptions
from DrissionPage.errors import ElementNotFoundError
from log.log_manager import logger
def get_content_from_meta(metas, itemprop):
    """Return the ``content`` attribute of the meta element matching *itemprop*.

    Each element in *metas* is queried through its ``attr(name)`` method.
    When several elements carry the requested ``itemprop`` the last one
    wins; when none does, ``None`` is returned.
    """
    matching_contents = [
        node.attr('content')
        for node in metas
        if node.attr('itemprop') == itemprop
    ]
    if not matching_contents:
        return None
    return matching_contents[-1]
class ZhihuHot:
    """Scrape the Zhihu hot list and individual question pages.

    All page access goes through a single ``Chromium`` browser object held in
    ``self.browser``; every public method opens its own tab and closes it
    again before returning.
    """

    # Number of "wait, scroll to bottom, scroll up" rounds performed on a
    # question page before its answers are read.
    SCROLL_ROUNDS = 10
    # Answers shorter than MIN_ANSWER_CHARS or longer than MAX_ANSWER_CHARS
    # are skipped.
    MIN_ANSWER_CHARS = 100
    MAX_ANSWER_CHARS = 1000
    # Collection stops once the accumulated answer length exceeds this value.
    MAX_TOTAL_CHARS = 5000

    # (result key, meta ``itemprop``) pairs read from the question page.
    _META_FIELDS = (
        ('title', 'name'),
        ('answer_count', 'answerCount'),
        ('comment_count', 'commentCount'),
        ('keywords', 'keywords'),
        ('date_created', 'dateCreated'),
        ('date_modified', 'dateModified'),
        ('follower_count', 'zhihu:followerCount'),
    )

    def __init__(self):
        """Create the browser object used by all scraping methods."""
        # The browser is created with its default options; the previously
        # constructed ChromiumOptions object was never passed to it.
        self.browser = Chromium()

    def get_topic_url_list(self) -> list:
        """Return the link of every entry on the Zhihu hot list.

        Failures are logged rather than raised: an entry whose link element
        is missing is skipped, and any other error ends the collection early.
        Whatever was gathered up to that point is returned (possibly an empty
        list).
        """
        _topic_url_list = []
        _tab = None
        try:
            _tab = self.browser.new_tab()
            # Open the hot-list page and wait for its entries to show up.
            _tab.get('https://www.zhihu.com/hot')
            _tab.wait.ele_displayed('.HotItem')
            hot_items = _tab.ele('.HotList-list').eles('.HotItem')
            for item in hot_items:
                try:
                    # The first <a> inside an entry carries the question link.
                    _topic_url_list.append(item.ele('tag:a').link)
                except ElementNotFoundError as e:
                    logger.error(f"元素缺失:{str(e)}")
                except ValueError as e:
                    logger.error(f"热度值转换失败:{str(e)}")
        except ElementNotFoundError as e:
            logger.error(f"热榜容器元素未找到:{str(e)}")
        except Exception as e:
            logger.error(f"获取热榜数据异常:{str(e)}")
        finally:
            # Always release the tab, even when scraping failed midway.
            if _tab is not None:
                _tab.close()
        return _topic_url_list

    def get_content(self, url):
        """Scrape one question page and return its metadata and answers.

        Args:
            url: Address of the question page to open.

        Returns:
            A dict with the keys ``title``, ``topic_description``,
            ``keywords``, ``url``, ``contents``, ``date_created``,
            ``date_modified``, ``follower_count``, ``answer_count`` and
            ``comment_count``. ``contents`` is the list of accepted answer
            texts. Errors are logged, not raised; fields that could not be
            read keep their initial value (``None``, or ``""`` for
            ``topic_description``), so a dict is always returned.
        """
        # Every field starts with a safe local default. The previous
        # implementation kept these in module globals, which raised NameError
        # (or returned values from an earlier URL) when scraping failed
        # before they were assigned.
        contents_result = []
        result = {
            'title': None,
            'topic_description': "",
            'keywords': None,
            'url': url,
            'contents': contents_result,
            'date_created': None,
            'date_modified': None,
            'follower_count': None,
            'answer_count': None,
            'comment_count': None,
        }
        _tab = None
        try:
            _tab = self.browser.new_tab()
            _tab.get(url)
            # Repeatedly scroll to the bottom so that more answers get
            # loaded, then move back up by 100 before the next round.
            for _ in range(self.SCROLL_ROUNDS):
                _tab.wait.ele_displayed('.List-item')
                _tab.wait(3)
                _tab.scroll.to_bottom()
                _tab.wait(1)
                _tab.scroll.up(100)

            # Question metadata is read from the first nine <meta> tags
            # under the .QuestionPage element.
            question_page = _tab.ele('.QuestionPage')
            metas = question_page.eles('tag:meta')[0:9]
            for key, itemprop in self._META_FIELDS:
                result[key] = get_content_from_meta(metas, itemprop)

            try:
                # Click the "more" button, when present, before reading the
                # question description.
                more_button = question_page.ele('.^Button QuestionRichText-more')
                if more_button:
                    more_button.click()
                result['topic_description'] = question_page.ele('.^QuestionRichText').text
            except ElementNotFoundError:
                logger.error("元素缺失不存在topic_description")

            content_items = _tab.ele('.Question-mainColumn').eles('.List-item')
            total_characters = 0
            for item in content_items:
                try:
                    content = item.ele('.RichContent-inner').text
                    content_len = len(content)
                    print(content_len)
                    # Skip answers outside the accepted length window.
                    if content_len > self.MAX_ANSWER_CHARS or content_len < self.MIN_ANSWER_CHARS:
                        logger.error(f"skip本条内容内容长度{content_len}")
                        continue
                    # The cap is checked before adding, so the final total
                    # may exceed it by one answer.
                    if total_characters > self.MAX_TOTAL_CHARS:
                        logger.error(f"contents_result长度超过{self.MAX_TOTAL_CHARS}跳出循环")
                        break
                    total_characters += content_len
                    contents_result.append(content)
                    logger.info(f"contents_result长度{len(contents_result)}")
                except ElementNotFoundError as e:
                    logger.error(f"元素缺失:{str(e)}")
                except ValueError as e:
                    logger.error(f"热度值转换失败:{str(e)}")
        except ElementNotFoundError as e:
            logger.error(f"热榜容器元素未找到:{str(e)}")
        except Exception as e:
            logger.error(f"获取热榜数据异常:{str(e)}")
        finally:
            # Always release the tab, even when scraping failed midway.
            if _tab is not None:
                _tab.close()
        return result
if __name__ == '__main__':
    # Manual smoke test: scrape one hard-coded question page and dump the
    # outcome to stdout.
    logger.info('知乎采集测试')
    scraper = ZhihuHot()
    scraped = scraper.get_content('https://www.zhihu.com/question/14351228309')
    # get_content returns a dict, so this prints its number of keys.
    print(len(scraped))
    print(scraped)
    # To exercise the hot-list scraper instead, call
    # scraper.get_topic_url_list() and print the returned list.
    logger.info('测试完成')