import peter

This commit is contained in:
konjacpotato
2025-11-12 20:42:16 +08:00
commit 8c1a740f0b
147 changed files with 2763 additions and 0 deletions

156
seek/zhihu_com/zhihu_hot.py Normal file
View File

@ -0,0 +1,156 @@
from DrissionPage import Chromium
from DrissionPage import ChromiumOptions
from DrissionPage.errors import ElementNotFoundError
from log.log_manager import logger
def get_content_from_meta(metas, itemprop):
    """Look up the ``content`` of a ``<meta>`` element by its ``itemprop``.

    Args:
        metas: Iterable of element objects exposing ``attr(name)``.
        itemprop: The ``itemprop`` attribute value to match exactly.

    Returns:
        The ``content`` attribute of the last element in *metas* whose
        ``itemprop`` equals *itemprop*, or ``None`` when nothing matches.
    """
    # Collect every match in document order; the final one wins, mirroring
    # a loop that keeps overwriting its result.
    matched = [
        tag.attr('content')
        for tag in metas
        if tag.attr('itemprop') == itemprop
    ]
    return matched[-1] if matched else None
class ZhihuHot:
    """Scrape the Zhihu hot list and individual question pages.

    Every request opens a new tab on one shared DrissionPage ``Chromium``
    browser object and closes that tab again when the request finishes.
    """

    # Number of wait/scroll rounds performed to pull in lazily loaded answers.
    SCROLL_ROUNDS = 10
    # Answers whose text length falls outside these bounds are skipped.
    MIN_ANSWER_CHARS = 100
    MAX_ANSWER_CHARS = 1000
    # Collection stops once the accumulated answer text exceeds this length.
    MAX_TOTAL_CHARS = 5000

    # (result key, <meta itemprop>) pairs read from the question page.
    _META_ITEMPROPS = (
        ('title', 'name'),
        ('answer_count', 'answerCount'),
        ('comment_count', 'commentCount'),
        ('keywords', 'keywords'),
        ('date_created', 'dateCreated'),
        ('date_modified', 'dateModified'),
        ('follower_count', 'zhihu:followerCount'),
    )

    def __init__(self):
        """Instantiate the browser object shared by all scraping methods."""
        self.browser = Chromium()

    def get_topic_url_list(self) -> list:
        """Return the links of the entries currently on the Zhihu hot list.

        Returns:
            A list of URLs, one per hot-list entry that has a link. Entries
            without an anchor or without a link are left out. On any failure
            the URLs gathered so far (possibly none) are returned; errors are
            logged, never raised.
        """
        topic_urls = []
        tab = None
        try:
            tab = self.browser.new_tab()
            # Open the hot list and wait until at least one entry is shown.
            tab.get('https://www.zhihu.com/hot')
            tab.wait.ele_displayed('.HotItem')
            hot_items = tab.ele('.HotList-list').eles('.HotItem')
            for item in hot_items:
                try:
                    url = item.ele('tag:a').link
                except ElementNotFoundError as e:
                    logger.error(f"元素缺失:{e}")
                    continue
                # An anchor without a link yields a falsy value; passing that
                # on would only make a later get_content() call fail.
                if url:
                    topic_urls.append(url)
        except ElementNotFoundError as e:
            logger.error(f"热榜容器元素未找到:{e}")
        except Exception as e:
            logger.error(f"获取热榜数据异常:{e}")
        finally:
            if tab is not None:
                tab.close()
        return topic_urls

    def get_content(self, url):
        """Scrape one question page: its metadata, description and answers.

        Args:
            url: Address of the question page to open.

        Returns:
            A dict with the keys ``title``, ``topic_description``,
            ``keywords``, ``url``, ``contents``, ``date_created``,
            ``date_modified``, ``follower_count``, ``answer_count`` and
            ``comment_count``. Fields that could not be read keep their
            defaults (``None``, ``""`` for the description, ``[]`` for the
            answers). Errors are logged, never raised.
        """
        # Per-call defaults: a failed page load must neither raise on return
        # nor leak values scraped from a previously visited question.
        result = {
            'title': None,
            'topic_description': "",
            'keywords': None,
            'url': url,
            'contents': [],
            'date_created': None,
            'date_modified': None,
            'follower_count': None,
            'answer_count': None,
            'comment_count': None,
        }
        tab = None
        try:
            tab = self.browser.new_tab()
            tab.get(url)
            self._scroll_to_load_answers(tab)
            question_page = tab.ele('.QuestionPage')
            result.update(self._read_question_meta(question_page))
            result['topic_description'] = self._read_topic_description(question_page)
            # Appends into the result list directly so that answers gathered
            # before an unexpected failure are still returned.
            self._collect_answers(tab, result['contents'])
        except ElementNotFoundError as e:
            logger.error(f"话题页面元素未找到:{e}")
        except Exception as e:
            logger.error(f"获取话题内容异常:{e}")
        finally:
            if tab is not None:
                tab.close()
        return result

    def _scroll_to_load_answers(self, tab):
        """Scroll the page repeatedly so that more answers get loaded."""
        for _ in range(self.SCROLL_ROUNDS):
            tab.wait.ele_displayed('.List-item')
            tab.wait(3)
            tab.scroll.to_bottom()
            tab.wait(1)
            # NOTE(review): presumably the small scroll back up lets the next
            # to_bottom() trigger lazy loading again - confirm.
            tab.scroll.up(100)

    def _read_question_meta(self, question_page):
        """Read the question metadata from the page's ``<meta>`` elements."""
        # Only the first 9 <meta> elements under .QuestionPage are inspected.
        metas = question_page.eles('tag:meta')[0:9]
        return {
            key: get_content_from_meta(metas, itemprop)
            for key, itemprop in self._META_ITEMPROPS
        }

    def _read_topic_description(self, question_page):
        """Return the question description text, or ``""`` if it is absent."""
        try:
            # Click the "show more" button first, when the page has one.
            unfold_button = question_page.ele('.^Button QuestionRichText-more')
            if unfold_button:
                unfold_button.click()
            return question_page.ele('.^QuestionRichText').text
        except ElementNotFoundError:
            logger.error("元素缺失不存在topic_description")
            return ""

    def _collect_answers(self, tab, contents):
        """Append accepted answer texts from the page to *contents* in place."""
        total_characters = 0
        for item in tab.ele('.Question-mainColumn').eles('.List-item'):
            try:
                content = item.ele('.RichContent-inner').text
            except ElementNotFoundError as e:
                logger.error(f"元素缺失:{e}")
                continue
            content_len = len(content)
            if content_len > self.MAX_ANSWER_CHARS or content_len < self.MIN_ANSWER_CHARS:
                logger.error(f"skip本条内容内容长度{content_len}")
                continue
            # The budget is tested before adding, so the final total may
            # overshoot the limit by one accepted answer.
            if total_characters > self.MAX_TOTAL_CHARS:
                logger.error(f"contents_result长度超过{self.MAX_TOTAL_CHARS}跳出循环")
                break
            total_characters += content_len
            contents.append(content)
            logger.info(f"contents_result长度{len(contents)}")
if __name__ == '__main__':
    # Manual smoke test: scrape one fixed question page and dump the result.
    # To exercise the hot list instead, call scraper.get_topic_url_list().
    logger.info('知乎采集测试')
    scraper = ZhihuHot()
    sample_url = 'https://www.zhihu.com/question/14351228309'
    question = scraper.get_content(sample_url)
    print(len(question))
    print(question)
    logger.info('测试完成')