Files
peter/seek/zhihu_com/zhihu.py
konjacpotato 8c1a740f0b import peter
2025-11-12 20:42:16 +08:00

174 lines
7.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
from DrissionPage import Chromium
from DrissionPage import ChromiumOptions
from DrissionPage.errors import ElementNotFoundError
from database.database import get_session
from database.thotcontent.crud import create_contents_top3_if_url_not_exists
from database.thotcontent.model import THotContent
from database.thottopic.crud import create_topics_if_url_not_exists, update_hot_topic
from database.thottopic.model import THotTopic
from log.log_manager import logger
def get_content_from_meta(metas, itemprop):
    """Return the ``content`` attribute of the meta element matching *itemprop*.

    Args:
        metas: Iterable of elements exposing ``attr(name)``.
        itemprop: Value the element's ``itemprop`` attribute must equal.

    Returns:
        The ``content`` attribute of the LAST matching element (the scan does
        not stop at the first hit), or ``None`` when nothing matches.
    """
    content = None
    for meta in metas:
        if meta.attr('itemprop') == itemprop:
            content = meta.attr('content')
    return content
class Zhihu:
    """Scraper for Zhihu home-page topics and the answers under each question.

    Drives a DrissionPage ``Chromium`` browser. Every public method opens its
    own tab, stores it on ``self.tab`` while working, and closes it before
    returning.
    """

    # Canonical question URL; anything after the numeric id is dropped.
    _QUESTION_URL_RE = re.compile(r'^https://www\.zhihu\.com/question/\d+')
    # Number optionally followed by the "万" (x10,000) unit.
    _UPVOTE_RE = re.compile(r'(\d+\.?\d*)\s*(万)?')
    # Thousands-grouped ("1,234") or plain ("1234") integer.
    _COMMENT_RE = re.compile(r'\d{1,3}(?:,\d{3})+|\d+')

    # Locators for the per-answer action buttons.
    _UPVOTE_BUTTON = '.Button VoteButton VoteButton--up FEfUrdfMIKpQDJDqkjte'
    _COMMENT_BUTTON = '.Button ContentItem-action FEfUrdfMIKpQDJDqkjte Button--plain Button--withIcon Button--withLabel fEPKGkUK5jyc4fUuT0QP B46v1Ak6Gj5sL2JTS4PY RuuQ6TOh2cRzJr6WlyQp'

    def __init__(self):
        self.browser = Chromium()
        self.tab = None

    @classmethod
    def _parse_upvote_count(cls, text):
        """Convert an upvote button label into an integer count (0 if no number)."""
        match = cls._UPVOTE_RE.search(text or '')
        if not match:
            return 0
        number = float(match.group(1))
        if match.group(2):
            # round() rather than int(): float products such as 1.1 * 10000
            # can land just off the intended integer.
            return round(number * 10000)
        return int(number)

    @classmethod
    def _parse_comment_count(cls, text):
        """Convert a comment button label into an integer count (0 if no number)."""
        match = cls._COMMENT_RE.search(text or '')
        if not match:
            return 0
        return int(match.group(0).replace(',', ''))

    def _close_tab(self):
        """Close the current tab, if any, and clear the reference to it."""
        if self.tab:
            try:
                self.tab.close()
            finally:
                # Never keep a handle to a closed tab around for the next call.
                self.tab = None

    def get_topics(self):
        """Collect question topics from the Zhihu home page.

        Returns:
            A list of ``THotTopic`` objects with ``source``, ``topic`` and
            ``url`` set. Entries whose link is not a question URL are skipped.
            Errors are logged, and whatever was collected so far is returned.
        """
        topics_result = []
        try:
            self.tab = self.browser.new_tab()
            # Open the Zhihu home page.
            self.tab.get('https://www.zhihu.com')
            # Wait for the feed cards to be displayed.
            self.tab.wait.ele_displayed('.Card TopstoryItem TopstoryItem-isRecommend')
            # Fetch every feed card.
            hot_items = self.tab.ele('.Topstory-content').eles('.Card TopstoryItem TopstoryItem-isRecommend')
            for item in hot_items:
                try:
                    # Title and link both live on the anchor inside the <h2>.
                    anchor = item.ele('tag:h2').ele('tag:a')
                    title = anchor.text
                    # A missing href must skip this card, not abort the loop.
                    matched = self._QUESTION_URL_RE.match(anchor.link or '')
                    if not matched:
                        continue
                    topic = THotTopic()
                    topic.source = '知乎'
                    topic.topic = title
                    topic.url = matched.group(0)
                    topics_result.append(topic)
                except ElementNotFoundError as e:
                    logger.error(f"元素缺失:{str(e)}")
                except ValueError as e:
                    logger.error(f"热度值转换失败:{str(e)}")
        except ElementNotFoundError as e:
            logger.error(f"热榜容器元素未找到:{str(e)}")
        except Exception as e:
            logger.error(f"获取热榜数据异常:{str(e)}")
        finally:
            self._close_tab()
        return topics_result

    def get_content(self, topic: "THotTopic", db):
        """Scrape a question page: update the topic's metadata and collect answers.

        Args:
            topic: Topic whose ``url`` is opened; its count, keyword, date and
                description fields are filled in and passed to
                ``update_hot_topic``.
            db: Database session handed to ``update_hot_topic``.

        Returns:
            A list of ``THotContent`` objects, one per answer that could be
            parsed. Errors are logged, and whatever was collected so far is
            returned.
        """
        contents_result = []
        try:
            self.tab = self.browser.new_tab()
            # Open the question page.
            self.tab.get(topic.url)
            # NOTE(review): the flattened source does not show where this loop
            # ends; all five wait/scroll calls are assumed to belong to it.
            for _ in range(10):
                # Wait for answers to be displayed.
                self.tab.wait.ele_displayed('.List-item')
                self.tab.wait(3)
                # Scroll to the bottom, then nudge back up, to load more answers.
                self.tab.scroll.to_bottom()
                self.tab.wait(1)
                self.tab.scroll.up(100)
            # Question-level info is read from the page's <meta> tags.
            question_page = self.tab.ele('.QuestionPage')
            # Only the first nine <meta> tags under the page root are inspected.
            metas = question_page.eles('tag:meta')[0:9]
            answer_count = get_content_from_meta(metas, 'answerCount')
            comment_count = get_content_from_meta(metas, 'commentCount')
            keywords = get_content_from_meta(metas, 'keywords')
            date_created = get_content_from_meta(metas, 'dateCreated')
            date_modified = get_content_from_meta(metas, 'dateModified')
            follower_count = get_content_from_meta(metas, 'zhihu:followerCount')
            # A missing count is None here; int(None) raises TypeError, which
            # the generic handler below logs before giving up on this topic.
            topic.content_count = int(answer_count)
            topic.comment_count = int(comment_count)
            topic.follower_count = int(follower_count)
            topic.keywords = keywords
            topic.date_created = date_created
            topic.date_modified = date_modified
            try:
                topic.topic_description = question_page.ele('.RichText ztext css-ob6uua').text
            except ElementNotFoundError:
                # The description is optional; log and carry on.
                logger.error("元素缺失不存在topic_description")
            update_hot_topic(db, topic)
            # Fetch every answer entry.
            content_items = self.tab.ele('.Question-mainColumn').eles('.List-item')
            for item in content_items:
                try:
                    content = THotContent()
                    content.topic_id = topic.id
                    content.url = item.ele('.ContentItem-time').ele('tag:a').link
                    upvote_str = item.ele(self._UPVOTE_BUTTON).text
                    content.content_upvote_count = self._parse_upvote_count(upvote_str)
                    comment_str = item.ele(self._COMMENT_BUTTON).text
                    content.content_comment_count = self._parse_comment_count(comment_str)
                    content.content = item.ele('.RichContent-inner').text
                    contents_result.append(content)
                except ElementNotFoundError as e:
                    logger.error(f"元素缺失:{str(e)}")
                except ValueError as e:
                    logger.error(f"热度值转换失败:{str(e)}")
        except ElementNotFoundError as e:
            logger.error(f"热榜容器元素未找到:{str(e)}")
        except Exception as e:
            logger.error(f"获取热榜数据异常:{str(e)}")
        finally:
            self._close_tab()
        return contents_result
def get_topics() -> list:
    """Build a fresh ``Zhihu`` scraper and return the topics it collects."""
    return Zhihu().get_topics()
def gather_task():
    """Task entry point: scrape topics, store them, then scrape and store answers.

    Topics go through ``create_topics_if_url_not_exists``; answers are fetched
    only for the topics that call returns, and are then passed to
    ``create_contents_top3_if_url_not_exists``.
    """
    with get_session() as db:
        scraper = Zhihu()
        scraped_topics = scraper.get_topics()
        for topic in create_topics_if_url_not_exists(db, scraped_topics):
            logger.info(f"采集到话题:{topic}")
            answers = scraper.get_content(topic, db)
            create_contents_top3_if_url_not_exists(db, answers)
if __name__ == "__main__":
    # Manual smoke test: run one full collection pass, logging before and after.
    logger.info("知乎采集测试")
    gather_task()
    logger.info("测试完成")