Files
peter/seek/zhihu_com/hot.py
konjacpotato 8c1a740f0b import peter
2025-11-12 20:42:16 +08:00

86 lines
3.1 KiB
Python

import datetime
import re # 添加正则表达式库的导入
from DrissionPage.errors import ElementNotFoundError
from database.database import get_session
from database.tinformationsource.model import TInformationSource
from database.tnews.model import TNews
from log.log_manager import logger
from seek.seek_base import SeekBase
class ZhihuHot(SeekBase):
    """Scraper for the Zhihu hot list page (https://www.zhihu.com/hot).

    The methods read ``self.tab`` (used to load the page and locate elements)
    and ``self.information_source``; neither is set in this class, so they
    are presumably provided by ``SeekBase`` -- confirm against the base class.
    """

    # Heat figure such as "1234 万" or "1.5 万". The optional fraction keeps a
    # decimal value from being reduced to its fractional digits, and capturing
    # only the number lets any kind of whitespace before the unit be dropped.
    _HEAT_PATTERN = re.compile(r'(\d+(?:\.\d+)?)\s*万')

    def get_news(self):
        """Fetch the hot list and convert every entry into a ``TNews``.

        Returns:
            list: the ``TNews`` objects built from the entries that could be
            parsed. Entries that fail are logged and skipped. If the page
            itself cannot be processed, the error is logged and the entries
            collected so far (possibly none) are returned.
        """
        news_result = []
        try:
            # Load the hot-list page and wait for its entries to be displayed.
            self.tab.get('https://www.zhihu.com/hot')
            self.tab.wait.ele_displayed('.HotItem')
            hot_items = self.tab.ele('.HotList-list').eles('.HotItem')
            for item in hot_items:
                try:
                    news = self._parse_hot_item(item)
                    if news is not None:
                        news_result.append(news)
                except ElementNotFoundError as e:
                    logger.error(f"元素缺失:{str(e)}")
                except ValueError as e:
                    logger.error(f"热度值转换失败:{str(e)}")
        except ElementNotFoundError as e:
            logger.error(f"热榜容器元素未找到:{str(e)}")
        except Exception as e:
            logger.error(f"获取热榜数据异常:{str(e)}")
        return news_result

    def _parse_hot_item(self, item):
        """Build a ``TNews`` from one hot-list entry element.

        Args:
            item: a ``.HotItem`` element taken from the hot-list container.

        Returns:
            The populated ``TNews``, or ``None`` when the entry's link has no
            ``title`` attribute (the entry is logged and skipped so that one
            malformed entry does not abort the whole scrape).
        """
        # Look the anchor up once; it carries both the title and the URL.
        link = item.ele('tag:a')
        title = link.attr('title')
        if title is None:
            logger.error("热榜条目缺少标题")
            return None

        news = TNews()
        # Keep the headline exactly as published: str.title() would re-case
        # Latin words (e.g. "iPhone" -> "Iphone").
        news.title = title
        news.url = link.link

        # The metrics text holds the heat figure plus surrounding label text.
        heat_value = item('.HotItem-metrics HotItem-metrics--bottom').text
        logger.info(f"热度值:{heat_value}")
        match = self._HEAT_PATTERN.search(heat_value)
        if match:
            news.heat = f"{match.group(1)}万"
        else:
            # The entry is still kept; only its heat stays unset.
            logger.error(f"无法提取热度值:{heat_value}")
        logger.info(f"提取到的热度值:{news.heat}")

        news.source = self.information_source.title
        news.occurrence_date = datetime.datetime.now()
        return news
def get_news(information_source: TInformationSource) -> list:
    """Module-level entry point for scraping the Zhihu hot list.

    Args:
        information_source: passed straight to the ``ZhihuHot`` constructor.

    Returns:
        list: whatever ``ZhihuHot.get_news()`` returns for that source.
    """
    return ZhihuHot(information_source).get_news()
def news_task(information_source: TInformationSource):
    """Task entry point: scrape the source and log every collected item.

    The scrape runs inside a ``get_session()`` context, but the session is
    not used at the moment because the persistence call is commented out.

    Args:
        information_source: forwarded unchanged to ``get_news``.
    """
    with get_session() as db:
        collected = get_news(information_source)
        # Persistence is currently disabled:
        # create_news_list_if_url_not_exists(db, collected)
        for news in collected:
            logger.info(f"采集到新闻:{news}")
if __name__ == '__main__':
    # Manual smoke run: build a throwaway source and execute one collection.
    logger.info('知乎热榜采集测试')
    demo_source = TInformationSource()
    demo_source.title = '热榜_知乎'
    demo_source.url = 'https://www.zhihu.com/hot'
    demo_source.is_static = False  # Zhihu needs browser rendering
    # Run the collection task against the demo source.
    news_task(demo_source)
    logger.info('测试完成')