import peter
This commit is contained in:
85
seek/zhihu_com/hot.py
Normal file
85
seek/zhihu_com/hot.py
Normal file
@ -0,0 +1,85 @@
|
||||
import datetime
|
||||
import re # 添加正则表达式库的导入
|
||||
|
||||
from DrissionPage.errors import ElementNotFoundError
|
||||
|
||||
from database.database import get_session
|
||||
from database.tinformationsource.model import TInformationSource
|
||||
from database.tnews.model import TNews
|
||||
from log.log_manager import logger
|
||||
from seek.seek_base import SeekBase
|
||||
|
||||
|
||||
class ZhihuHot(SeekBase):
    """Scraper that collects the entries of the Zhihu hot list page."""

    # A number (integer or decimal) followed by the unit "万", optionally
    # separated by whitespace. Compiled once instead of on every item.
    _HEAT_PATTERN = re.compile(r'(\d+(?:\.\d+)?\s*万)')

    @classmethod
    def _parse_heat(cls, heat_text):
        """Return the heat figure found in *heat_text*, or ``None``.

        The figure is the first "<number>万" occurrence with all whitespace
        between the number and the unit removed (e.g. ``"1234 万"`` becomes
        ``"1234万"``).
        """
        match = cls._HEAT_PATTERN.search(heat_text or '')
        if match is None:
            return None
        # ``\s`` may match more than plain spaces, so strip every whitespace
        # character rather than only ' '.
        return ''.join(match.group(1).split())

    def get_news(self):
        """Fetch the Zhihu hot list and convert each entry into a ``TNews``.

        Failures are logged rather than raised: an entry whose elements are
        missing is skipped, and any other error stops the collection and
        returns whatever was gathered up to that point.

        Returns:
            list: the ``TNews`` objects built from the hot-list entries
            (possibly empty).
        """
        news_result = []
        try:
            # Open the hot-list page and wait until entries are rendered.
            self.tab.get('https://www.zhihu.com/hot')
            self.tab.wait.ele_displayed('.HotItem')

            # Collect every hot-list entry.
            hot_items = self.tab.ele('.HotList-list').eles('.HotItem')

            for item in hot_items:
                try:
                    # Title and URL both come from the entry's first link;
                    # look it up once.
                    link = item.ele('tag:a')
                    title = link.attr('title')
                    if not title:
                        # Without this guard a missing attribute would raise
                        # AttributeError and abort the whole collection.
                        logger.error("标题缺失,跳过该条目")
                        continue

                    news = TNews()
                    # Keep the headline verbatim: str.title() would rewrite
                    # the casing of Latin words embedded in the title.
                    news.title = title
                    news.url = link.link

                    # Heat text of the entry; only the "<number>万" part is kept.
                    heat_value = item('.HotItem-metrics HotItem-metrics--bottom').text
                    logger.info(f"热度值:{heat_value}")

                    heat = self._parse_heat(heat_value)
                    if heat is None:
                        # The entry is still reported, just without a heat value.
                        logger.error(f"无法提取热度值:{heat_value}")
                    else:
                        news.heat = heat
                    logger.info(f"提取到的热度值:{news.heat}")

                    news.source = self.information_source.title
                    news.occurrence_date = datetime.datetime.now()
                    news_result.append(news)
                except ElementNotFoundError as e:
                    logger.error(f"元素缺失:{str(e)}")
                except ValueError as e:
                    logger.error(f"热度值转换失败:{str(e)}")

        except ElementNotFoundError as e:
            logger.error(f"热榜容器元素未找到:{str(e)}")
        except Exception as e:
            logger.error(f"获取热榜数据异常:{str(e)}")

        return news_result
|
||||
|
||||
def get_news(information_source: TInformationSource) -> list:
    """Public entry point for collecting news from the given source.

    Builds a ``ZhihuHot`` scraper for *information_source* and returns the
    result of its ``get_news`` method.
    """
    return ZhihuHot(information_source).get_news()
|
||||
|
||||
def news_task(information_source: TInformationSource):
    """Task entry point: collect news for *information_source* and log it.

    The collection runs inside a ``get_session()`` context. The session is
    not otherwise used here because the persistence call is disabled.
    """
    with get_session() as db:
        # NOTE: persistence is currently disabled; the call was
        # create_news_list_if_url_not_exists(db, news_list).
        for news in get_news(information_source):
            logger.info(f"采集到新闻:{news}")
|
||||
|
||||
if __name__ == '__main__':
    # Manual test run: build an information source by hand and execute the
    # collection task once.
    logger.info('知乎热榜采集测试')

    information_source_ = TInformationSource()
    information_source_.title = '热榜_知乎'
    information_source_.url = 'https://www.zhihu.com/hot'
    information_source_.is_static = False  # Zhihu needs browser rendering

    # Run the collection task.
    news_task(information_source_)

    logger.info('测试完成')
|
||||
|
||||
Reference in New Issue
Block a user