# Source listing metadata: 86 lines, 3.1 KiB, Python
import datetime
|
|
import re # 添加正则表达式库的导入
|
|
|
|
from DrissionPage.errors import ElementNotFoundError
|
|
|
|
from database.database import get_session
|
|
from database.tinformationsource.model import TInformationSource
|
|
from database.tnews.model import TNews
|
|
from log.log_manager import logger
|
|
from seek.seek_base import SeekBase
|
|
|
|
|
|
class ZhihuHot(SeekBase):
    """Collector for the Zhihu hot list (https://www.zhihu.com/hot).

    Uses ``self.tab`` to drive the page and ``self.information_source`` for
    the source title. Both are presumably set up by ``SeekBase`` (not
    visible in this file) -- confirm against the base class.
    """

    # Heat figure as shown on the page, e.g. "1234 万" or "1.5万". The
    # optional fractional part keeps "1.5 万" from being cut down to "5 万".
    _HEAT_PATTERN = re.compile(r'(\d+(?:\.\d+)?\s*万)')

    @classmethod
    def _extract_heat(cls, heat_text):
        """Return the heat figure in *heat_text* without whitespace, or None.

        Args:
            heat_text: Raw text of the metrics element, e.g. "1234 万热度".

        Returns:
            The matched figure including the "万" unit (e.g. "1234万"), or
            ``None`` when no such figure is present.
        """
        match = cls._HEAT_PATTERN.search(heat_text or '')
        if match is None:
            return None
        # ``\s*`` can match any whitespace, so strip all of it rather than
        # only ASCII spaces.
        return ''.join(match.group(1).split())

    def get_news(self):
        """Fetch the Zhihu hot list and return it as a list of ``TNews``.

        Each entry gets ``title``, ``url``, ``heat`` (when a figure can be
        extracted), ``source`` and ``occurrence_date``. Entries whose
        elements or title are missing are logged and skipped. Any error is
        logged rather than raised, so the result may be partial or empty.

        Returns:
            list: The ``TNews`` objects collected before any fatal error.
        """
        news_result = []
        try:
            # Open the hot-list page and wait for the entries to render.
            self.tab.get('https://www.zhihu.com/hot')
            self.tab.wait.ele_displayed('.HotItem')

            hot_items = self.tab.ele('.HotList-list').eles('.HotItem')

            for item in hot_items:
                try:
                    news = TNews()

                    # Title and link live on the same anchor; look it up once.
                    link = item.ele('tag:a')
                    title = link.attr('title')
                    if not title:
                        # A missing title used to raise AttributeError and
                        # abort the whole crawl; skip just this entry.
                        logger.error(f"标题缺失,跳过该条目:{link.link}")
                        continue
                    # Keep the title verbatim: str.title() would re-capitalise
                    # Latin words ("iPhone" -> "Iphone").
                    news.title = title
                    news.url = link.link

                    heat_value = item('.HotItem-metrics HotItem-metrics--bottom').text
                    logger.info(f"热度值:{heat_value}")
                    heat = self._extract_heat(heat_value)
                    if heat is not None:
                        news.heat = heat
                        logger.info(f"提取到的热度值:{news.heat}")
                    else:
                        # The entry is still kept, just without a heat value.
                        logger.error(f"无法提取热度值:{heat_value}")

                    news.source = self.information_source.title
                    news.occurrence_date = datetime.datetime.now()
                    news_result.append(news)
                except ElementNotFoundError as e:
                    logger.error(f"元素缺失:{str(e)}")
                except ValueError as e:
                    logger.error(f"热度值转换失败:{str(e)}")

        except ElementNotFoundError as e:
            logger.error(f"热榜容器元素未找到:{str(e)}")
        except Exception as e:
            # Top-level boundary: never let a scraping failure propagate.
            logger.error(f"获取热榜数据异常:{str(e)}")

        return news_result
|
|
|
|
def get_news(information_source: TInformationSource) -> list:
    """Public entry point for collecting news from *information_source*.

    Builds a ``ZhihuHot`` collector for the given source and returns the
    result of its ``get_news()`` method.
    """
    return ZhihuHot(information_source).get_news()
|
|
|
|
def news_task(information_source: TInformationSource):
    """Task entry point: collect news for *information_source* and log it.

    A session from ``get_session()`` is held open while the collection
    runs, but nothing is written through it: the persistence call below is
    commented out.
    """
    with get_session() as db:
        collected = get_news(information_source)
        # Persistence is currently disabled:
        # create_news_list_if_url_not_exists(db, collected)
        for entry in collected:
            logger.info(f"采集到新闻:{entry}")
|
|
|
|
if __name__ == '__main__':
    # Manual smoke test: build a throwaway source record and run one task.
    logger.info('知乎热榜采集测试')

    demo_source = TInformationSource()
    demo_source.title = '热榜_知乎'
    demo_source.url = 'https://www.zhihu.com/hot'
    demo_source.is_static = False  # Zhihu needs browser rendering

    # Run the collection task once and report completion.
    news_task(demo_source)
    logger.info('测试完成')
|
|
|