"""Collector for the Zhihu hot list (https://www.zhihu.com/hot)."""
import datetime
import re

from DrissionPage.errors import ElementNotFoundError

from database.database import get_session
from database.tinformationsource.model import TInformationSource
from database.tnews.model import TNews
from log.log_manager import logger
from seek.seek_base import SeekBase

# Heat figure followed by the "万" (ten-thousand) unit, e.g. "1234 万".
# An optional decimal part is accepted so that "1.5 万" is captured whole
# instead of being truncated to "5 万". Compiled once rather than per item.
_HEAT_PATTERN = re.compile(r'(\d+(?:\.\d+)?\s*万)')


class ZhihuHot(SeekBase):
    """Scrape the Zhihu hot list into ``TNews`` records.

    Relies on ``self.tab`` and ``self.information_source``, which are not
    defined in this file (presumably provided by ``SeekBase`` -- confirm).
    """

    def get_news(self):
        """Fetch the Zhihu hot list.

        Returns:
            list: one ``TNews`` per hot-list entry that was parsed. Entries
            that fail to parse are logged and skipped. Any error while
            loading the page or locating the list container is logged, and
            whatever was collected up to that point is returned.
        """
        news_result = []
        try:
            # Open the hot-list page and wait until entries are displayed.
            self.tab.get('https://www.zhihu.com/hot')
            self.tab.wait.ele_displayed('.HotItem')

            hot_items = self.tab.ele('.HotList-list').eles('.HotItem')
            for item in hot_items:
                try:
                    news = TNews()

                    # Look the anchor up once; title and URL both come
                    # from it.
                    link = item.ele('tag:a')
                    raw_title = link.attr('title')
                    if raw_title is None:
                        # Skip just this entry instead of letting an
                        # AttributeError abort the remaining entries.
                        logger.error("元素缺失:标题属性为空")
                        continue
                    # Keep the title verbatim: str.title() would re-case
                    # Latin words in the headline ("iPhone" -> "Iphone").
                    news.title = raw_title
                    news.url = link.link

                    # Raw metrics text; the locator string is kept as
                    # written (NOTE(review): confirm it matches the
                    # current page markup).
                    heat_value = item(
                        '.HotItem-metrics HotItem-metrics--bottom'
                    ).text
                    logger.info(f"热度值:{heat_value}")

                    # Keep only the "<number>万" part, without spaces.
                    match = _HEAT_PATTERN.search(heat_value)
                    if match:
                        news.heat = match.group(1).replace(' ', '')
                    else:
                        # The entry is still recorded, just without a
                        # heat value.
                        logger.error(f"无法提取热度值:{heat_value}")
                    logger.info(f"提取到的热度值:{news.heat}")

                    news.source = self.information_source.title
                    news.occurrence_date = datetime.datetime.now()
                    news_result.append(news)
                except ElementNotFoundError as e:
                    logger.error(f"元素缺失:{str(e)}")
                except ValueError as e:
                    logger.error(f"热度值转换失败:{str(e)}")
        except ElementNotFoundError as e:
            logger.error(f"热榜容器元素未找到:{str(e)}")
        except Exception as e:
            # Top-level boundary for the scrape: log and return what we have.
            logger.error(f"获取热榜数据异常:{str(e)}")
        return news_result


def get_news(information_source: TInformationSource) -> list:
    """Public entry point: collect hot-list news for *information_source*.

    Args:
        information_source: source record passed to ``ZhihuHot``.

    Returns:
        list: the ``TNews`` objects returned by ``ZhihuHot.get_news``.
    """
    zhihu = ZhihuHot(information_source)
    return zhihu.get_news()


def news_task(information_source: TInformationSource):
    """Task entry point: collect the news and log every collected item.

    Args:
        information_source: source record forwarded to ``get_news``.
    """
    with get_session() as db:
        news_list = get_news(information_source)
        # TODO: persistence is currently disabled, so ``db`` is unused:
        #   create_news_list_if_url_not_exists(db, news_list)
        for news in news_list:
            logger.info(f"采集到新闻:{news}")


if __name__ == '__main__':
    # Manual smoke test.
    logger.info('知乎热榜采集测试')
    information_source_ = TInformationSource()
    information_source_.is_static = False  # Zhihu needs browser rendering
    information_source_.url = 'https://www.zhihu.com/hot'
    information_source_.title = '热榜_知乎'

    # Run the collection task.
    news_task(information_source_)
    logger.info('测试完成')