from database.tvideoscript.video_script import (
    video_script_not_exists,
    VideoScript,
    create_video_script,
)
from seek.zhihu_com.zhihu_hot import ZhihuHot
from task.manager_task import execute_task


def spider_task(max_topics=10):
    """Crawl Zhihu hot-list topics and store each one as a ``VideoScript``.

    Steps:
      1. Fetch the hot-list topic URLs from ``ZhihuHot``.
      2. Pass them through ``video_script_not_exists`` to drop topics that
         are already stored in the database.
      3. Keep only the first ``max_topics`` remaining URLs.
      4. Fetch the content of each topic; topics with no content are skipped.
      5. Persist every topic that has content via ``create_video_script``.

    Args:
        max_topics: Maximum number of new topics to process in one run.
            Defaults to 10, the limit that was previously hard-coded.
    """
    zhihu_hot = ZhihuHot()

    # 1. Fetch the hot-list topic URLs.
    hot_topic_url_list = zhihu_hot.get_topic_url_list()

    # 2. Filter out topics that already exist in the database.
    hot_topic_url_list = video_script_not_exists(hot_topic_url_list)

    # 3. Keep only the first `max_topics` topics.
    hot_topic_url_list = hot_topic_url_list[:max_topics]

    # 4. Fetch the content of every remaining topic.
    for hot_topic_url in hot_topic_url_list:
        print(hot_topic_url)
        content = zhihu_hot.get_content(hot_topic_url)
        print(content)

        # Skip the topic when nothing usable came back. Checking `content`
        # itself first keeps one failed fetch (a None/empty result) from
        # raising TypeError and aborting the remaining topics.
        # NOTE(review): assumes `get_content` returns a dict-like object or
        # a falsy value on failure - confirm against ZhihuHot.
        if not content or not content['contents']:
            print(f'skip {hot_topic_url}, no fitch content')
            continue

        # 5. Save the content to the database.
        video_script = VideoScript(
            title=content['title'],
            keywords=content['keywords'],
            description=content['topic_description'],
            content=content['contents'],
            url=content['url'],
        )
        create_video_script(video_script)


if __name__ == '__main__':
    execute_task(spider_task)