36 lines
1.4 KiB
Python
36 lines
1.4 KiB
Python
from database.tvideoscript.video_script import video_script_not_exists, VideoScript, create_video_script
|
|
from seek.zhihu_com.zhihu_hot import ZhihuHot
|
|
from task.manager_task import execute_task
|
|
|
|
|
|
def spider_task(max_topics=10):
    """Fetch Zhihu hot-list topics and save each new one as a video script.

    The task runs these steps in order:

    1. Get the hot-list topic URLs from ``ZhihuHot``.
    2. Pass them through ``video_script_not_exists`` to drop topics that
       are already stored.
    3. Keep at most ``max_topics`` of the remaining URLs.
    4. Fetch the content of each remaining topic.
    5. Save every topic that has non-empty contents with
       ``create_video_script``; topics without contents are skipped.

    Args:
        max_topics: Maximum number of topics processed in one run.
            Defaults to 10, the limit that used to be hard-coded. Expected
            to be a non-negative integer, since it is used as a slice bound.
    """
    zhihu_hot = ZhihuHot()

    # 1. Get the hot-list topic URLs.
    hot_topic_url_list = zhihu_hot.get_topic_url_list()

    # 2. Filter out topics that already exist in the database.
    hot_topic_url_list = video_script_not_exists(hot_topic_url_list)

    # 3. Keep only the first `max_topics` topics.
    hot_topic_url_list = hot_topic_url_list[:max_topics]

    # 4. Fetch the content of each topic in turn.
    for hot_topic_url in hot_topic_url_list:
        print(hot_topic_url)
        content = zhihu_hot.get_content(hot_topic_url)
        print(content)

        # Nothing to store when the topic came back without contents
        # (None or empty).
        if not content['contents']:
            print(f'skip {hot_topic_url}, no fitch content')
            continue

        # 5. Save the content to the database.
        video_script = VideoScript(
            title=content['title'],
            keywords=content['keywords'],
            description=content['topic_description'],
            content=content['contents'],
            url=content['url'],
        )
        create_video_script(video_script)
|
|
|
|
|
|
# Script entry point: when this file is run directly, hand the spider task
# to the task manager's `execute_task`.
if __name__ == "__main__":
    execute_task(spider_task)