import peter
This commit is contained in:
0
task/__init__.py
Normal file
0
task/__init__.py
Normal file
BIN
task/__pycache__/__init__.cpython-312.pyc
Normal file
BIN
task/__pycache__/__init__.cpython-312.pyc
Normal file
Binary file not shown.
BIN
task/__pycache__/manager_task.cpython-312.pyc
Normal file
BIN
task/__pycache__/manager_task.cpython-312.pyc
Normal file
Binary file not shown.
0
task/content/__init__.py
Normal file
0
task/content/__init__.py
Normal file
BIN
task/content/__pycache__/__init__.cpython-312.pyc
Normal file
BIN
task/content/__pycache__/__init__.cpython-312.pyc
Normal file
Binary file not shown.
BIN
task/content/__pycache__/content_spider_task.cpython-312.pyc
Normal file
BIN
task/content/__pycache__/content_spider_task.cpython-312.pyc
Normal file
Binary file not shown.
38
task/content/content_spider_task.py
Normal file
38
task/content/content_spider_task.py
Normal file
@ -0,0 +1,38 @@
|
||||
import importlib
|
||||
|
||||
from database.database import get_session
|
||||
from database.tinformationsource.curd import get_active_information_sources
|
||||
from database.tnews.crud import get_news_need_content
|
||||
from log.log_manager import logger
|
||||
from task.manager_task import execute_task
|
||||
|
||||
|
||||
def content_spider_task():
|
||||
with get_session() as db:
|
||||
# 1. 获取信息源数据
|
||||
information_sources = get_active_information_sources(db)
|
||||
# 2. 获取需要获取内容的新闻数据
|
||||
news_list = get_news_need_content(db)
|
||||
# 3. 遍历新闻数据
|
||||
for news in news_list:
|
||||
for information_source in information_sources:
|
||||
if information_source.title != news.source:
|
||||
continue
|
||||
if information_source.module is None or information_source.method is None:
|
||||
logger.error(f"{information_source.title} module or method is None")
|
||||
continue
|
||||
news.is_static = information_source.is_static
|
||||
# 动态导入模块和函数
|
||||
# 把模块路径最后一部分换成content
|
||||
module_path = information_source.module.rsplit('.', 1)[0] + '.content'
|
||||
module = importlib.import_module(module_path)
|
||||
task_function = getattr(module, 'content_task')
|
||||
try:
|
||||
task_function(news)
|
||||
except Exception as e:
|
||||
logger.error(f"{information_source.title} task error: {e}")
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
execute_task(content_spider_task)
|
||||
0
task/default/__init__.py
Normal file
0
task/default/__init__.py
Normal file
BIN
task/default/__pycache__/__init__.cpython-312.pyc
Normal file
BIN
task/default/__pycache__/__init__.cpython-312.pyc
Normal file
Binary file not shown.
BIN
task/default/__pycache__/main_spider_task.cpython-312.pyc
Normal file
BIN
task/default/__pycache__/main_spider_task.cpython-312.pyc
Normal file
Binary file not shown.
26
task/default/main_spider_task.py
Normal file
26
task/default/main_spider_task.py
Normal file
@ -0,0 +1,26 @@
|
||||
import importlib
|
||||
|
||||
from database.database import get_session
|
||||
from database.tinformationsource.curd import get_active_information_sources
|
||||
from log.log_manager import logger
|
||||
from task.manager_task import execute_task
|
||||
|
||||
|
||||
def main_spider_task():
|
||||
with get_session() as db:
|
||||
information_sources = get_active_information_sources(db)
|
||||
for information_source in information_sources:
|
||||
if information_source.module is None or information_source.method is None:
|
||||
logger.error(f"{information_source.title} module or method is None")
|
||||
continue
|
||||
# 动态导入模块和函数
|
||||
module = importlib.import_module(information_source.module)
|
||||
task_function = getattr(module, information_source.method)
|
||||
try:
|
||||
task_function(information_source)
|
||||
except Exception as e:
|
||||
logger.error(f"{information_source.title} task error: {e}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
execute_task(main_spider_task)
|
||||
0
task/hot_topic/__init__.py
Normal file
0
task/hot_topic/__init__.py
Normal file
BIN
task/hot_topic/__pycache__/__init__.cpython-312.pyc
Normal file
BIN
task/hot_topic/__pycache__/__init__.cpython-312.pyc
Normal file
Binary file not shown.
BIN
task/hot_topic/__pycache__/zhihu.cpython-312.pyc
Normal file
BIN
task/hot_topic/__pycache__/zhihu.cpython-312.pyc
Normal file
Binary file not shown.
36
task/hot_topic/zhihu.py
Normal file
36
task/hot_topic/zhihu.py
Normal file
@ -0,0 +1,36 @@
|
||||
from database.tvideoscript.video_script import video_script_not_exists, VideoScript, create_video_script
|
||||
from seek.zhihu_com.zhihu_hot import ZhihuHot
|
||||
from task.manager_task import execute_task
|
||||
|
||||
|
||||
def spider_task():
|
||||
zhihu_hot = ZhihuHot()
|
||||
# 1. 获取热榜主题
|
||||
hot_topic_url_list = zhihu_hot.get_topic_url_list()
|
||||
|
||||
# 2. 过滤掉已经在数据库存在的主题
|
||||
hot_topic_url_list = video_script_not_exists(hot_topic_url_list)
|
||||
|
||||
# 3. 选择前10个主题
|
||||
hot_topic_url_list = hot_topic_url_list[:10]
|
||||
# hot_topic_url_list = hot_topic_url_list[:3]
|
||||
|
||||
# 4. 循环获取每个主题的内容
|
||||
for hot_topic_url in hot_topic_url_list:
|
||||
print(hot_topic_url)
|
||||
content = zhihu_hot.get_content(hot_topic_url)
|
||||
print(content)
|
||||
if content['contents'] is None or len(content['contents']) == 0:
|
||||
print(f'skip {hot_topic_url}, no fitch content')
|
||||
continue
|
||||
# 5. 将内容保存到数据库中
|
||||
video_script = VideoScript(title=content['title'],
|
||||
keywords=content['keywords'],
|
||||
description=content['topic_description'],
|
||||
content=content['contents'],
|
||||
url=content['url'])
|
||||
create_video_script(video_script)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
execute_task(spider_task)
|
||||
112
task/manager_task.py
Normal file
112
task/manager_task.py
Normal file
@ -0,0 +1,112 @@
|
||||
import importlib
|
||||
import time
|
||||
|
||||
from apscheduler.schedulers.blocking import BlockingScheduler
|
||||
|
||||
from config import config
|
||||
from database.database import get_session
|
||||
from database.tscheduler.crud import get_tasks_by_executor
|
||||
from log.log_manager import log
|
||||
|
||||
"""
|
||||
这是一个特殊的任务,负责管理任务,命名为管理者任务。
|
||||
|
||||
工作流程:
|
||||
1 检索数据库任务数据表
|
||||
2 检查是否已经在任务队列中,如果不在则添加
|
||||
|
||||
任务执行时间间隔为600秒。
|
||||
|
||||
"""
|
||||
|
||||
def log_task_execution(task_name: str, start_time: float, end_time: float = None):
|
||||
"""辅助函数,记录任务的开始和结束日志"""
|
||||
start_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(start_time))
|
||||
end_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(end_time))
|
||||
if end_time is None:
|
||||
log(f"{task_name} start execute at {start_time_str}")
|
||||
else:
|
||||
elapsed_time = end_time - start_time
|
||||
log(f"{task_name} end execute at {end_time_str}, use time {elapsed_time:.2f} seconds")
|
||||
|
||||
|
||||
def execute_task(task: callable):
|
||||
"""执行任务并记录日志"""
|
||||
start_time = time.time()
|
||||
log_task_execution(task.__name__, start_time) # 先记录开始时间
|
||||
task()
|
||||
end_time = time.time()
|
||||
log_task_execution(task.__name__, start_time, end_time) # 记录结束时间
|
||||
|
||||
# 从数据库加载任务
|
||||
def load_tasks(scheduler: BlockingScheduler):
|
||||
with get_session() as db:
|
||||
tasks = get_tasks_by_executor(db, config.scheduler_name)
|
||||
|
||||
for task in tasks:
|
||||
module_path = task.module_path
|
||||
function_name = task.function_name
|
||||
trigger = task.trigger
|
||||
interval_seconds = task.interval_seconds
|
||||
task_id = task.id
|
||||
|
||||
# 动态导入模块和函数
|
||||
module = importlib.import_module(module_path)
|
||||
task_function = getattr(module, function_name)
|
||||
|
||||
job = scheduler.get_job(str(task_id))
|
||||
# 检查任务是否已存在
|
||||
if not job:
|
||||
if trigger == "interval":
|
||||
scheduler.add_job(
|
||||
task_function,
|
||||
"interval",
|
||||
seconds=interval_seconds,
|
||||
id=str(task_id),
|
||||
replace_existing=True,
|
||||
misfire_grace_time=interval_seconds
|
||||
)
|
||||
log(f"Task {task.task_name} added with interval {interval_seconds} seconds")
|
||||
elif trigger == "cron":
|
||||
# 解析 cron 表达式的字段
|
||||
fields = task.cron_expression.split()
|
||||
# 确保字段长度符合七字段格式
|
||||
if len(fields) != 7:
|
||||
raise ValueError("无效的 Quartz cron 表达式")
|
||||
# 替换 Quartz 风格的 `?` 为 APScheduler 可接受的 `*`
|
||||
if fields[5] == '?':
|
||||
fields[5] = '*' # 替换 `day_of_week` 字段中的 `?`
|
||||
# 使用 cron 表达式的字段添加任务
|
||||
scheduler.add_job(
|
||||
task_function, # 要执行的任务
|
||||
'cron', # 使用 cron 触发器
|
||||
second=fields[0], # 秒
|
||||
minute=fields[1], # 分钟
|
||||
hour=fields[2], # 小时
|
||||
day=fields[3], # 日期
|
||||
month=fields[4], # 月份
|
||||
day_of_week=fields[5], # 星期
|
||||
year=fields[6], # 年份
|
||||
id=str(task_id),
|
||||
replace_existing=True
|
||||
)
|
||||
log(f"Task {task.task_name} added with cron {task.cron_expression}")
|
||||
elif trigger == "date":
|
||||
scheduler.add_job(
|
||||
task_function,
|
||||
"date",
|
||||
run_date=task["run_date_and_time"],
|
||||
id=str(task_id),
|
||||
replace_existing=True
|
||||
)
|
||||
log(f"Task {task.task_name} added with date {task.execution_date}")
|
||||
else:
|
||||
log(f"Task Invalid trigger type: {trigger}")
|
||||
else:
|
||||
log(f"Task {task.task_name} already exists......")
|
||||
run_time = job.next_run_time - job.trigger.start_date
|
||||
log(f"Task {task.task_name} already exists, run time is {run_time}")
|
||||
|
||||
# 管理者任务
|
||||
def manager_task(scheduler: BlockingScheduler):
|
||||
load_tasks(scheduler)
|
||||
Reference in New Issue
Block a user