import peter

2025-11-12 20:42:16 +08:00
commit 8c1a740f0b
147 changed files with 2763 additions and 0 deletions
--- a/task/init.py
+++ b/task/init.py
--- a/task/pycache/init.cpython-312.pyc
+++ b/task/pycache/init.cpython-312.pyc
--- a/task/pycache/manager_task.cpython-312.pyc
+++ b/task/pycache/manager_task.cpython-312.pyc
--- a/task/content/init.py
+++ b/task/content/init.py
--- a/task/content/pycache/init.cpython-312.pyc
+++ b/task/content/pycache/init.cpython-312.pyc
--- a/task/content/pycache/content_spider_task.cpython-312.pyc
+++ b/task/content/pycache/content_spider_task.cpython-312.pyc
--- a/task/content/content_spider_task.py
+++ b/task/content/content_spider_task.py
@ -0,0 +1,38 @@
+import importlib
+
+from database.database import get_session
+from database.tinformationsource.curd import get_active_information_sources
+from database.tnews.crud import get_news_need_content
+from log.log_manager import logger
+from task.manager_task import execute_task
+
+
+def content_spider_task():
+    with get_session() as db:
+        # 1. 获取信息源数据
+        information_sources = get_active_information_sources(db)
+        # 2. 获取需要获取内容的新闻数据
+        news_list = get_news_need_content(db)
+        # 3. 遍历新闻数据
+        for news in news_list:
+            for information_source in information_sources:
+                if information_source.title != news.source:
+                    continue
+                if information_source.module is None or information_source.method is None:
+                    logger.error(f"{information_source.title} module or method is None")
+                    continue
+                news.is_static = information_source.is_static
+                # 动态导入模块和函数
+                # 把模块路径最后一部分换成content
+                module_path = information_source.module.rsplit('.', 1)[0] + '.content'
+                module = importlib.import_module(module_path)
+                task_function = getattr(module, 'content_task')
+                try:
+                    task_function(news)
+                except Exception as e:
+                    logger.error(f"{information_source.title} task error: {e}")
+
+
+
+if __name__ == '__main__':
+    execute_task(content_spider_task)
--- a/task/default/init.py
+++ b/task/default/init.py
--- a/task/default/pycache/init.cpython-312.pyc
+++ b/task/default/pycache/init.cpython-312.pyc
--- a/task/default/pycache/main_spider_task.cpython-312.pyc
+++ b/task/default/pycache/main_spider_task.cpython-312.pyc
--- a/task/default/main_spider_task.py
+++ b/task/default/main_spider_task.py
@ -0,0 +1,26 @@
+import importlib
+
+from database.database import get_session
+from database.tinformationsource.curd import get_active_information_sources
+from log.log_manager import logger
+from task.manager_task import execute_task
+
+
+def main_spider_task():
+    with get_session() as db:
+        information_sources = get_active_information_sources(db)
+        for information_source in information_sources:
+            if information_source.module is None or information_source.method is None:
+                logger.error(f"{information_source.title} module or method is None")
+                continue
+            # 动态导入模块和函数
+            module = importlib.import_module(information_source.module)
+            task_function = getattr(module, information_source.method)
+            try:
+                task_function(information_source)
+            except Exception as e:
+                logger.error(f"{information_source.title} task error: {e}")
+
+
+if __name__ == '__main__':
+    execute_task(main_spider_task)
--- a/task/hot_topic/init.py
+++ b/task/hot_topic/init.py
--- a/task/hot_topic/pycache/init.cpython-312.pyc
+++ b/task/hot_topic/pycache/init.cpython-312.pyc
--- a/task/hot_topic/pycache/zhihu.cpython-312.pyc
+++ b/task/hot_topic/pycache/zhihu.cpython-312.pyc
--- a/task/hot_topic/zhihu.py
+++ b/task/hot_topic/zhihu.py
@ -0,0 +1,36 @@
+from database.tvideoscript.video_script import video_script_not_exists, VideoScript, create_video_script
+from seek.zhihu_com.zhihu_hot import ZhihuHot
+from task.manager_task import execute_task
+
+
+def spider_task():
+    zhihu_hot = ZhihuHot()
+    # 1. 获取热榜主题
+    hot_topic_url_list = zhihu_hot.get_topic_url_list()
+
+    # 2. 过滤掉已经在数据库存在的主题
+    hot_topic_url_list = video_script_not_exists(hot_topic_url_list)
+
+    # 3. 选择前10个主题
+    hot_topic_url_list = hot_topic_url_list[:10]
+    # hot_topic_url_list = hot_topic_url_list[:3]
+
+    # 4. 循环获取每个主题的内容
+    for hot_topic_url in hot_topic_url_list:
+        print(hot_topic_url)
+        content = zhihu_hot.get_content(hot_topic_url)
+        print(content)
+        if content['contents'] is None or len(content['contents']) == 0:
+            print(f'skip {hot_topic_url}, no fitch content')
+            continue
+        # 5. 将内容保存到数据库中
+        video_script = VideoScript(title=content['title'],
+                                   keywords=content['keywords'],
+                                   description=content['topic_description'],
+                                   content=content['contents'],
+                                   url=content['url'])
+        create_video_script(video_script)
+
+
+if __name__ == '__main__':
+    execute_task(spider_task)
--- a/task/manager_task.py
+++ b/task/manager_task.py
@ -0,0 +1,112 @@
+import importlib
+import time
+
+from apscheduler.schedulers.blocking import BlockingScheduler
+
+from config import config
+from database.database import get_session
+from database.tscheduler.crud import get_tasks_by_executor
+from log.log_manager import log
+
+"""
+这是一个特殊的任务，负责管理任务，命名为管理者任务。
+
+工作流程：
+1 检索数据库任务数据表
+2 检查是否已经在任务队列中，如果不在则添加
+
+任务执行时间间隔为600秒。
+
+"""
+
+def log_task_execution(task_name: str, start_time: float, end_time: float = None):
+    """辅助函数，记录任务的开始和结束日志"""
+    start_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(start_time))
+    end_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(end_time))
+    if end_time is None:
+        log(f"{task_name} start execute at {start_time_str}")
+    else:
+        elapsed_time = end_time - start_time
+        log(f"{task_name} end execute at {end_time_str}, use time {elapsed_time:.2f} seconds")
+
+
+def execute_task(task: callable):
+    """执行任务并记录日志"""
+    start_time = time.time()
+    log_task_execution(task.__name__, start_time)  # 先记录开始时间
+    task()
+    end_time = time.time()
+    log_task_execution(task.__name__, start_time, end_time)  # 记录结束时间
+
+# 从数据库加载任务
+def load_tasks(scheduler: BlockingScheduler):
+    with get_session() as db:
+        tasks = get_tasks_by_executor(db, config.scheduler_name)
+
+        for task in tasks:
+            module_path = task.module_path
+            function_name = task.function_name
+            trigger = task.trigger
+            interval_seconds = task.interval_seconds
+            task_id = task.id
+
+            # 动态导入模块和函数
+            module = importlib.import_module(module_path)
+            task_function = getattr(module, function_name)
+
+            job = scheduler.get_job(str(task_id))
+            # 检查任务是否已存在
+            if not job:
+                if trigger == "interval":
+                    scheduler.add_job(
+                        task_function,
+                        "interval",
+                        seconds=interval_seconds,
+                        id=str(task_id),
+                        replace_existing=True,
+                        misfire_grace_time=interval_seconds
+                    )
+                    log(f"Task {task.task_name} added with interval {interval_seconds} seconds")
+                elif trigger == "cron":
+                    # 解析 cron 表达式的字段
+                    fields = task.cron_expression.split()
+                    # 确保字段长度符合七字段格式
+                    if len(fields) != 7:
+                        raise ValueError("无效的 Quartz cron 表达式")
+                    # 替换 Quartz 风格的 `?` 为 APScheduler 可接受的 `*`
+                    if fields[5] == '?':
+                        fields[5] = '*' # 替换 `day_of_week` 字段中的 `?`
+                    # 使用 cron 表达式的字段添加任务
+                    scheduler.add_job(
+                        task_function,  # 要执行的任务
+                        'cron',  # 使用 cron 触发器
+                        second=fields[0],  # 秒
+                        minute=fields[1],  # 分钟
+                        hour=fields[2],  # 小时
+                        day=fields[3],  # 日期
+                        month=fields[4],  # 月份
+                        day_of_week=fields[5],  # 星期
+                        year=fields[6],  # 年份
+                        id=str(task_id),
+                        replace_existing=True
+                    )
+                    log(f"Task {task.task_name} added with cron {task.cron_expression}")
+                elif trigger == "date":
+                    scheduler.add_job(
+                        task_function,
+                        "date",
+                        run_date=task["run_date_and_time"],
+                        id=str(task_id),
+                        replace_existing=True
+                    )
+                    log(f"Task {task.task_name} added with date {task.execution_date}")
+                else:
+                    log(f"Task Invalid trigger type: {trigger}")
+            else:
+                log(f"Task {task.task_name} already exists......")
+                run_time = job.next_run_time - job.trigger.start_date
+                log(f"Task {task.task_name} already exists, run time is {run_time}")
+
+# 管理者任务
+def manager_task(scheduler: BlockingScheduler):
+    load_tasks(scheduler)