commit code

2025-12-29 19:34:39 +08:00
parent 87160c5265
commit 6772699cfe
22 changed files with 2268 additions and 70 deletions
--- a/llm/generate_daily_article.py
+++ b/llm/generate_daily_article.py
@ -0,0 +1,110 @@
+import json
+from datetime import datetime, timedelta, timezone
+import re
+from typing import Any, Dict, List, Optional
+
+from openai import OpenAI
+
+from config.settings import settings
+from llm import prompt as prompts
+from utils.logger import logger
+from llm.prompts.daily_article_prompt import PROMPT_DAILY_ARTICLE
+
+
+# DashScope endpoint in its OpenAI "compatible-mode"; the OpenAI SDK client is pointed at it.
+BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1"
+# Model name passed as `model=` to every chat-completion request made by this module.
+MODEL = "deepseek-v3.2"
+
+
+def _make_client() -> OpenAI:
+    """Build an OpenAI SDK client that targets the DashScope endpoint.
+
+    The API key is read from ``settings.DASHSCOPE_API_KEY`` and the base URL
+    from the module-level ``BASE_URL``. A new client is created on every call.
+    """
+    client = OpenAI(base_url=BASE_URL, api_key=settings.DASHSCOPE_API_KEY)
+    return client
+
+
+def _call_model(
+    system_prompt: Optional[str],
+    user_prompt: str,
+    stream: bool = False,
+    enable_search: bool = False,
+) -> Any:
+    """Send one chat-completion request and return the model's text.
+
+    Args:
+        system_prompt: Optional system message; omitted from the request when falsy.
+        user_prompt: Content of the single user message.
+        stream: Forwarded to the SDK. When true, the streamed chunks are
+            consumed here and their text deltas are joined into one string,
+            so callers get the same kind of value as in the non-streaming case.
+        enable_search: Forwarded to the endpoint as ``extra_body={"enable_search": ...}``.
+
+    Returns:
+        The message content of the first choice. If the response does not have
+        the expected shape, the legacy ``choices[0].text`` field is tried, and
+        as a last resort the raw response object is returned unchanged.
+    """
+    client = _make_client()
+    messages = []
+    if system_prompt:
+        messages.append({"role": "system", "content": system_prompt})
+    messages.append({"role": "user", "content": user_prompt})
+
+    resp = client.chat.completions.create(
+        model=MODEL,
+        messages=messages,
+        stream=stream,
+        extra_body={"enable_search": enable_search},
+    )
+
+    if stream:
+        # A streaming response is an iterator of chunks, not a completed object:
+        # reading resp.choices on it fails. Collect the incremental text instead.
+        pieces = []
+        for chunk in resp:
+            choices = getattr(chunk, "choices", None)
+            if not choices:
+                # Some chunks carry no choices at all; skip them.
+                continue
+            delta = getattr(choices[0], "delta", None)
+            piece = getattr(delta, "content", None)
+            if piece:
+                pieces.append(piece)
+        return "".join(pieces)
+
+    # Only shape-related failures are tolerated; anything else should surface.
+    shape_errors = (AttributeError, IndexError, TypeError)
+    try:
+        # OpenAI-compatible chat format: resp.choices[0].message.content
+        return resp.choices[0].message.content
+    except shape_errors:
+        try:
+            # Legacy completion format: resp.choices[0].text
+            return resp.choices[0].text
+        except shape_errors:
+            # Last resort: hand the raw response back to the caller.
+            return resp
+
+
+def _extract_json(text: str) -> str:
+    """Return the first JSON object/array embedded in ``text`` as a substring.
+
+    Every ``{`` or ``[`` is tried, left to right, as the start of a JSON
+    document using the real JSON decoder. This keeps brackets that appear
+    inside string values from ending the match early, and skips bracketed
+    prose (for example ``[note]``) that precedes the actual payload.
+
+    If no position decodes as valid JSON, the function falls back to a greedy
+    regex that spans from the first opening bracket to the last matching
+    closing bracket, so the caller's ``json.loads`` reports the syntax error.
+
+    Args:
+        text: Raw model output that may wrap JSON in prose or code fences.
+
+    Returns:
+        The extracted substring (not the parsed value).
+
+    Raises:
+        ValueError: If ``text`` is not a ``str``, contains no ``{``/``[``, or
+            nothing resembling a JSON object/array can be located.
+    """
+    if not isinstance(text, str):
+        raise ValueError("Expected text to be str")
+
+    candidates = [i for i, ch in enumerate(text) if ch in "[{"]
+    if not candidates:
+        raise ValueError("No JSON object/array found in text")
+
+    decoder = json.JSONDecoder()
+    for start in candidates:
+        try:
+            # raw_decode returns the absolute index just past the decoded document.
+            _, end = decoder.raw_decode(text, start)
+        except ValueError:
+            # Not valid JSON from this bracket; try the next one.
+            continue
+        return text[start:end]
+
+    # Fallback: best-effort span from the first opener to the last closer.
+    m = re.search(r"(\{.*\}|\[.*\])", text, re.S)
+    if m:
+        return m.group(1)
+    raise ValueError("Could not extract JSON from model output")
+
+
+def _parse_json_safe(text: str) -> Any:
+    """Parse ``text`` as JSON, tolerating surrounding non-JSON content.
+
+    The text is first parsed as-is. If that fails for any reason, the JSON
+    substring located by ``_extract_json`` is parsed instead; errors from that
+    second attempt propagate to the caller.
+    """
+    try:
+        parsed = json.loads(text)
+    except Exception:
+        # Direct parse failed: retry on the embedded JSON substring only.
+        return json.loads(_extract_json(text))
+    else:
+        return parsed
+
+
+def generate_daily_article() -> Dict[str, Any]:
+    """Ask the model for today's article bundle and return it as parsed JSON.
+
+    ``PROMPT_DAILY_ARTICLE`` is sent as the only user message with web search
+    enabled. The prompt asks for a single JSON object, so the expected result
+    is a dict; the shape of the parsed value is not validated here.
+
+    Returns:
+        The parsed model output. If the model call already yields a dict or
+        list, it is returned unchanged without parsing.
+
+    Raises:
+        ValueError: If no JSON can be extracted from or parsed out of the
+            model output (``json.JSONDecodeError`` is a ``ValueError``).
+    """
+    logger.debug(f"prompt for generate_daily_article:\n{PROMPT_DAILY_ARTICLE}")
+
+    content = _call_model(system_prompt=None, user_prompt=PROMPT_DAILY_ARTICLE, enable_search=True)
+    logger.debug(f"raw output from generate_daily_article:\n{content}")
+    if isinstance(content, (dict, list)):
+        # Already structured data: nothing to parse.
+        return content
+    # Anything that is not a string is stringified before JSON parsing.
+    text = content if isinstance(content, str) else str(content)
+    data = _parse_json_safe(text)
+    logger.debug(f"result for generate_daily_article:\n{data}")
+    return data
+
+
+
+
+
+if __name__ == "__main__":
+    # Manual smoke run: generate today's bundle and print only the article body.
+    result = generate_daily_article()
+    print(result["阶段4_今日文章"]["文章正文"])
--- a/llm/generate_podcast.py
+++ b/llm/generate_podcast.py
@ -11,14 +11,14 @@ from utils.logger import logger


 BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1"
-MODEL = "deepseek-v3.2-exp"
+MODEL = "deepseek-v3.2"


 def _make_client() -> OpenAI:
    return OpenAI(api_key=settings.DASHSCOPE_API_KEY, base_url=BASE_URL)


-def _call_model(system_prompt: Optional[str], user_prompt: str, stream: bool = False) -> Any:
+def _call_model(system_prompt: Optional[str], user_prompt: str, stream: bool = False, enable_search: bool = False) -> Any:
    client = _make_client()
    messages = []
    if system_prompt:
@ -26,7 +26,7 @@ def _call_model(system_prompt: Optional[str], user_prompt: str, stream: bool = F
    messages.append({"role": "user", "content": user_prompt})

    # Non-streaming call for simplicity
-    resp = client.chat.completions.create(model=MODEL, messages=messages, stream=stream)
+    resp = client.chat.completions.create(model=MODEL, messages=messages, stream=stream, extra_body={"enable_search": enable_search})
    # When stream=False the SDK typically returns a full object; content location may vary.
    # We'll try common access patterns.
    try:
@ -118,7 +118,7 @@ def generate_topics(start_time: Optional[str] = None, end_time: Optional[str] =

    logger.debug(f"prompt for generate_topics:\n{user_prompt}")

-    content = _call_model(system_prompt=None, user_prompt=user_prompt)
+    content = _call_model(system_prompt=None, user_prompt=user_prompt, enable_search=True)
    logger.debug(f"raw output from generate_topics:\n{content}")
    if isinstance(content, (dict, list)):
        return content
--- a/llm/prompt.py
+++ b/llm/prompt.py
@ -42,7 +42,7 @@ prompt_b1 = """
 - meme_name：要写段子的梗名称（字符串）
 - research：关于该梗的深度研究文本（字符串）

-根据以上输入，创作3篇风格不同的脱口秀段子，要求如下并严格返回 JSON 对象（仅输出 JSON）：
+根据以上输入，创作1篇风格不同的脱口秀段子，要求如下并严格返回 JSON 对象（仅输出 JSON）：
 {
    "meme": "梗名称",
    "style": "观察生活", 
@ -57,7 +57,7 @@ prompt_b2 = """
 - meme_name：要写段子的梗名称（字符串）
 - research：关于该梗的深度研究文本（字符串）

-根据以上输入，创作3篇风格不同的脱口秀段子，要求如下并严格返回 JSON 对象（仅输出 JSON）：
+根据以上输入，创作1篇风格不同的脱口秀段子，要求如下并严格返回 JSON 对象（仅输出 JSON）：
 {
    "meme": "梗名称",
    "style": "夸张讽刺", 
@ -72,7 +72,7 @@ prompt_b3 = """
 - meme_name：要写段子的梗名称（字符串）
 - research：关于该梗的深度研究文本（字符串）

-根据以上输入，创作3篇风格不同的脱口秀段子，要求如下并严格返回 JSON 对象（仅输出 JSON）：
+根据以上输入，创作1篇风格不同的脱口秀段子，要求如下并严格返回 JSON 对象（仅输出 JSON）：
 {
    "meme": "梗名称",
    "style": "角色扮演", 
@ -87,27 +87,37 @@ prompt_c = """
 - meme_name：梗名称（字符串）
 - materials：包含“深度研究”与若干脱口秀段子的文本（字符串），已由人工筛选

-任务：把 materials 整合成一篇完整的播客文稿，结构严格按照：开场白 -> 梗介绍 -> 起源考据 -> 传播路径 -> 影响分析 -> 脱口秀环节（插入2-3个段子） -> 结束语
+任务：将 materials 整合为一档四人播客的完整文稿。

-输出格式（严格 JSON，对话按顺序列出，角色限定为 host/guest）：
+节目设定：
+- 主持人 Host（1人）：理性、引导节奏、串联全场。
+- 脱口秀演员 Guest_A / Guest_B / Guest_C（3人）：各有幽默风格，可即兴互动，负责讲段子与分析。
+
+文稿结构（请严格按以下流程撰写）：
+1. 开场白（Host 开场，介绍节目与三位演员，轻松互动，40-80字）
+2. 梗介绍（Host 简明引入梗，可向演员提问互动，40-100字）
+3. 起源考据（由一位演员结合材料讲述，可穿插其他人简短反应，60-150字）
+4. 传播路径（Host 引导，可由不同演员补充案例，50-120字）
+5. 影响分析（演员轮流发表观点，Host 总结，80-180字）
+6. 脱口秀环节（Host 引入，三位演员依次表演段子，每个段子 1000 - 1200 字，段子之间可有简短互动或调侃）
+7. 结束语（Host 收尾，感谢演员，邀请听众互动，30-60字）
+
+输出格式（严格 JSON）：
 {
-    "title": "节目标题（建议不超12字）",
+    "title": "节目标题（12字以内，吸引人）",
    "script": [
-        {"role": "host", "text": "开场白（口语化，20-60字）"},
-        {"role": "host", "text": "梗介绍（简明，30-80字）"},
-        {"role": "guest", "text": "起源考据（40-120字）"},
-        {"role": "host", "text": "传播路径（30-80字）"},
-        {"role": "guest", "text": "影响分析（40-120字）"},
-        {"role": "host", "text": "转入脱口秀环节的台词（15-40字）"},
-        {"role": "guest", "text": "段子A（来自 materials，1000-1200字）"},
-        {"role": "guest", "text": "段子B（来自 materials，1000-1200字）"},
-        {"role": "guest", "text": "段子C（来自 materials，1000-1200字）"},
-        {"role": "host", "text": "结束语（15-40字）"}
+        {"role": "host", "text": "..."},
+        {"role": "guest_a", "text": "..."},
+        {"role": "guest_b", "text": "..."},
+        {"role": "guest_c", "text": "..."},
+        ...
    ]
 }

 要求：
- 语言口语化，避免书面语；角色语气分别为：host（理性、引导）、guest（幽默、即兴）。
- 在 script 中只保留最终可直接朗读的台词，不要加入编剧说明或括注。每段尽量简洁，便于主播读出。
- 严格输出 JSON，不要额外解释或多余文本。
+- 语言高度口语化，符合聊天氛围，避免书面语。
+- 角色区分明显：host 控场理性，guest_a/b/c 幽默且风格可略有不同（可自设特点，如冷笑话、夸张、吐槽等）。
+- script 中只放最终台词，不添加说明。每段台词不宜过长，确保可朗读。
+- 在合适处允许演员之间简短对话（如提问、接梗、调侃），增强现场感。
+- 严格仅输出 JSON，无任何额外文本。
 """
--- a/llm/prompts/daily_article_prompt.py
+++ b/llm/prompts/daily_article_prompt.py
@ -0,0 +1,105 @@
+# Prompt for the daily-article generator. It instructs the model to run four
+# stages (information gathering, topic generation, style matching, article
+# writing) and to reply with exactly one JSON object matching the template at
+# the end of the prompt. llm/generate_daily_article.py reads
+# result["阶段4_今日文章"]["文章正文"], so those keys must stay in sync with it.
+PROMPT_DAILY_ARTICLE = """
+你是【智能写作素材生成系统】。
+
+你的任务是严格按照下述【四个阶段】执行，并且【只允许输出一个 JSON 对象】。
+❗除 JSON 外，不得输出任何解释、说明、注释、Markdown、代码块或多余文本。
+
+====================
+【通用强制规则】
+1. 最终输出必须是一个合法 JSON（UTF-8，无注释）
+2. 字段名、层级结构、顺序必须与下方模板完全一致
+3. 不允许新增、删除、重命名任何字段
+4. 所有字符串必须是中文
+5. 所有数组必须按要求数量输出（不可多不可少）
+6. 需要联网获取信息（今日热点 / 文化日历 / 写作趋势 / 天气季节）
+====================
+
+【阶段1：信息采集（联网）】
+- 搜索今日热点，提取 5 个“写作灵感关键词”
+- 查询今日文化日历事件（至少 2 条）
+- 分析当前热门写作趋势（至少 3 条，来自写作/内容社区）
+- 获取今日天气与季节特征（概括性描述）
+
+【阶段2：主题生成】
+基于阶段1信息，生成 3 个写作主题：
+- 主题A：结合“热点 + 文化事件”
+- 主题B：回应“社会情绪 + 季节特征”
+- 主题C：实验性主题（新兴写作形式或叙事结构）
+
+【阶段3：风格匹配】
+为 主题A / 主题B / 主题C 分别给出：
+- 写作风格
+- 叙事视角
+- 重点训练技巧
+- 应避免的常见问题
+
+【阶段4：生成今日文章】
+- 在 A / B / C 中选择综合质量最高的一个
+- 生成一篇 800–1000 字中文文章
+- 文章必须完整、可直接发表
+
+====================
+【❗唯一允许的输出 JSON 模板如下】
+（必须严格匹配，不得修改结构）
+
+{
+  "阶段1_信息采集": {
+    "今日热点关键词": [
+      "",
+      "",
+      "",
+      "",
+      ""
+    ],
+    "今日文化日历事件": [
+      "",
+      ""
+    ],
+    "当前热门写作趋势": [
+      "",
+      "",
+      ""
+    ],
+    "今日天气与季节特征": ""
+  },
+  "阶段2_主题生成": {
+    "主题A": {
+      "标题": "",
+      "主题说明": ""
+    },
+    "主题B": {
+      "标题": "",
+      "主题说明": ""
+    },
+    "主题C": {
+      "标题": "",
+      "主题说明": ""
+    }
+  },
+  "阶段3_风格匹配": {
+    "主题A": {
+      "写作风格": "",
+      "叙事视角": "",
+      "重点训练技巧": "",
+      "应避免的常见问题": ""
+    },
+    "主题B": {
+      "写作风格": "",
+      "叙事视角": "",
+      "重点训练技巧": "",
+      "应避免的常见问题": ""
+    },
+    "主题C": {
+      "写作风格": "",
+      "叙事视角": "",
+      "重点训练技巧": "",
+      "应避免的常见问题": ""
+    }
+  },
+  "阶段4_今日文章": {
+    "选定主题": "主题A / 主题B / 主题C（三选一）",
+    "文章标题": "",
+    "文章正文": ""
+  }
+}
+"""