调整 real estate 故事创作逻辑

2026-02-23 14:49:10 +08:00
parent 6008a7ff4b
commit c1e5db64c1
5 changed files with 173 additions and 6 deletions
--- a/task/hot_topic/real_estate_story.py
+++ b/task/hot_topic/real_estate_story.py
@@ -1,4 +1,4 @@
-from datetime import datetime, timedelta
+from datetime import datetime
 import json
 from task.manager_task import execute_task
 from config.database import SessionLocal
@@ -9,17 +9,112 @@ from llm import LLMThinkingEngine

 def story_edit_task():
    with SessionLocal() as db:
-        # 获取今天最新的帖子（限定，最多10条）
+        # 获取今天的帖子（限定，最多50条）
        today_contents = db.query(SourceContent).filter(
            SourceContent.create_time >= (datetime.today().replace(hour=0, minute=0, second=0, microsecond=0))
-        ).order_by(SourceContent.create_time.desc()).limit(10).all()
+        ).order_by(SourceContent.create_time.desc()).limit(50).all()
        if len(today_contents) == 0:
            logger.info("story_edit_task finish, content size 0")
            return
        logger.info(f"story_edit_task get {len(today_contents)} contents")
+
+        # 按照帖子正文字数排序
+        # 定义提取函数：解析JSON并返回content字段长度
+        def get_content_length(item):
+            try:
+                if not item.content:
+                    return 0
+                data = json.loads(item.content)
+                # 安全获取 content 字段，避免 None
+                body = data.get('content') or ''
+                return len(body)
+            except (json.JSONDecodeError, TypeError, AttributeError):
+                return 0
+        today_contents.sort(key=lambda x: get_content_length(x), reverse=True)
        
+        # 去掉帖子正文字数小于200的帖子
+        to_processed_contents = [content for content in today_contents if get_content_length(content) >= 200]
+        logger.info(f"story_edit_task after filter content size {len(to_processed_contents)}")
+
+        # 如果没有符合条件的帖子，直接使用字数最多的帖子（即使它的字数小于200）
+        if len(to_processed_contents) == 0 and len(today_contents) > 0:
+            to_processed_contents = [today_contents[0]]
+
+        # 下面会调用LLM对帖子进行筛选，此处限定所有帖子的正文字数之和不超过10000字（成本安全考虑）
+        total_length = sum(get_content_length(content) for content in to_processed_contents)
+        if total_length > 10000:
+            # 列表已按字数降序排列，pop() 移除的是末尾元素，即从字数最少的帖子开始逐步移除，直到总字数不超过10000
+            # 注意：如果单条帖子的字数就超过10000，循环会把列表清空
+            while total_length > 10000 and to_processed_contents:
+                removed_content = to_processed_contents.pop()
+                total_length -= get_content_length(removed_content)
+
+        # 如果to_processed_contents数量超过2条，则让LLM从中选择2条最适合创作故事的帖子
+        # 定义提取函数：解析JSON并返回content内容
+        def get_content(item):
+            try:
+                if not item.content:
+                    return ""
+                data = json.loads(item.content)
+                # 安全获取 content 字段，避免 None
+                body = data.get('content') or ''
+                return body
+            except (json.JSONDecodeError, TypeError, AttributeError):
+                return ""
+        if len(to_processed_contents) > 2:
+            llm_engine = LLMThinkingEngine(system_prompt_file="real_estate_story_selection_system_prompt.txt")
+            content_list_str = "\n".join([f"{idx+1}. {get_content(content)}" for idx, content in enumerate(to_processed_contents)])
+            logger.info(f"story_edit_task LLM selection content list: {content_list_str}")
+            selection_result = llm_engine.think(content_list_str)
+            logger.info(f"story_edit_task LLM selection result: {selection_result}")
+            # 解析LLM的选择结果，提取出数字编号
+            selected_indices = []
+            for part in selection_result.split(","):
+                part = part.strip()
+                if part.isdigit():
+                    idx = int(part) - 1
+                    if 0 <= idx < len(to_processed_contents):
+                        selected_indices.append(idx)
+                if len(selected_indices) >= 2:
+                    break
+            to_processed_contents = [to_processed_contents[idx] for idx in selected_indices]
+            logger.info(f"story_edit_task after LLM selection content size {len(to_processed_contents)}")
+        
+        # 下面对筛选后的帖子逐条进行故事创作（循环末尾的 break 已被注释掉，因此会处理列表中的全部帖子）
+        llm_engine = LLMThinkingEngine(system_prompt_file="wechat_official_account_system_prompt.txt")
+        for content in to_processed_contents:
+            logger.info(f"story_edit_task content id: {content.id}, title: {content.link}, platform: {content.platform}")
+            story = llm_engine.think(f"【素材内容】\n{content.content}")
+            logger.info(f"story_edit_task content id: {content.id} story: {story}")
+            # llm生成的结果有时不是json结构，会在前后增加一些文本，需要提取出json部分进行解析
+            try:
+                json_start = story.find("{")
+                json_end = story.rfind("}") + 1
+                if json_start != -1 and json_end != -1:
+                    story = story[json_start:json_end]
+                else:
+                    logger.warning(f"story_edit_task content id: {content.id} llm生成的结果不是有效的json格式，无法提取故事内容")
+                    continue
+            except json.JSONDecodeError:
+                logger.warning(f"story_edit_task content id: {content.id} llm生成的结果不是有效的json格式，无法解析故事内容")
+                continue
+            # 将生成的故事写入Article表
+            json_story = json.loads(story)
+            title = json_story.get("title", "无标题")
+            article_content = json_story.get("body", "无内容")
+            # article_content有连续多个换行的情况，替换成单个换行
+            # article_content = "\n".join([line.strip() for line in article_content.splitlines() if line.strip()])
+            article = Article(
+                title=title,
+                keywords=None,
+                content=article_content,
+                used=False
+            )
+            db.add(article)
+            db.commit()
+            # break  # 目前先处理一条内容，后续再改成批量处理
+
        llm_engine = LLMThinkingEngine(system_prompt_file="real_estate_story_system_prompt.txt")
-        for content in today_contents:
+        for content in to_processed_contents:
            logger.info(f"story_edit_task content id: {content.id}, title: {content.link}, platform: {content.platform}")
            story = llm_engine.think(f"故事素材：{content.content}")
            logger.info(f"story_edit_task content id: {content.id} story: {story}")
@@ -51,7 +146,7 @@ def story_edit_task():
            # break  # 目前先处理一条内容，后续再改成批量处理
        
        llm_engine = LLMThinkingEngine(system_prompt_file="real_estate_story_short_system_prompt.txt")
-        for content in today_contents:
+        for content in to_processed_contents:
            logger.info(f"story_edit_task content id: {content.id}, title: {content.link}, platform: {content.platform}")
            story = llm_engine.think(f"故事素材：{content.content}")
            logger.info(f"story_edit_task content id: {content.id} story: {story}")