Files
edward/task/hot_topic/real_estate_story.py
konjacpotato 9361eb97bf
All checks were successful
Gitea Actions Demo / deploy (push) Successful in 2m51s
修改微头条prompt
2026-02-27 10:35:49 +08:00

154 lines
8.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from datetime import datetime
import json
from task.manager_task import execute_task
from config.database import SessionLocal
from models import SourceContent, Article
from utils import logger
from llm import LLMThinkingEngine
# Posts whose body is shorter than this are dropped before LLM selection.
_MIN_BODY_CHARS = 200
# Upper bound on the summed body length sent to the selection LLM (cost guard).
_MAX_TOTAL_CHARS = 10000
# Number of posts the selection LLM is asked to pick.
_MAX_SELECTED = 2


def _source_body(item):
    """Return the text stored under ``content`` in an item's JSON payload.

    ``item.content`` is parsed as JSON. An empty payload, invalid JSON, a
    payload that is not a JSON object, or a non-string ``content`` value all
    yield ``""`` so callers can take ``len()`` of the result unconditionally.
    """
    try:
        if not item.content:
            return ""
        body = json.loads(item.content).get("content")
    except (json.JSONDecodeError, TypeError, AttributeError):
        return ""
    return body if isinstance(body, str) else ""


def _extract_json_object(reply):
    """Return the span from the first ``{`` to the last ``}`` of *reply*.

    The LLM sometimes wraps its JSON answer in extra text, so only the
    outermost brace-delimited span is kept. Returns ``None`` when *reply* is
    not a string or contains no such span.
    """
    if not isinstance(reply, str):
        return None
    start = reply.find("{")
    end = reply.rfind("}")
    # rfind() returns -1 when "}" is absent; test it before adding 1 so the
    # "not found" case is actually detected.
    if start == -1 or end < start:
        return None
    return reply[start:end + 1]


def _normalize_article_body(body):
    """Turn the ``body`` value of an LLM reply into a single string.

    A string is returned unchanged, a list/tuple of paragraphs is joined with
    newlines, and anything else falls back to the placeholder text.
    """
    if isinstance(body, str):
        return body
    if isinstance(body, (list, tuple)):
        return "\n".join(str(paragraph) for paragraph in body)
    return "无内容"


def _select_with_llm(candidates):
    """Ask the selection LLM to pick up to ``_MAX_SELECTED`` posts.

    The LLM receives a 1-based numbered list of post bodies and is expected to
    answer with comma-separated numbers. Out-of-range, non-numeric and
    repeated numbers are ignored. If no usable number is found, the first
    ``_MAX_SELECTED`` candidates are returned instead of an empty list.
    """
    llm_engine = LLMThinkingEngine(system_prompt_file="real_estate_story_selection_system_prompt.txt")
    content_list_str = "\n".join(
        f"{number}. {_source_body(content)}"
        for number, content in enumerate(candidates, start=1)
    )
    logger.info(f"story_edit_task LLM selection content list: {content_list_str}")
    selection_result = llm_engine.think(content_list_str)
    logger.info(f"story_edit_task LLM selection result: {selection_result}")

    selected_indices = []
    for part in (selection_result or "").split(","):
        part = part.strip()
        # isdecimal() guarantees int() succeeds (isdigit() also accepts
        # characters such as superscripts that int() rejects).
        if not part.isdecimal():
            continue
        idx = int(part) - 1
        if 0 <= idx < len(candidates) and idx not in selected_indices:
            selected_indices.append(idx)
            if len(selected_indices) >= _MAX_SELECTED:
                break

    if not selected_indices:
        logger.warning("story_edit_task LLM selection result has no valid index, fallback to first contents")
        return candidates[:_MAX_SELECTED]
    return [candidates[idx] for idx in selected_indices]


def _generate_articles(db, contents, system_prompt_file, prompt_prefix):
    """Generate one ``Article`` per post with the given prompt and persist it.

    For each post the raw ``content`` payload is sent to the LLM prefixed by
    *prompt_prefix*. The reply must contain a JSON object with ``title`` and
    ``body``; replies that cannot be parsed are logged and skipped so one bad
    reply does not abort the remaining posts. Each article is committed
    individually.
    """
    llm_engine = LLMThinkingEngine(system_prompt_file=system_prompt_file)
    for content in contents:
        logger.info(f"story_edit_task content id: {content.id}, title: {content.link}, platform: {content.platform}")
        story = llm_engine.think(f"{prompt_prefix}{content.content}")
        logger.info(f"story_edit_task content id: {content.id} story: {story}")

        json_text = _extract_json_object(story)
        if json_text is None:
            logger.warning(f"story_edit_task content id: {content.id} llm生成的结果不是有效的json格式无法提取故事内容")
            continue
        try:
            json_story = json.loads(json_text)
        except json.JSONDecodeError:
            json_story = None
        # A valid JSON value that is not an object has no title/body either.
        if not isinstance(json_story, dict):
            logger.warning(f"story_edit_task content id: {content.id} llm生成的故事内容不是有效的json格式无法解析故事内容")
            continue

        article = Article(
            title=json_story.get("title", "无标题"),
            keywords=None,
            content=_normalize_article_body(json_story.get("body", "无内容")),
            used=False,
        )
        db.add(article)
        db.commit()


def story_edit_task():
    """Turn today's source posts into stored articles.

    Steps:
      1. Load at most 50 of today's ``SourceContent`` rows, newest first.
      2. Sort them by body length (longest first) and keep those with at
         least ``_MIN_BODY_CHARS`` characters; if none qualify, keep only the
         longest post.
      3. Drop posts from the short end until the summed body length is at
         most ``_MAX_TOTAL_CHARS``.
      4. If more than ``_MAX_SELECTED`` posts remain, let the selection LLM
         choose among them.
      5. For every remaining post, generate an article with the WeChat
         official-account prompt and another with the micro-headline prompt,
         storing each as an ``Article`` row.

    Returns ``None``; results are written to the database.
    """
    with SessionLocal() as db:
        # Fetch today's posts, capped at 50.
        start_of_today = datetime.today().replace(hour=0, minute=0, second=0, microsecond=0)
        today_contents = (
            db.query(SourceContent)
            .filter(SourceContent.create_time >= start_of_today)
            .order_by(SourceContent.create_time.desc())
            .limit(50)
            .all()
        )
        if not today_contents:
            logger.info("story_edit_task finish, content size 0")
            return
        logger.info(f"story_edit_task get {len(today_contents)} contents")

        # Longest body first.
        today_contents.sort(key=lambda item: len(_source_body(item)), reverse=True)

        # Discard posts whose body is too short to build a story from.
        to_processed_contents = [
            item for item in today_contents if len(_source_body(item)) >= _MIN_BODY_CHARS
        ]
        logger.info(f"story_edit_task after filter content size {len(to_processed_contents)}")
        # Nothing qualified: fall back to the single longest post.
        if not to_processed_contents:
            to_processed_contents = [today_contents[0]]

        # Cost guard for the selection call: the list is sorted longest-first,
        # so pop() discards the shortest remaining post each time.
        total_length = sum(len(_source_body(item)) for item in to_processed_contents)
        while total_length > _MAX_TOTAL_CHARS and to_processed_contents:
            total_length -= len(_source_body(to_processed_contents.pop()))

        # Let the LLM pick the posts best suited for story writing.
        if len(to_processed_contents) > _MAX_SELECTED:
            to_processed_contents = _select_with_llm(to_processed_contents)
            logger.info(f"story_edit_task after LLM selection content size {len(to_processed_contents)}")

        # WeChat official-account articles.
        _generate_articles(
            db,
            to_processed_contents,
            "wechat_official_account_system_prompt.txt",
            "【素材内容】\n",
        )
        # Micro-headline articles.
        _generate_articles(
            db,
            to_processed_contents,
            "wei_toutiao_system_prompt.txt",
            "故事素材:",
        )
# Script entry point: when the file is run directly, hand story_edit_task to
# execute_task (imported from task.manager_task) instead of calling it inline.
if __name__ == "__main__":
    execute_task(story_edit_task)