commit 8c1a740f0b41a3d0f86acf55d0fe2da15e9e7a2f Author: konjacpotato Date: Wed Nov 12 20:42:16 2025 +0800 import peter diff --git a/Readme.md b/Readme.md new file mode 100644 index 0000000..0a4df64 --- /dev/null +++ b/Readme.md @@ -0,0 +1,3 @@ +# Peter + +电影里面的蜘蛛侠叫Peter Parker. diff --git a/config/__init__.py b/config/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/config/__pycache__/__init__.cpython-312.pyc b/config/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..bbef600 Binary files /dev/null and b/config/__pycache__/__init__.cpython-312.pyc differ diff --git a/config/__pycache__/config.cpython-312.pyc b/config/__pycache__/config.cpython-312.pyc new file mode 100644 index 0000000..381977e Binary files /dev/null and b/config/__pycache__/config.cpython-312.pyc differ diff --git a/config/config.py b/config/config.py new file mode 100644 index 0000000..e125c7d --- /dev/null +++ b/config/config.py @@ -0,0 +1,4 @@ +# scheduler name +scheduler_name = 'peter' +# scheduler interval in seconds +scheduler_interval = 3600 \ No newline at end of file diff --git a/database/Readme.md b/database/Readme.md new file mode 100644 index 0000000..e92ffdd --- /dev/null +++ b/database/Readme.md @@ -0,0 +1,5 @@ + +``` +t_top_topic table DDL +上面是top_topic数据表的DDL,根据DDL信息在database模块下,按照项目结构,创建model.py和crud.py +``` \ No newline at end of file diff --git a/database/__init__.py b/database/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/database/__pycache__/__init__.cpython-312.pyc b/database/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..3561bac Binary files /dev/null and b/database/__pycache__/__init__.cpython-312.pyc differ diff --git a/database/__pycache__/database.cpython-312.pyc b/database/__pycache__/database.cpython-312.pyc new file mode 100644 index 0000000..22615c9 Binary files /dev/null and b/database/__pycache__/database.cpython-312.pyc differ diff --git a/database/database.py 
# ---- file: database/database.py ----
# Shared SQLAlchemy engine, declarative base and session context manager.
import os
from contextlib import contextmanager

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker, declarative_base

from log.log_manager import logger

Base = declarative_base()

# SECURITY NOTE(review): the fallback URL embeds a host and password that are
# now part of version-control history. Rotate the credential and provide the
# URL through the DATABASE_URL environment variable; the literal is kept only
# so existing deployments keep working unchanged.
DATABASE_URL = os.getenv(
    'DATABASE_URL',
    'postgresql+psycopg://postgres:K8u3fg0o@47.119.128.161:60001/squirrel',
)
engine = create_engine(
    DATABASE_URL,
    pool_size=10,
    max_overflow=20,
    pool_timeout=30,
    pool_recycle=1800,  # recycle connections before the server side expires them
    connect_args={
        'connect_timeout': 15,
        'keepalives_idle': 60,
        'keepalives_interval': 10,
        'keepalives_count': 5
    }
)
# NOTE(review): this runs at import time. The model modules import Base from
# this module, so confirm the tables are actually registered on Base.metadata
# by the time this call executes.
Base.metadata.create_all(engine)

# Build the session factory once instead of on every get_session() call.
_SessionFactory = sessionmaker(bind=engine)


@contextmanager
def get_session():
    """Yield a database session bound to the shared engine.

    The transaction is committed when the ``with`` body finishes normally.
    On any exception the transaction is rolled back, the error is logged and
    the exception is re-raised. The session is always closed.
    """
    session = _SessionFactory()
    try:
        yield session
        session.commit()  # commit automatically on success
    except Exception as e:
        session.rollback()  # undo partial work on failure
        logger.error(f"Database operation failed: {str(e)}")
        raise  # let the caller see the original exception
    finally:
        session.close()  # always release the connection


# ---- file: database/thotcontent/crud.py (first part) ----
from database.thotcontent.model import THotContent
from log.log_manager import logger


def create_hot_content(db, hot_content: THotContent):
    """Insert *hot_content*, commit, reload it from the database and return it."""
    db.add(hot_content)
    db.commit()
    db.refresh(hot_content)
    return hot_content
# ---- file: database/thotcontent/crud.py (continued) ----
def create_content_if_url_not_exists(db, hot_content: THotContent):
    """Insert *hot_content* unless a row with the same URL already exists.

    Returns the existing row when the URL is already stored, otherwise the
    newly inserted (and refreshed) *hot_content*.
    """
    existing_content = db.query(THotContent).filter(THotContent.url == hot_content.url).first()
    if existing_content:
        return existing_content

    db.add(hot_content)
    db.commit()
    db.refresh(hot_content)
    return hot_content


def create_contents_top3_if_url_not_exists(db, contents: list[THotContent]):
    """Store the three most up-voted items of *contents* whose URL is new.

    *contents* is sorted in place by ``content_upvote_count`` (descending),
    as before. Only the first three items are considered, and each is
    inserted only when no row with the same URL exists.

    Returns the list of items that were actually inserted.
    """
    logger.info(f"采集到内容数量:{len(contents)},存入数据库前三")
    # content_upvote_count is a nullable column; treat a missing count as 0 so
    # the sort does not raise TypeError when comparing None with int.
    contents.sort(key=lambda x: x.content_upvote_count or 0, reverse=True)

    inserted_contents = []
    for content in contents[:3]:
        existing_content = db.query(THotContent).filter(THotContent.url == content.url).first()
        if not existing_content:
            db.add(content)
            inserted_contents.append(content)

    # One commit for the whole batch, then reload the rows that were inserted.
    db.commit()
    for content in inserted_contents:
        db.refresh(content)

    return inserted_contents


def get_hot_content_by_id(db, hot_content_id: int):
    """Return the row with primary key *hot_content_id*, or None."""
    return db.query(THotContent).filter(THotContent.id == hot_content_id).first()


def get_hot_content_by_topic_id(db, topic_id: int):
    """Return every content row whose ``topic_id`` equals *topic_id*."""
    return db.query(THotContent).filter(THotContent.topic_id == topic_id).all()


def get_hot_contents(db, skip: int = 0, limit: int = 100):
    """Return up to *limit* content rows after skipping the first *skip*."""
    return db.query(THotContent).offset(skip).limit(limit).all()


def update_hot_content(db, hot_content_id: int, updates: dict):
    """Set each ``key=value`` of *updates* on the row and commit.

    Returns the refreshed row, or None when the id does not exist.
    """
    hot_content = db.query(THotContent).filter(THotContent.id == hot_content_id).first()
    if hot_content:
        for key, value in updates.items():
            setattr(hot_content, key, value)
        db.commit()
        db.refresh(hot_content)
    return hot_content


def delete_hot_content(db, hot_content_id: int):
    """Delete the row with primary key *hot_content_id* if it exists.

    Returns the deleted object (or None when nothing matched), consistent
    with the other delete helpers in the database package.
    """
    hot_content = db.query(THotContent).filter(THotContent.id == hot_content_id).first()
    if hot_content:
        db.delete(hot_content)
        db.commit()
    return hot_content
# ---- file: database/thotcontent/model.py ----
from dataclasses import dataclass
from datetime import datetime
from typing import Optional

from sqlalchemy import Column, Integer, String, BIGINT, TIMESTAMP, func
from sqlalchemy.dialects.postgresql import BIGINT

from database.database import Base


@dataclass
class THotContent(Base):
    """ORM mapping of table ``t_hot_content``: one piece of content under a hot topic."""

    __tablename__ = 't_hot_content'

    id: int = Column(BIGINT, primary_key=True, autoincrement=True, comment='序号')
    topic_id: int = Column(BIGINT, nullable=False, comment='关联话题ID')
    url: Optional[str] = Column(String, nullable=True, comment='内容链接')
    content: Optional[str] = Column(String, nullable=True, comment='内容详情')
    content_upvote_count: Optional[int] = Column(BIGINT, nullable=True, comment='内容点赞数量')
    content_comment_count: Optional[int] = Column(Integer, nullable=True, comment='内容评论数量')
    create_time: datetime = Column(TIMESTAMP(timezone=True), server_default=func.now(), nullable=False, comment='创建时间')

    def __repr__(self):
        # The previous implementation returned an empty string, which made
        # log lines and debugger output useless.
        return f"<THotContent(id={self.id}, topic_id={self.topic_id}, url={self.url!r})>"
# ---- file: database/thottopic/crud.py ----
from database.thottopic.model import THotTopic


def create_hot_topic(db, hot_topic: THotTopic):
    """Insert *hot_topic*, commit, reload it from the database and return it."""
    db.add(hot_topic)
    db.commit()
    db.refresh(hot_topic)
    return hot_topic


def create_topic_if_url_not_exists(db, hot_topic: THotTopic):
    """Insert *hot_topic* unless a row with the same URL already exists.

    Returns the existing row when the URL is already stored, otherwise the
    newly inserted (and refreshed) *hot_topic*.
    """
    existing_topic = db.query(THotTopic).filter(THotTopic.url == hot_topic.url).first()
    if existing_topic:
        return existing_topic

    db.add(hot_topic)
    db.commit()
    db.refresh(hot_topic)
    return hot_topic


def create_topics_if_url_not_exists(db, topics: list[THotTopic]):
    """Insert every topic of *topics* whose URL is not stored yet.

    All inserts share one commit. Returns the list of topics that were
    actually inserted, each refreshed from the database.
    """
    inserted_topics = []

    for topic in topics:
        existing_topic = db.query(THotTopic).filter(THotTopic.url == topic.url).first()
        if not existing_topic:
            db.add(topic)
            inserted_topics.append(topic)

    db.commit()

    for topic in inserted_topics:
        db.refresh(topic)

    return inserted_topics


def hot_topic_not_exists(db, url_list: list) -> list:
    """Remove from *url_list* every URL that is already stored as a hot topic.

    The list is modified in place (as before) and also returned.

    :param db: session used for the lookup
    :param url_list: candidate URLs
    :return: *url_list*, now holding only URLs that are not in the database
    """
    if not url_list:
        # Nothing to check; avoid issuing an IN () query with no values.
        return url_list

    hot_topics = db.query(THotTopic).filter(THotTopic.url.in_(url_list)).all()
    known_urls = {hot_topic.url for hot_topic in hot_topics}
    # Filtering through a set fixes two problems of the earlier
    # remove()-per-row loop: it raised ValueError when two rows shared a URL,
    # and it left duplicates of a known URL in the list.
    url_list[:] = [url for url in url_list if url not in known_urls]
    return url_list


def get_hot_topic_by_id(db, hot_topic_id: int):
    """Return the topic with primary key *hot_topic_id*, or None."""
    return db.query(THotTopic).filter(THotTopic.id == hot_topic_id).first()


def get_hot_topics(db, skip: int = 0, limit: int = 100):
    """Return up to *limit* topics after skipping the first *skip*."""
    return db.query(THotTopic).offset(skip).limit(limit).all()


def get_latest_hot_topic(db):
    """Return the topic with the most recent ``update_time``, or None."""
    return db.query(THotTopic).order_by(THotTopic.update_time.desc()).first()


def update_hot_topic(db, hot_topic: THotTopic):
    """Merge the state of *hot_topic* into the session, commit and return it.

    ``Session.merge`` returns the session-attached instance. When *hot_topic*
    is detached that is a different object, and refreshing the argument would
    fail, so the merged instance is refreshed and returned. When *hot_topic*
    is already attached, ``merge`` returns the same object and nothing
    changes for the caller.
    """
    merged = db.merge(hot_topic)
    db.commit()
    db.refresh(merged)
    return merged


def delete_hot_topic(db, hot_topic_id: int):
    """Delete the topic with primary key *hot_topic_id*; return it, or None."""
    hot_topic = db.query(THotTopic).filter(THotTopic.id == hot_topic_id).first()
    if hot_topic:
        db.delete(hot_topic)
        db.commit()
    return hot_topic


# ---- file: database/thottopic/model.py ----
from dataclasses import dataclass
from datetime import datetime
from typing import Optional

from sqlalchemy import Column, String, Integer, TIMESTAMP, func
from sqlalchemy.dialects.postgresql import BIGINT

from database.database import Base


@dataclass
class THotTopic(Base):
    """ORM mapping of table ``t_hot_topic``: a trending topic and its top content."""

    __tablename__ = 't_hot_topic'

    id: int = Column(BIGINT, primary_key=True, autoincrement=True, comment='序号')
    topic: str = Column(String, nullable=False, comment='话题')
    topic_description: Optional[str] = Column(String, nullable=True, comment='话题描述')
    url: Optional[str] = Column(String, nullable=True, comment='话题链接')
    source: Optional[str] = Column(String, nullable=True, comment='话题来源')
    keywords: Optional[str] = Column(String, nullable=True, comment='话题关键词')
    content_count: int = Column(Integer, default=0, nullable=False, comment='话题内容数量')
    comment_count: int = Column(Integer, default=0, nullable=False, comment='话题评论数量')
    follower_count: int = Column(Integer, default=0, nullable=False, comment='话题关注者数量')
    date_created: Optional[datetime] = Column(TIMESTAMP(timezone=True), nullable=True, comment='话题创建时间')
    date_modified: Optional[datetime] = Column(TIMESTAMP(timezone=True), nullable=True, comment='话题修改时间')
    top_content_url: Optional[str] = Column(String, nullable=True, comment='热内内容链接')
    top_content_upvote_count: Optional[int] = Column(BIGINT, nullable=True, comment='热门内容点赞数量')
    top_content_comment_count: Optional[int] = Column(Integer, nullable=True, comment='热门内容评论数量')
    create_time: datetime = Column(TIMESTAMP(timezone=True), server_default=func.now(), nullable=False, comment='创建时间')
    # NOTE(review): unlike TInformationSource.update_time this column has no
    # onupdate=func.now(); confirm whether callers are expected to set it.
    update_time: Optional[datetime] = Column(TIMESTAMP(timezone=True), server_default=func.now(), nullable=False, comment='更新时间')
    ai_script: Optional[str] = Column(String, nullable=True, comment='内容脚本')

    def __repr__(self):
        # Previously returned an empty string.
        return f"<THotTopic(id={self.id}, topic={self.topic!r}, url={self.url!r})>"


# ---- file: database/tinformationsource/curd.py ----
from database.tinformationsource.model import TInformationSource


def create_information_source(db, information_source: TInformationSource):
    """Insert *information_source*, commit, reload it and return it."""
    db.add(information_source)
    db.commit()
    db.refresh(information_source)
    return information_source


def get_information_source_by_id(db, information_source_id: int):
    """Return the source with primary key *information_source_id*, or None."""
    return db.query(TInformationSource).filter(TInformationSource.id == information_source_id).first()


def get_active_information_sources(db) -> list:
    """Return every information source whose ``active`` flag is true."""
    return db.query(TInformationSource).filter(TInformationSource.active == True).all()


def update_information_source(db, information_source_id: int, updates: dict):
    """Set each ``key=value`` of *updates* on the source and commit.

    Returns the refreshed row, or None when the id does not exist.
    """
    update_information = db.query(TInformationSource).filter(TInformationSource.id == information_source_id).first()
    if update_information:
        for key, value in updates.items():
            setattr(update_information, key, value)
        db.commit()
        db.refresh(update_information)
    return update_information


def delete_update_information(db, information_source_id: int):
    """Delete the source with primary key *information_source_id*; return it, or None."""
    update_information = db.query(TInformationSource).filter(TInformationSource.id == information_source_id).first()
    if update_information:
        db.delete(update_information)
        db.commit()
    return update_information


# ---- file: database/tinformationsource/model.py ----
from dataclasses import dataclass
from datetime import datetime

from sqlalchemy import Column, String, Boolean, TIMESTAMP, func, INT
from sqlalchemy.dialects.postgresql import BIGINT

from database.database import Base


@dataclass
class TInformationSource(Base):
    """ORM mapping of table ``t_information_source``: a site/feed to collect news from."""

    __tablename__ = 't_information_source'

    id: int = Column(BIGINT, primary_key=True, autoincrement=True, comment='编号')
    title: str = Column(String, nullable=False, comment='标题')
    description: str = Column(String, nullable=True, comment='描述')
    keywords: str = Column(String, nullable=True, comment='关键字')
    url: str = Column(String, nullable=True, comment='网站链接')
    rss: str = Column(String, nullable=True, comment='RSS链接')
    api: str = Column(String, nullable=True, comment='API')
    primary_category: str = Column(String, nullable=True, comment='一级类别')
    secondary_category: str = Column(String, nullable=True, comment='二级类别')
    tertiary_category: str = Column(String, nullable=True, comment='三级类别')
    label: str = Column(String, nullable=True, comment='标签')
    lang: str = Column(String, nullable=False, default='zh', comment='语言')
    priority: int = Column(INT, nullable=False, default=100, comment='优先级')
    active: bool = Column(Boolean, default=False, nullable=False, comment='是否启用:false未启用,true启用')
    module: str = Column(String, nullable=True, comment='任务逻辑所在模块名称')
    method: str = Column(String, nullable=True, comment='任务逻辑的函数名称')
    # The two timestamp columns were annotated ``str``; they are TIMESTAMP columns.
    create_time: datetime = Column(TIMESTAMP(timezone=True), server_default=func.now(), nullable=False, comment='创建时间')
    update_time: datetime = Column(TIMESTAMP(timezone=True), server_default=func.now(), onupdate=func.now(), nullable=False, comment='更新时间')
    is_static: bool = Column(Boolean, default=True, nullable=False, comment='是否是静态网站:false动态,true静态')

    def __repr__(self):
        # Previously returned an empty string.
        return f"<TInformationSource(id={self.id}, title={self.title!r}, url={self.url!r})>"
# ---- file: database/tnews/crud.py ----
from database.tnews.model import TNews


def create_news(db, news: TNews):
    """Insert *news*, commit, reload it from the database and return it."""
    db.add(news)
    db.commit()
    db.refresh(news)
    return news


def create_news_if_url_not_exists(db, news: TNews):
    """Return the stored row sharing *news*'s URL, inserting *news* when there is none."""
    duplicate = db.query(TNews).filter(TNews.url == news.url).first()
    if duplicate:
        return duplicate
    return create_news(db, news)


def create_news_list_if_url_not_exists(db, news_list: list[TNews]):
    """Insert the items of *news_list* whose URL is not stored yet.

    All inserts share a single commit. Returns the items that were inserted,
    each refreshed from the database.
    """
    fresh = []

    for candidate in news_list:
        url_taken = db.query(TNews).filter(TNews.url == candidate.url).first()
        if url_taken:
            continue
        db.add(candidate)
        fresh.append(candidate)

    db.commit()

    for stored in fresh:
        db.refresh(stored)

    return fresh


def get_news_by_id(db, news_id: int):
    """Return the news row with primary key *news_id*, or None."""
    by_id = db.query(TNews).filter(TNews.id == news_id)
    return by_id.first()


def get_news_need_content(db):
    """Return every news row whose ``content`` is NULL."""
    return db.query(TNews).filter(TNews.content.is_(None)).all()


def get_news_need_summary(db):
    """Return every news row whose ``ai_summary`` is NULL.

    NOTE(review): the TNews model in database/tnews/model.py declares
    ``summary`` but no ``ai_summary`` column; confirm the intended attribute.
    """
    return db.query(TNews).filter(TNews.ai_summary.is_(None)).all()


def get_news_for_generate_reference_message(db, news_type: str) -> list[TNews]:
    """Return unused, summarised news of *news_type*, newest occurrence first.

    NOTE(review): ``TNews.type`` and ``TNews.ai_summary`` are not declared on
    the TNews model in database/tnews/model.py; confirm against the schema.
    """
    criteria = (
        TNews.type == news_type,
        TNews.ai_summary.is_not(None),
        TNews.is_usage == False,
    )
    return db.query(TNews).filter(*criteria).order_by(TNews.occurrence_date.desc()).all()


def update_news_by_id(db, news: TNews):
    """Merge the state of *news* into the session and commit."""
    db.merge(news)
    db.commit()


def update_news(db, news_id: int, updates: dict):
    """Set each ``key=value`` of *updates* on the row and commit.

    Returns the refreshed row, or None when the id does not exist.
    """
    target = db.query(TNews).filter(TNews.id == news_id).first()
    if not target:
        return target
    for field_name, new_value in updates.items():
        setattr(target, field_name, new_value)
    db.commit()
    db.refresh(target)
    return target


def delete_news(db, news_id: int):
    """Delete the row with primary key *news_id*; return it, or None."""
    doomed = db.query(TNews).filter(TNews.id == news_id).first()
    if not doomed:
        return doomed
    db.delete(doomed)
    db.commit()
    return doomed
# ---- file: database/tnews/model.py ----
from dataclasses import dataclass
from datetime import datetime
from typing import Optional

from sqlalchemy import Column, String, Boolean, DateTime, BigInteger, text, INT
from database.database import Base

@dataclass
class TNews(Base):
    """ORM mapping of table ``t_news``: one collected news article.

    NOTE(review): database/tnews/crud.py filters on ``TNews.ai_summary`` and
    ``TNews.type``, neither of which is declared here; confirm which side is
    out of date.
    """
    __tablename__ = 't_news'

    id: int = Column(BigInteger, primary_key=True, autoincrement=True, comment='编号')
    title: Optional[str] = Column(String, nullable=True, comment='标题')
    summary: Optional[str] = Column(String, nullable=True, comment='摘要')
    # The CRUD helpers use url as the de-duplication key.
    url: Optional[str] = Column(String, nullable=True, comment='链接')
    content: Optional[str] = Column(String, nullable=True, comment='内容/正文')
    occurrence_date: Optional[datetime] = Column(DateTime(timezone=True), nullable=True, comment='发布日期')
    source: Optional[str] = Column(String, nullable=True, comment='来源')
    primary_category: str = Column(String, nullable=True, comment='一级类别')
    secondary_category: str = Column(String, nullable=True, comment='二级类别')
    tertiary_category: str = Column(String, nullable=True, comment='三级类别')
    label: str = Column(String, nullable=True, comment='标签')
    lang: str = Column(String, nullable=False, default='zh', comment='语言')
    # Both a Python-side default and a server-side default are declared.
    is_usage: bool = Column(Boolean, nullable=False, default=False, server_default=text('false'), comment='是否已用')
    create_time: datetime = Column(DateTime(timezone=True), nullable=False, server_default=text('now()'), comment='创建日期')
# ---- file: database/tscheduler/crud.py ----
from database.tscheduler.model import TScheduler


def create_task(db, task: TScheduler):
    """Insert *task*, commit, reload it from the database and return it."""
    db.add(task)
    db.commit()
    db.refresh(task)
    return task


def get_task_by_id(db, task_id: int):
    """Return the task with primary key *task_id*, or None."""
    by_id = db.query(TScheduler).filter(TScheduler.id == task_id)
    return by_id.first()


def get_active_tasks(db):
    """Return every task whose ``active`` flag is true."""
    enabled = db.query(TScheduler).filter(TScheduler.active == True)
    return enabled.all()


def get_tasks_by_executor(db, executor: str):
    """Return the active tasks assigned to *executor*."""
    criteria = (
        TScheduler.executor == executor,
        TScheduler.active == True,
    )
    return db.query(TScheduler).filter(*criteria).all()


def update_task(db, task_id: int, updates: dict):
    """Set each ``key=value`` of *updates* on the task and commit.

    Returns the refreshed task, or None when the id does not exist.
    """
    target = db.query(TScheduler).filter(TScheduler.id == task_id).first()
    if not target:
        return target
    for field_name, new_value in updates.items():
        setattr(target, field_name, new_value)
    db.commit()
    db.refresh(target)
    return target


def delete_task(db, task_id: int):
    """Delete the task with primary key *task_id*; return it, or None."""
    doomed = db.query(TScheduler).filter(TScheduler.id == task_id).first()
    if not doomed:
        return doomed
    db.delete(doomed)
    db.commit()
    return doomed


# ---- file: database/tscheduler/model.py ----
from dataclasses import dataclass
from datetime import datetime
from typing import Optional
from sqlalchemy import Column, Integer, String, Boolean, Text, DateTime
from database.database import Base


@dataclass
class TScheduler(Base):
    """ORM mapping of table ``t_scheduler``: one schedulable task definition."""

    __tablename__ = 't_scheduler'

    id: int = Column(Integer, primary_key=True, autoincrement=True, comment='自动递增的唯一任务ID')
    task_name: str = Column(String(64), nullable=False, comment='任务名称')
    trigger: str = Column(String(10), nullable=False, comment='调度方式,interval、cron、date')
    interval_seconds: Optional[int] = Column(Integer, nullable=True, comment='固定时间间隔(秒),用于 interval 类型')
    cron_expression: Optional[str] = Column(String(255), nullable=True, comment='CRON 表达式,用于 cron 类型')
    execution_date: Optional[datetime] = Column(DateTime, nullable=True, comment='执行时间,用于 date 类型')
    task_payload: Optional[str] = Column(Text, nullable=True, comment='任务相关的参数或数据')
    active: Optional[bool] = Column(Boolean, default=False, nullable=True, comment='任务状态,是否启用')
    executor: Optional[str] = Column(String(32), nullable=True, comment='任务执行者')
    handler: Optional[str] = Column(String(32), nullable=True, comment='任务执行程序')
    last_run: Optional[datetime] = Column(DateTime, nullable=True, comment='上一次执行时间')
    next_run: Optional[datetime] = Column(DateTime, nullable=True, comment='下一次执行时间')
    # NOTE(review): datetime.utcnow is deprecated since Python 3.12 and yields
    # naive timestamps; update_time also has no onupdate hook. Left unchanged.
    create_time: datetime = Column(DateTime, default=datetime.utcnow, nullable=True, comment='创建时间')
    update_time: datetime = Column(DateTime, default=datetime.utcnow, nullable=True, comment='更新时间')
    module_path: Optional[str] = Column(String(255), nullable=True, comment='任务逻辑所在模块名称')
    function_name: Optional[str] = Column(String(256), nullable=True, comment='任务逻辑的函数名称')
# ---- file: database/tvideoscript/video_script.py ----
from dataclasses import dataclass
from datetime import datetime
from typing import Optional

from sqlalchemy import Column, String, TIMESTAMP, func

from database.database import Base, get_session
from utils import utils


@dataclass
class VideoScript(Base):
    """ORM mapping of table ``t_video_script``: a generated video script for a topic."""

    __tablename__ = 't_video_script'

    id: str = Column(String, primary_key=True, comment='唯一标识')
    title: str = Column(String, nullable=False, comment='标题')
    description: Optional[str] = Column(String, nullable=True, comment='描述')
    keywords: Optional[str] = Column(String, nullable=True, comment='话题关键词')
    url: str = Column(String, nullable=False, comment='话题链接')
    script: str = Column(String, nullable=True, comment='视频脚本')
    content: str = Column(String, nullable=True, comment='话题内容')
    create_time: datetime = Column(TIMESTAMP(timezone=True), server_default=func.now(), nullable=False, comment='创建时间')

    def __repr__(self):
        # Previously returned an empty string.
        return f"<VideoScript(id={self.id!r}, title={self.title!r}, url={self.url!r})>"


def create_video_script(video_script: VideoScript):
    """Store *video_script* and return it with its database values loaded.

    When ``id`` is None it is derived from the URL via ``utils.get_md5``.
    """
    if video_script.id is None:
        video_script.id = utils.get_md5(video_script.url)

    with get_session() as db:
        db.add(video_script)
        db.commit()
        db.refresh(video_script)
        # get_session() commits again and closes the session on exit. With
        # SQLAlchemy's default expire_on_commit that would presumably leave
        # the returned object expired and detached. Detaching it here, while
        # its attributes are loaded, keeps it readable for the caller.
        db.expunge(video_script)
        return video_script


def video_script_not_exists(url_list: list):
    """Remove from *url_list* every URL that already has a stored video script.

    The list is modified in place (as before) and also returned.

    :param url_list: candidate URLs
    :return: *url_list*, now holding only URLs that are not in the database
    """
    if not url_list:
        # Nothing to check; avoid issuing an IN () query with no values.
        return url_list

    with get_session() as db:
        video_scripts = db.query(VideoScript).filter(VideoScript.url.in_(url_list)).all()
        known_urls = {video_script.url for video_script in video_scripts}

    # The earlier remove()-per-row loop raised ValueError when two rows shared
    # a URL and left duplicates of a known URL in the list.
    url_list[:] = [url for url in url_list if url not in known_urls]
    return url_list
# ---- file: log/log_manager.py ----
# Usage:
#   1. In code:
#        from log.log_manager import logger
#        logger.info("Starting Jarvas")
#   2. At application start:
#        python demo.py --logconfig=log_prod.config
#
# log_prod.config in this directory is a reference configuration.
import logging.config
import sys

import config.config

# Default logging configuration for development: INFO and above to stdout.
LOG_DEV_CONFIG = {
    "version": 1,
    "disable_existing_loggers": False,
    "loggers": {
        "root": {
            "level": "INFO",
            "handlers": ["consoleHandler"]
        }
    },
    "handlers": {
        "consoleHandler": {
            "class": "logging.StreamHandler",
            "level": "INFO",
            "formatter": "verbose",
            "stream": sys.stdout
        }
    },
    "formatters": {
        "verbose": {
            "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
            "datefmt": "%Y-%m-%d %H:%M:%S"
        }
    }
}

_LOGCONFIG_PREFIX = '--logconfig='

# Look for "--logconfig=<path>" on the command line. Matching on the full
# prefix and splitting only on the first '=' avoids the ValueError the earlier
# substring match + split('=') raised for a bare "--logconfig" or for a path
# that itself contains '='.
logconfig_param = next((arg for arg in sys.argv if arg.startswith(_LOGCONFIG_PREFIX)), None)
logconfig_value = logconfig_param[len(_LOGCONFIG_PREFIX):] if logconfig_param else None

if logconfig_value:
    log_config_message = f"--logconfig value: {logconfig_value}"
    # Use the logging configuration file given on the command line.
    logging.config.fileConfig(logconfig_value)
else:
    log_config_message = "没有找到 --logconfig 参数,使用默认log配置"
    # Fall back to the built-in development configuration.
    logging.config.dictConfig(LOG_DEV_CONFIG)

logger = logging.getLogger('root')
# Report which logging configuration is in effect.
logger.info(log_config_message)


def log(message: str):
    """Log *message* at INFO level, prefixed with the configured scheduler name."""
    logger.info(f'{config.config.scheduler_name} {message}')


# ---- file: log/log_prod.config (INI for logging.config.fileConfig) ----
# Content kept verbatim; commented out only so this listing stays valid Python.
#
# [loggers]
# keys=root
#
# [handlers]
# keys=fileHandler
#
# [formatters]
# keys=verbose
#
# [logger_root]
# level=INFO
# handlers=fileHandler
#
# [handler_fileHandler]
# class=FileHandler
# level=INFO
# formatter=verbose
# args=('app.log', 'a')
#
# [formatter_verbose]
# format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
# datefmt=%Y-%m-%d %H:%M:%S


# ---- file: peter.py ----
import datetime
from functools import partial

from apscheduler.events import EVENT_JOB_ERROR
from apscheduler.schedulers.blocking import BlockingScheduler

from config import config
from log.log_manager import log, logger
from task.manager_task import manager_task


def job_error_listener(event):
    """Log the job id and exception of a scheduler job that raised."""
    if event.exception:
        logger.error(f"Job {event.job_id} crashed: {str(event.exception)}")
        # Hook point for e-mail / DingTalk alerting.


if __name__ == '__main__':
    scheduler = BlockingScheduler()

    # Run manager_task every config.scheduler_interval seconds; the first run
    # is scheduled 10 seconds after start-up.
    scheduler.add_job(
        partial(manager_task, scheduler),
        'interval',
        seconds=config.scheduler_interval,
        jitter=30,  # random jitter so runs do not pile up at the same instant
        next_run_time=datetime.datetime.now() + datetime.timedelta(seconds=10)  # used instead of a separate date trigger
    )

    # Report jobs that end with an exception.
    scheduler.add_listener(job_error_listener, EVENT_JOB_ERROR)

    try:
        log("started successfully.")
        scheduler.start()  # blocks until the scheduler is shut down
    except (KeyboardInterrupt, SystemExit):
        log("Shutting down ...")
# ---- file: seek/163_com/content.py ----
import datetime

from DrissionPage.errors import ElementNotFoundError

from database.tinformationsource.model import TInformationSource
from database.tnews.model import TNews
from log.log_manager import logger
from seek.content_base import ContentBase


class ArticleContent(ContentBase):
    """Article-body extractor for 163.com article pages."""

    def __init__(self, news: TNews):
        super().__init__(news)

    def get_content(self):
        """Return the text of the ``.post_body`` element.

        Returns the literal ``'not found element'`` when the lookup raises
        ElementNotFoundError.
        """
        try:
            content_ = self.session.s_ele('.post_body').text
        except ElementNotFoundError:
            content_ = 'not found element'
        return content_


def get_content(information_source: TNews) -> str:
    """Fetch the article text for a news item and release the extractor.

    The parameter keeps its historical name, but the value is passed straight
    to ``ArticleContent``, whose constructor takes a TNews; the annotations
    were corrected to match (previously TInformationSource / list).
    """
    article_content = ArticleContent(information_source)
    try:
        return article_content.get_content()
    finally:
        # Always release the extractor, even when extraction raises.
        article_content.finish()


def content_task(news: TNews):
    """Run the content-seek task for *news*, logging start and end times."""
    logger.info(f'{news.title} news_task start execute at {datetime.datetime.now()}', )
    ofweek_com_ai = ArticleContent(news)
    try:
        ofweek_com_ai.do_seek_task()
    finally:
        # Always release the extractor, even when the task raises.
        ofweek_com_ai.finish()
    logger.info(f'{news.title} news_task end execute at {datetime.datetime.now()}')


if __name__ == '__main__':
    logger.info('This module is not for direct call!')
    news_ = TNews()
    news_.is_static = True
    news_.url = 'https://www.163.com/dy/article/JKC1V4E70519DDQ2.html'
    content = get_content(news_)
    logger.info(content)
    logger.info('Done.')


# ---- file: seek/163_com/house.py ----
import datetime

from DrissionPage.errors import ElementNotFoundError

from database.tinformationsource.model import TInformationSource
from database.tnews.model import TNews
from log.log_manager import logger
from seek.seek_base import SeekBase


class House(SeekBase):
    """News-list scraper for the 163.com real-estate channel."""

    def __init__(self, information_source: TInformationSource):
        super().__init__(information_source)

    def get_news(self):
        """Return a TNews (title, url, source) for each article row on the page.

        Rows that fail to parse are logged and skipped.
        """
        news_result = []
        _news_list = self.session.s_ele('.news-first').s_eles('.data_row news_article clearfix2 ')
        for _news in _news_list:
            try:
                rs_news = TNews()
                rs_news.title = _news.s_ele('.news_title').s_ele('tag:a').text
                rs_news.url = _news.s_ele('tag:a').link
                # TODO: summary and occurrence_date are not extracted yet.
                rs_news.source = self.information_source.title
                news_result.append(rs_news)
            except ElementNotFoundError as e:
                logger.error(f"ElementNotFoundError: {e} - Failed to find element in news item.")
            except Exception as e:
                logger.error(f'Unexpected error occurred: {e}')
        return news_result


def get_news(information_source: TInformationSource) -> list:
    """Scrape the news list for *information_source* and release the scraper."""
    instance = House(information_source)
    try:
        return instance.get_news()
    finally:
        # Always release the scraper, even when scraping raises.
        instance.finish()


def news_task(information_source: TInformationSource):
    """Run the news-seek task for *information_source*, logging start and end times."""
    logger.info(f'{information_source.title} news_task start execute at {datetime.datetime.now()}', )
    instance = House(information_source)
    try:
        instance.do_seek_task()
    finally:
        # Always release the scraper, even when the task raises.
        instance.finish()
    logger.info(f'{information_source.title} news_task end execute at {datetime.datetime.now()}')


if __name__ == '__main__':
    logger.info('This module is not for direct call!')
    information_source_ = TInformationSource()
    information_source_.is_static = True
    information_source_.url = 'https://sz.house.163.com/'
    information_source_.title = '房产_网易'
    news_task(information_source_)
    logger.info('Done.')
b/seek/anjuke_com/__pycache__/house.cpython-312.pyc new file mode 100644 index 0000000..45dc55a Binary files /dev/null and b/seek/anjuke_com/__pycache__/house.cpython-312.pyc differ diff --git a/seek/anjuke_com/content.py b/seek/anjuke_com/content.py new file mode 100644 index 0000000..e3cce9b --- /dev/null +++ b/seek/anjuke_com/content.py @@ -0,0 +1,46 @@ +import datetime + +from DrissionPage.errors import ElementNotFoundError + +from database.tinformationsource.model import TInformationSource +from database.tnews.model import TNews +from log.log_manager import logger +from seek.content_base import ContentBase + + +class ArticleContent(ContentBase): + def __init__(self, news: TNews): + super().__init__(news) + + def get_content(self): + content_ = '' + try: + content_ = self.session.s_ele('.^info-content').text + except ElementNotFoundError: + content_ = 'not found element' + return content_ + + +def get_content(information_source: TInformationSource) -> list: + article_content = ArticleContent(information_source) + result = article_content.get_content() + article_content.finish() + return result + + +def content_task(news: TNews): + logger.info(f'{news.title} news_task start execute at {datetime.datetime.now()}', ) + ofweek_com_ai = ArticleContent(news) + ofweek_com_ai.do_seek_task() + ofweek_com_ai.finish() + logger.info(f'{news.title} news_task end execute at {datetime.datetime.now()}') + + +if __name__ == '__main__': + logger.info('This module is not for direct call!') + news_ = TNews() + news_.is_static = True + news_.url = 'https://sz.news.anjuke.com/louping-965203-pan528488.html' + content = get_content(news_) + logger.info(content) + logger.info('Done.') diff --git a/seek/anjuke_com/house.py b/seek/anjuke_com/house.py new file mode 100644 index 0000000..37caa93 --- /dev/null +++ b/seek/anjuke_com/house.py @@ -0,0 +1,62 @@ +import datetime + +from DrissionPage.errors import ElementNotFoundError + +from database.tinformationsource.model import 
TInformationSource +from database.tnews.model import TNews +from log.log_manager import logger +from seek.seek_base import SeekBase +from utils.time_utils import process_time + + +class House(SeekBase): + def __init__(self, information_source: TInformationSource): + super().__init__(information_source) + + def get_news(self): + news_result = [] + print(self.session.html) + _news_list = self.session.s_ele('.main-list').s_eles('.m-list-item clearfix') + for _news in _news_list: + try: + rs_news = TNews() + tmp_ = _news.s_ele('.item-col-right') + rs_news.title = tmp_.s_ele('tag:h3').text + rs_news.url = tmp_.s_ele('tag:a').link + rs_news.summary = tmp_.s_eles('tag:a')[1].text + rs_news.occurrence_date = process_time(tmp_.s_ele('.info__time').text) + rs_news.source = self.information_source.title + news_result.append(rs_news) + except ElementNotFoundError as e: + logger.error(f"ElementNotFoundError: {e} - Failed to find element in news item.") + except Exception as e: + logger.error(f'Unexpected error occurred: {e}') + return news_result + + +def get_news(information_source: TInformationSource) -> list: + instance = House(information_source) + news_list = instance.get_news() + instance.finish() + return news_list + + +def news_task(information_source: TInformationSource): + logger.info(f'{information_source.title} news_task start execute at {datetime.datetime.now()}', ) + instance = House(information_source) + instance.do_seek_task() + instance.finish() + logger.info(f'{information_source.title} news_task end execute at {datetime.datetime.now()}') + + +if __name__ == '__main__': + logger.info('This module is not for direct call!') + information_source_ = TInformationSource() + information_source_.is_static = True + information_source_.url = 'https://sz.news.anjuke.com/hot/' + information_source_.title = '房产_安居客' + news_task(information_source_) + # news_list_ = get_news(information_source_) + # for news in news_list_: + # print(news) + logger.info('Done.') diff --git 
a/seek/cnn_com/__init__.py b/seek/cnn_com/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/seek/cnn_com/content.py b/seek/cnn_com/content.py new file mode 100644 index 0000000..db7fe87 --- /dev/null +++ b/seek/cnn_com/content.py @@ -0,0 +1,58 @@ +import datetime + +from DrissionPage.errors import ElementNotFoundError + +from database.tinformationsource.model import TInformationSource +from database.tnews.model import TNews +from log.log_manager import logger +from seek.content_base import ContentBase + + +class ArticleContent(ContentBase): + def __init__(self, news: TNews): + super().__init__(news) + + def get_content(self): + content_ = '' + try: + content_ = self.session.s_ele('#detailContent').text + except ElementNotFoundError: + content_ = 'not found element' + return content_ + + def get_occurrence_date(self): + try: + header_time = self.session.s_ele('.header-time left') + year = header_time.s_ele('.year').text # 2023 + day = header_time.s_ele('.day').text # 12/27 + time = header_time.s_ele('.time').text # 08:05:11 + occurrence_date_ = f'{year}/{day} {time}' + print(occurrence_date_) + except ElementNotFoundError: + occurrence_date_ = None + return occurrence_date_ + +def get_content(information_source: TInformationSource) -> list: + article_content = ArticleContent(information_source) + result = article_content.get_content() + article_content.get_occurrence_date() + article_content.finish() + return result + + +def content_task(news: TNews): + logger.info(f'{news.title} news_task start execute at {datetime.datetime.now()}', ) + article_content = ArticleContent(news) + article_content.do_seek_task() + article_content.finish() + logger.info(f'{news.title} news_task end execute at {datetime.datetime.now()}') + + +if __name__ == '__main__': + logger.info('This module is not for direct call!') + news_ = TNews() + news_.is_static = True + news_.url = 'https://www.news.cn/politics/leaders/20241227/90e76f85ad4a43ba94802b07c5736e00/c.html' + 
content = get_content(news_) + logger.info(content) + logger.info('Done.') diff --git a/seek/cnn_com/edition.py b/seek/cnn_com/edition.py new file mode 100644 index 0000000..cd6a416 --- /dev/null +++ b/seek/cnn_com/edition.py @@ -0,0 +1,62 @@ +import datetime + +from DrissionPage.errors import ElementNotFoundError + +from database.tinformationsource.model import TInformationSource +from database.tnews.model import TNews +from log.log_manager import logger +from seek.seek_base import SeekBase + + +class Edition(SeekBase): + def __init__(self, information_source: TInformationSource): + super().__init__(information_source) + + def get_news(self): + news_result = [] + # _news_list = self.tab.s_ele('.zone zone--t-light zone-2-observer').s_eles('.stack') + # _news_list = self.tab.s_ele('.zone zone--t-light zone-2-observer').s_eles('.stack__items ') + _news_list = self.tab.s_ele('.zone zone--t-light zone-2-observer').s_eles('tag:a') + for _news in _news_list: + print(_news.html) + try: + rs_news = TNews() + rs_news.title = _news.text + rs_news.url = _news.link + # rs_news.summary = tmp_.s_eles('tag:a')[1].text + # rs_news.occurrence_date = self.process_time(tmp_.s_ele('.info__time').text) + rs_news.source = self.information_source.title + news_result.append(rs_news) + except ElementNotFoundError as e: + logger.error(f"ElementNotFoundError: {e} - Failed to find element in news item.") + except Exception as e: + logger.error(f'Unexpected error occurred: {e}') + return news_result + + +def get_news(information_source: TInformationSource) -> list: + instance = Edition(information_source) + news_list = instance.get_news() + instance.finish() + return news_list + + +def news_task(information_source: TInformationSource): + logger.info(f'{information_source.title} news_task start execute at {datetime.datetime.now()}', ) + instance = Edition(information_source) + instance.do_seek_task() + instance.finish() + logger.info(f'{information_source.title} news_task end execute at 
{datetime.datetime.now()}') + + +if __name__ == '__main__': + logger.info('This module is not for direct call!') + information_source_ = TInformationSource() + information_source_.is_static = False + information_source_.url = 'https://edition.cnn.com/' + information_source_.title = 'edition_CNN' + # news_task(information_source_) + news_list_ = get_news(information_source_) + for news in news_list_: + print(news) + logger.info('Done.') diff --git a/seek/content_base.py b/seek/content_base.py new file mode 100644 index 0000000..f3e1827 --- /dev/null +++ b/seek/content_base.py @@ -0,0 +1,50 @@ +from abc import ABC, abstractmethod + +from DrissionPage import Chromium, SessionPage, ChromiumOptions + +from database.database import get_session +from database.tnews.crud import update_news_by_id +from database.tnews.model import TNews +from log.log_manager import log + + +class ContentBase(ABC): + def __init__(self, news: TNews): + self.news = news + self.session = None # 初始化为 None + self.browser = None # 初始化为 None + if news.is_static: + self.session = SessionPage() + self.session.get(news.url) + else: + co = ChromiumOptions() + self.browser = Chromium(addr_or_opts=co) + # self.tab = self.browser.latest_tab + self.tab = self.browser.new_tab() + self.tab.get(news.url) + + @abstractmethod + def get_content(self): + """Abstract method to fetch news from a specific source.""" + pass + + def get_occurrence_date(self): + return None + + def do_seek_task(self): + """Saves the list of news to the database if the URL does not already exist.""" + self.news.content = self.get_content() + if self.news.occurrence_date is None: + self.news.occurrence_date = self.get_occurrence_date() + with get_session() as db: + update_news_by_id(db, self.news) + log(f'successful fetch {self.news.title} news content into the database.') + + def finish(self): + """Closes the browser and session.""" + if self.tab: + self.tab.close() + # if self.browser: + # self.browser.quit() + if self.session: + 
self.session.close() \ No newline at end of file diff --git a/seek/fang_com/__init__.py b/seek/fang_com/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/seek/fang_com/__pycache__/__init__.cpython-312.pyc b/seek/fang_com/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..f1d04b1 Binary files /dev/null and b/seek/fang_com/__pycache__/__init__.cpython-312.pyc differ diff --git a/seek/fang_com/__pycache__/content.cpython-312.pyc b/seek/fang_com/__pycache__/content.cpython-312.pyc new file mode 100644 index 0000000..93410dd Binary files /dev/null and b/seek/fang_com/__pycache__/content.cpython-312.pyc differ diff --git a/seek/fang_com/__pycache__/house.cpython-312.pyc b/seek/fang_com/__pycache__/house.cpython-312.pyc new file mode 100644 index 0000000..4a8c7d9 Binary files /dev/null and b/seek/fang_com/__pycache__/house.cpython-312.pyc differ diff --git a/seek/fang_com/content.py b/seek/fang_com/content.py new file mode 100644 index 0000000..8d2cf1b --- /dev/null +++ b/seek/fang_com/content.py @@ -0,0 +1,46 @@ +import datetime + +from DrissionPage.errors import ElementNotFoundError + +from database.tinformationsource.model import TInformationSource +from database.tnews.model import TNews +from log.log_manager import logger +from seek.content_base import ContentBase + + +class ArticleContent(ContentBase): + def __init__(self, news: TNews): + super().__init__(news) + + def get_content(self): + content_ = '' + try: + content_ = self.session.s_ele('.^news-text').text + except ElementNotFoundError: + content_ = 'not found element' + return content_ + + +def get_content(information_source: TInformationSource) -> list: + article_content = ArticleContent(information_source) + result = article_content.get_content() + article_content.finish() + return result + + +def content_task(news: TNews): + logger.info(f'{news.title} news_task start execute at {datetime.datetime.now()}', ) + ofweek_com_ai = ArticleContent(news) + 
ofweek_com_ai.do_seek_task() + ofweek_com_ai.finish() + logger.info(f'{news.title} news_task end execute at {datetime.datetime.now()}') + + +if __name__ == '__main__': + logger.info('This module is not for direct call!') + news_ = TNews() + news_.is_static = True + news_.url = 'https://sz.news.fang.com/open/51863596.html' + content = get_content(news_) + logger.info(content) + logger.info('Done.') diff --git a/seek/fang_com/house.py b/seek/fang_com/house.py new file mode 100644 index 0000000..3f50cc1 --- /dev/null +++ b/seek/fang_com/house.py @@ -0,0 +1,64 @@ +import datetime + +from DrissionPage.errors import ElementNotFoundError + +from database.tinformationsource.model import TInformationSource +from database.tnews.model import TNews +from log.log_manager import logger +from seek.seek_base import SeekBase +from utils.time_utils import process_time + + +class House(SeekBase): + def __init__(self, information_source: TInformationSource): + super().__init__(information_source) + + def get_news(self): + news_result = [] + _news_list = self.session.s_ele('.news-list').s_eles('tag:li') + for _news in _news_list: + try: + rs_news = TNews() + tmp = _news.s_ele('.txt') + rs_news.title = tmp.s_ele('tag:a').text + rs_news.url = tmp.s_ele('tag:a').link + rs_news.summary = tmp.s_ele('tag:p').text + rs_news.occurrence_date = process_time(tmp.s_eles('tag:span')[1].text) + rs_news.source = self.information_source.title + news_result.append(rs_news) + except ElementNotFoundError as e: + if _news.s_ele('.item'): + # 此为视频内容,跳过 + continue + logger.error(f"ElementNotFoundError: {e} - Failed to find element in news item.") + except Exception as e: + logger.error(f'Unexpected error occurred: {e}') + return news_result + + +def get_news(information_source: TInformationSource) -> list: + instance = House(information_source) + news_list = instance.get_news() + instance.finish() + return news_list + + +def news_task(information_source: TInformationSource): + 
logger.info(f'{information_source.title} news_task start execute at {datetime.datetime.now()}', ) + instance = House(information_source) + instance.do_seek_task() + instance.finish() + logger.info(f'{information_source.title} news_task end execute at {datetime.datetime.now()}') + + +if __name__ == '__main__': + logger.info('This module is not for direct call!') + information_source_ = TInformationSource() + information_source_.is_static = True + information_source_.url = 'https://sz.news.fang.com/' + information_source_.title = '房产_房天下' + news_task(information_source_) + # news_list_ = get_news(information_source_) + # for news in news_list_: + # print(news) + logger.info('Done.') diff --git a/seek/focus_cn/__init__.py b/seek/focus_cn/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/seek/focus_cn/__pycache__/__init__.cpython-312.pyc b/seek/focus_cn/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..158038b Binary files /dev/null and b/seek/focus_cn/__pycache__/__init__.cpython-312.pyc differ diff --git a/seek/focus_cn/__pycache__/content.cpython-312.pyc b/seek/focus_cn/__pycache__/content.cpython-312.pyc new file mode 100644 index 0000000..1e7e688 Binary files /dev/null and b/seek/focus_cn/__pycache__/content.cpython-312.pyc differ diff --git a/seek/focus_cn/__pycache__/house.cpython-312.pyc b/seek/focus_cn/__pycache__/house.cpython-312.pyc new file mode 100644 index 0000000..9fc7899 Binary files /dev/null and b/seek/focus_cn/__pycache__/house.cpython-312.pyc differ diff --git a/seek/focus_cn/content.py b/seek/focus_cn/content.py new file mode 100644 index 0000000..a96c1cc --- /dev/null +++ b/seek/focus_cn/content.py @@ -0,0 +1,46 @@ +import datetime + +from DrissionPage.errors import ElementNotFoundError + +from database.tinformationsource.model import TInformationSource +from database.tnews.model import TNews +from log.log_manager import logger +from seek.content_base import ContentBase + + +class 
ArticleContent(ContentBase): + def __init__(self, news: TNews): + news.is_static = True + super().__init__(news) + + def get_content(self): + try: + content_ = self.session.s_ele('.article').text + except ElementNotFoundError: + content_ = 'not found element' + return content_ + + +def get_content(information_source: TInformationSource) -> list: + article_content = ArticleContent(information_source) + result = article_content.get_content() + article_content.finish() + return result + + +def content_task(news: TNews): + logger.info(f'{news.title} news_task start execute at {datetime.datetime.now()}', ) + ofweek_com_ai = ArticleContent(news) + ofweek_com_ai.do_seek_task() + ofweek_com_ai.finish() + logger.info(f'{news.title} news_task end execute at {datetime.datetime.now()}') + + +if __name__ == '__main__': + logger.info('This module is not for direct call!') + news_ = TNews() + news_.is_static = True + news_.url = 'https://www.focus.cn/a/842171870_124752' + content = get_content(news_) + logger.info(content) + logger.info('Done.') diff --git a/seek/focus_cn/house.py b/seek/focus_cn/house.py new file mode 100644 index 0000000..92c9032 --- /dev/null +++ b/seek/focus_cn/house.py @@ -0,0 +1,62 @@ +import datetime + +from DrissionPage.errors import ElementNotFoundError + +from database.tinformationsource.model import TInformationSource +from database.tnews.model import TNews +from log.log_manager import logger +from seek.seek_base import SeekBase +from utils.time_utils import process_time + + +class House(SeekBase): + def __init__(self, information_source: TInformationSource): + super().__init__(information_source) + + def get_news(self): + news_result = [] + self.tab.wait.ele_displayed('.FeedList') + _news_list = self.tab.s_ele('.cbd-recommend').s_eles('.FeedList') + for _news in _news_list: + try: + rs_news = TNews() + rs_news.title = _news.s_ele('.item-text-content-title').text + link = _news.s_ele('tag:a').link + rs_news.url = link.split('?')[0] + rs_news.summary = 
_news.s_ele('.item-text-content-description').text + rs_news.occurrence_date = process_time(_news.s_ele('.extra-info-item').text) + rs_news.source = self.information_source.title + news_result.append(rs_news) + except ElementNotFoundError as e: + logger.error(f"ElementNotFoundError: {e} - Failed to find element in news item.") + except Exception as e: + logger.error(f'Unexpected error occurred: {e}') + return news_result + + +def get_news(information_source: TInformationSource) -> list: + instance = House(information_source) + news_list = instance.get_news() + instance.finish() + return news_list + + +def news_task(information_source: TInformationSource): + logger.info(f'{information_source.title} news_task start execute at {datetime.datetime.now()}', ) + instance = House(information_source) + instance.do_seek_task() + instance.finish() + logger.info(f'{information_source.title} news_task end execute at {datetime.datetime.now()}') + + +if __name__ == '__main__': + logger.info('This module is not for direct call!') + information_source_ = TInformationSource() + information_source_.is_static = False + information_source_.url = 'https://sz.focus.cn/zixun/' + information_source_.title = '房产_搜狐焦点' + news_task(information_source_) + # news_list_ = get_news(information_source_) + # for news in news_list_: + # print(news) + logger.info('Done.') diff --git a/seek/leju_com/__init__.py b/seek/leju_com/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/seek/leju_com/__pycache__/__init__.cpython-312.pyc b/seek/leju_com/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..1c197b5 Binary files /dev/null and b/seek/leju_com/__pycache__/__init__.cpython-312.pyc differ diff --git a/seek/leju_com/__pycache__/content.cpython-312.pyc b/seek/leju_com/__pycache__/content.cpython-312.pyc new file mode 100644 index 0000000..4544717 Binary files /dev/null and b/seek/leju_com/__pycache__/content.cpython-312.pyc differ diff --git 
a/seek/leju_com/__pycache__/house.cpython-312.pyc b/seek/leju_com/__pycache__/house.cpython-312.pyc new file mode 100644 index 0000000..8cf4b99 Binary files /dev/null and b/seek/leju_com/__pycache__/house.cpython-312.pyc differ diff --git a/seek/leju_com/content.py b/seek/leju_com/content.py new file mode 100644 index 0000000..bef1e90 --- /dev/null +++ b/seek/leju_com/content.py @@ -0,0 +1,45 @@ +import datetime + +from DrissionPage.errors import ElementNotFoundError + +from database.tinformationsource.model import TInformationSource +from database.tnews.model import TNews +from log.log_manager import logger +from seek.content_base import ContentBase + + +class ArticleContent(ContentBase): + def __init__(self, news: TNews): + super().__init__(news) + + def get_content(self): + try: + content_ = self.session.s_ele('.^sf_news_contend').text + except ElementNotFoundError: + content_ = 'not found element' + return content_ + + +def get_content(information_source: TInformationSource) -> list: + article_content = ArticleContent(information_source) + result = article_content.get_content() + article_content.finish() + return result + + +def content_task(news: TNews): + logger.info(f'{news.title} news_task start execute at {datetime.datetime.now()}', ) + ofweek_com_ai = ArticleContent(news) + ofweek_com_ai.do_seek_task() + ofweek_com_ai.finish() + logger.info(f'{news.title} news_task end execute at {datetime.datetime.now()}') + + +if __name__ == '__main__': + logger.info('This module is not for direct call!') + news_ = TNews() + news_.is_static = True + news_.url = 'https://sz.leju.com/news/2024-12-18/18427272536617796292963.shtml' + content = get_content(news_) + logger.info(content) + logger.info('Done.') diff --git a/seek/leju_com/house.py b/seek/leju_com/house.py new file mode 100644 index 0000000..1f9fc90 --- /dev/null +++ b/seek/leju_com/house.py @@ -0,0 +1,60 @@ +import datetime + +from DrissionPage.errors import ElementNotFoundError + +from 
database.tinformationsource.model import TInformationSource +from database.tnews.model import TNews +from log.log_manager import logger +from seek.seek_base import SeekBase +from utils.time_utils import process_time + + +class House(SeekBase): + def __init__(self, information_source: TInformationSource): + super().__init__(information_source) + + def get_news(self): + news_result = [] + _news_list = self.session.s_ele('.sf_listPage').s_eles('tag:li') + for _news in _news_list: + try: + rs_news = TNews() + rs_news.title = _news.s_ele('tag:a').text + rs_news.url = _news.s_ele('tag:a').link + rs_news.summary = _news.s_ele('tag:p').text + rs_news.occurrence_date = process_time(_news.s_ele('.tag').text) + rs_news.source = self.information_source.title + news_result.append(rs_news) + except ElementNotFoundError as e: + logger.error(f"ElementNotFoundError: {e} - Failed to find element in news item.") + except Exception as e: + logger.error(f'Unexpected error occurred: {e}') + return news_result + + +def get_news(information_source: TInformationSource) -> list: + instance = House(information_source) + news_list = instance.get_news() + instance.finish() + return news_list + + +def news_task(information_source: TInformationSource): + logger.info(f'{information_source.title} news_task start execute at {datetime.datetime.now()}', ) + instance = House(information_source) + instance.do_seek_task() + instance.finish() + logger.info(f'{information_source.title} news_task end execute at {datetime.datetime.now()}') + + +if __name__ == '__main__': + logger.info('This module is not for direct call!') + information_source_ = TInformationSource() + information_source_.is_static = True + information_source_.url = 'https://sz.leju.com/news/' + information_source_.title = '房产_新浪乐居' + # news_task(information_source_) + news_list_ = get_news(information_source_) + for news in news_list_: + print(news) + logger.info('Done.') diff --git a/seek/mittr_com/__init__.py b/seek/mittr_com/__init__.py 
new file mode 100644 index 0000000..e69de29 diff --git a/seek/mittr_com/__pycache__/__init__.cpython-312.pyc b/seek/mittr_com/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..1bb918b Binary files /dev/null and b/seek/mittr_com/__pycache__/__init__.cpython-312.pyc differ diff --git a/seek/mittr_com/__pycache__/content.cpython-312.pyc b/seek/mittr_com/__pycache__/content.cpython-312.pyc new file mode 100644 index 0000000..d7fd2a7 Binary files /dev/null and b/seek/mittr_com/__pycache__/content.cpython-312.pyc differ diff --git a/seek/mittr_com/__pycache__/mit_t_r.cpython-312.pyc b/seek/mittr_com/__pycache__/mit_t_r.cpython-312.pyc new file mode 100644 index 0000000..35a5f9b Binary files /dev/null and b/seek/mittr_com/__pycache__/mit_t_r.cpython-312.pyc differ diff --git a/seek/mittr_com/content.py b/seek/mittr_com/content.py new file mode 100644 index 0000000..5b3a4bd --- /dev/null +++ b/seek/mittr_com/content.py @@ -0,0 +1,45 @@ +import datetime + +from DrissionPage.errors import ElementNotFoundError + +from database.tinformationsource.model import TInformationSource +from database.tnews.model import TNews +from log.log_manager import logger +from seek.content_base import ContentBase + + +class ArticleContent(ContentBase): + def __init__(self, news: TNews): + super().__init__(news) + + def get_content(self): + try: + content_ = self.tab.s_ele('.content').text + except ElementNotFoundError: + content_ = 'not found element' + return content_ + + +def get_content(information_source: TInformationSource) -> list: + article_content = ArticleContent(information_source) + result = article_content.get_content() + article_content.finish() + return result + + +def content_task(news: TNews): + logger.info(f'{news.title} news_task start execute at {datetime.datetime.now()}', ) + ofweek_com_ai = ArticleContent(news) + ofweek_com_ai.do_seek_task() + ofweek_com_ai.finish() + logger.info(f'{news.title} news_task end execute at {datetime.datetime.now()}') + + 
+if __name__ == '__main__': + logger.info('This module is not for direct call!') + news_ = TNews() + news_.is_static = False + news_.url = 'https://www.mittrchina.com/news/detail/14218' + content = get_content(news_) + logger.info(content) + logger.info('Done.') diff --git a/seek/mittr_com/mit_t_r.py b/seek/mittr_com/mit_t_r.py new file mode 100644 index 0000000..50ce8fc --- /dev/null +++ b/seek/mittr_com/mit_t_r.py @@ -0,0 +1,63 @@ +import datetime + +from DrissionPage.errors import ElementNotFoundError + +from database.tinformationsource.model import TInformationSource +from database.tnews.model import TNews +from log.log_manager import logger +from seek.seek_base import SeekBase +from utils.time_utils import process_time + + +class MittrChinaCom(SeekBase): + def __init__(self, information_source: TInformationSource): + super().__init__(information_source) + + def get_news(self): + news_result = [] + self.tab.wait.ele_displayed('.last-item') + _news_list = self.tab.s_ele('.lastest-list').s_eles('.last-item') + + for _news in _news_list: + try: + tnews = TNews() + tnews.title = _news.s_ele('tag:a').text + tnews.url = _news.s_ele('tag:a').link + _time = _news.parent().s_ele('.time').text + tnews.occurrence_date = process_time(_time) + tnews.source = self.information_source.title + news_result.append(tnews) + except ElementNotFoundError as e: + logger.error(f"ElementNotFoundError: {e} - Failed to find element in news item.") + except Exception as e: + logger.error(f'Unexpected error occurred: {e}') + + return news_result + + +def get_news(information_source: TInformationSource) -> list: + mittr = MittrChinaCom(information_source) + news_list = mittr.get_news() + mittr.finish() + return news_list + + +def news_task(information_source: TInformationSource): + logger.info(f'{information_source.title} news_task start execute at {datetime.datetime.now()}', ) + mittr = MittrChinaCom(information_source) + mittr.do_seek_task() + mittr.finish() + 
class OfweekComAi(SeekBase):
    """Seeker for an ofweek.com channel list page, read through ``self.session``."""

    def __init__(self, information_source: TInformationSource):
        super().__init__(information_source)

    def _build_news(self, headline):
        """Convert one headline element into a ``TNews``.

        Exceptions raised by the element lookups propagate to the caller.
        """
        record = TNews()
        record.title = headline.s_ele('tag:a').text
        record.url = headline.s_ele('tag:a').link
        # The publication time is taken from the fifth <span> under the headline's parent.
        published = headline.parent().s_eles('tag:span')[4].text
        record.occurrence_date = process_time(published)
        record.source = self.information_source.title
        return record

    def get_news(self):
        """Return a list of ``TNews`` built from the headlines on the list page.

        Headlines that cannot be parsed are logged and skipped.
        """
        headlines = self.session.s_ele('.main-cont-left w640').s_eles('.^top-title')

        found = []
        for headline in headlines:
            try:
                found.append(self._build_news(headline))
            except ElementNotFoundError as e:
                logger.error(f"ElementNotFoundError: {e} - Failed to find element in news item.")
            except Exception as e:
                logger.error(f'Unexpected error occurred: {e}')

        return found
class SeekBase(ABC):
    """Base class for list-page seekers.

    On construction the source URL is opened either with a ``SessionPage``
    (when ``information_source.is_static`` is truthy) or in a new Chromium tab.
    Subclasses implement :meth:`get_news`; :meth:`do_seek_task` fills in
    defaults from the information source and stores the result.
    """

    # Fields copied from the information source when a news item leaves them unset.
    _INHERITED_FIELDS = (
        'primary_category',
        'secondary_category',
        'tertiary_category',
        'label',
        'lang',
    )

    def __init__(self, information_source: TInformationSource):
        """Open ``information_source.url`` with the access mode the source asks for."""
        self.information_source = information_source
        self.session = None
        self.browser = None
        self.tab = None
        if information_source.is_static:
            self.session = SessionPage()
            self.session.get(information_source.url)
        else:
            self.browser = Chromium()
            self.tab = self.browser.new_tab()
            self.tab.get(information_source.url)

    @abstractmethod
    def get_news(self):
        """Abstract method to fetch news from a specific source."""
        pass

    def do_seek_task(self):
        """Fetch news, inherit missing fields from the source, and store new URLs.

        Returns:
            The value returned by ``create_news_list_if_url_not_exists`` (its
            length is logged as the number of inserted items).
        """
        news_list = self.get_news()
        source = self.information_source
        for news in news_list:
            for field in self._INHERITED_FIELDS:
                if getattr(news, field) is None:
                    setattr(news, field, getattr(source, field))
        with get_session() as db:
            inserted_news = create_news_list_if_url_not_exists(db, news_list)
            log(f'Inserted {len(inserted_news)} {self.information_source.title} news items into the database.')
            return inserted_news

    def finish(self):
        """Close the tab and the session; safe to call more than once.

        NOTE: the browser itself is not quit here (the original code had the
        quit call disabled), only the tab opened by this instance is closed.
        """
        # Drop the references first so a second call cannot close them again.
        tab, session = self.tab, self.session
        self.tab = None
        self.session = None
        try:
            if tab:
                tab.close()
        finally:
            # Close the session even if closing the tab raised.
            if session:
                session.close()
class Base(SeekBase):
    """Shared seeker for thepaper.cn channel pages, read through ``self.session``."""

    def __init__(self, information_source: TInformationSource):
        super().__init__(information_source)

    def get_news(self):
        """Return a list of ``TNews`` built from the cards on the channel page.

        Cards that cannot be parsed are logged and skipped.
        """
        cards = self.session.s_ele('.index_cards__AdZtA').s_eles('.ant-col ant-col-6')

        parsed = []
        for card in cards:
            # Created before the try so the error message below can include the title.
            tnews = TNews()
            try:
                tnews.title = card.s_ele('tag:a').text
                tnews.url = card.s_ele('tag:a').link
                # The time is the second <span> inside the small-text element.
                stamp = card.s_ele('.small_text__dR01h').s_eles('tag:span')[1].text
                tnews.occurrence_date = process_time(stamp)
                tnews.source = self.information_source.title
            except ElementNotFoundError as e:
                logger.error(f"ElementNotFoundError {tnews.title}: {e} - Failed to find element in news item.")
            except Exception as e:
                logger.error(f'Unexpected error occurred: {e}')
            else:
                parsed.append(tnews)

        return parsed
class ThePaperContent(ContentBase):
    """Content reader for a thepaper.cn article page, read through ``self.session``."""

    def __init__(self, news: TNews):
        super().__init__(news)

    def get_content(self):
        """Return the article text, or ``'not found element'`` if no selector matches.

        The article body selector is tried first; the second selector is the
        fallback used for video pages.
        """
        for selector in ('.^index_cententWrap', '.^header_desc'):
            try:
                return self.session.s_ele(selector).text
            except ElementNotFoundError:
                continue
        return 'not found element'
def news_task(information_source: TInformationSource):
    """Run the seek-and-store task for the ``International`` seeker.

    Logs the start and end time around ``do_seek_task()``. The seeker is
    always finished (tab/session closed), even when the task raises; in that
    case the exception propagates and the end message is not logged.
    """
    logger.info(f'{information_source.title} news_task start execute at {datetime.datetime.now()}', )
    instance = International(information_source)
    try:
        instance.do_seek_task()
    finally:
        # Release the session/tab even if seeking or storing failed.
        instance.finish()
    logger.info(f'{information_source.title} news_task end execute at {datetime.datetime.now()}')
class ArticleContent(ContentBase):
    """Content reader for a news.cn / xinhuanet article page, read through ``self.session``."""

    def __init__(self, news: TNews):
        super().__init__(news)

    def get_content(self):
        """Return the text of ``#detailContent``, or ``'not found element'``."""
        try:
            return self.session.s_ele('#detailContent').text
        except ElementNotFoundError:
            return 'not found element'

    def get_occurrence_date(self):
        """Return the header date as ``'<year>/<day> <time>'``, or ``None`` if absent.

        The assembled string is also printed to stdout.
        """
        try:
            header = self.session.s_ele('.header-time left')
            year_text = header.s_ele('.year').text   # e.g. 2023
            day_text = header.s_ele('.day').text     # e.g. 12/27
            clock_text = header.s_ele('.time').text  # e.g. 08:05:11
        except ElementNotFoundError:
            return None
        stamp = f'{year_text}/{day_text} {clock_text}'
        print(stamp)
        return stamp
def get_news(information_source: TInformationSource) -> list:
    """Fetch the news list for ``information_source`` with the ``Information`` seeker.

    The seeker is always finished (its session/tab closed), even if fetching
    raises; the exception then propagates to the caller.

    Returns:
        list: whatever ``Information.get_news()`` returns.
    """
    instance = Information(information_source)
    try:
        return instance.get_news()
    finally:
        instance.finish()
a/seek/zhihu_com/__pycache__/zhihu.cpython-312.pyc b/seek/zhihu_com/__pycache__/zhihu.cpython-312.pyc new file mode 100644 index 0000000..3b8e6f2 Binary files /dev/null and b/seek/zhihu_com/__pycache__/zhihu.cpython-312.pyc differ diff --git a/seek/zhihu_com/__pycache__/zhihu_hot.cpython-312.pyc b/seek/zhihu_com/__pycache__/zhihu_hot.cpython-312.pyc new file mode 100644 index 0000000..a2fe52f Binary files /dev/null and b/seek/zhihu_com/__pycache__/zhihu_hot.cpython-312.pyc differ diff --git a/seek/zhihu_com/demo.py b/seek/zhihu_com/demo.py new file mode 100644 index 0000000..d178d8b --- /dev/null +++ b/seek/zhihu_com/demo.py @@ -0,0 +1,25 @@ +from database.database import get_session +from database.thotcontent.crud import get_hot_content_by_topic_id +from database.thottopic.crud import get_latest_hot_topic + +if __name__ == '__main__': + with get_session() as db: + # 1. 获取最新的热点话题 + latest_hot_topic = get_latest_hot_topic(db) + topic = latest_hot_topic.topic + print(latest_hot_topic) + # 2. 
def get_content_from_meta(metas, itemprop):
    """Return the ``content`` attribute of the last meta tag whose ``itemprop`` matches.

    Args:
        metas: iterable of elements exposing ``attr(name)``.
        itemprop: the ``itemprop`` value to look for.

    Returns:
        The ``content`` attribute of the last matching element, or ``None``
        when no element matches.
    """
    values = [tag.attr('content') for tag in metas if tag.attr('itemprop') == itemprop]
    return values[-1] if values else None
get_content_from_meta(metas, 'name') + answer_count = get_content_from_meta(metas, 'answerCount') + comment_count = get_content_from_meta(metas, 'commentCount') + keywords = get_content_from_meta(metas, 'keywords') + date_created = get_content_from_meta(metas, 'dateCreated') + date_modified = get_content_from_meta(metas, 'dateModified') + follower_count = get_content_from_meta(metas, 'zhihu:followerCount') + # print(date_created, date_modified, answer_count, comment_count, keywords) + topic_description = "" + try: + topic_description = question_page.ele('.RichText ztext css-ob6uua').text + except ElementNotFoundError as e: + logger.error(f"元素缺失:不存在topic_description") + + # 获取所有内容条目 + content_items = self.tab.ele('.Question-mainColumn').eles('.List-item') + + total_characters = 0 + for item in content_items: + try: + content = item.ele('.RichContent-inner').text + # 计算content的字数 + content_len = len(content) + print(content_len) + if content_len > 1000 or content_len < 100: + logger.error(f"skip本条内容,内容长度:{content_len}") + continue + if total_characters > 5000: + logger.error(f"contents_result长度超过5000,跳出循环") + break + total_characters += content_len + contents_result.append(content) + # 打印contents_result的长度 + logger.info(f"contents_result长度:{len(contents_result)}") + except ElementNotFoundError as e: + logger.error(f"元素缺失:{str(e)}") + except ValueError as e: + logger.error(f"热度值转换失败:{str(e)}") + + except ElementNotFoundError as e: + logger.error(f"热榜容器元素未找到:{str(e)}") + except Exception as e: + logger.error(f"获取热榜数据异常:{str(e)}") + finally: + if self.tab: + self.tab.close() + # 返回json格式的数据 + return json.dumps({ + 'title': title, + 'answer_count': answer_count, + 'comment_count': comment_count, + 'topic_description': topic_description, + 'keywords': keywords, + 'date_created': date_created, + 'date_modified': date_modified, + 'follower_count': follower_count, + 'contents': contents_result + }, ensure_ascii=False) + + +if __name__ == '__main__': + # 测试用例 + 
class ZhihuHot(SeekBase):
    """Seeker that reads the Zhihu hot list through the browser tab opened by ``SeekBase``."""

    def get_news(self):
        """Fetch the Zhihu hot list.

        Returns:
            list: one ``TNews`` per hot-list entry that could be parsed. The
            list is empty (and the error logged) when the page or the list
            container cannot be read.
        """
        news_result = []
        try:
            # Open the hot-list page.
            self.tab.get('https://www.zhihu.com/hot')

            # Wait until hot-list entries are displayed.
            self.tab.wait.ele_displayed('.HotItem')

            # All hot-list entries.
            hot_items = self.tab.ele('.HotList-list').eles('.HotItem')

            for item in hot_items:
                try:
                    link = item.ele('tag:a')
                    # Keep the headline verbatim: str.title() would re-case it
                    # (e.g. "iPhone" -> "Iphone"), and a missing attribute must
                    # only skip this entry, not abort the whole list.
                    title = link.attr('title')
                    if not title:
                        logger.error("元素缺失:title")
                        continue

                    news = TNews()
                    news.title = title
                    news.url = link.link

                    # Heat label of the entry; only the "<digits> 万" part is kept.
                    heat_value = item('.HotItem-metrics HotItem-metrics--bottom').text
                    logger.info(f"热度值:{heat_value}")
                    match = re.search(r'(\d+\s*万)', heat_value)

                    if match:
                        # Strip every kind of whitespace the pattern's \s* may have matched.
                        news.heat = re.sub(r'\s+', '', match.group(1))
                        logger.info(f"提取到的热度值:{news.heat}")
                    else:
                        logger.error(f"无法提取热度值:{heat_value}")

                    news.source = self.information_source.title
                    news.occurrence_date = datetime.datetime.now()
                    news_result.append(news)
                except ElementNotFoundError as e:
                    logger.error(f"元素缺失:{str(e)}")
                except ValueError as e:
                    logger.error(f"热度值转换失败:{str(e)}")

        except ElementNotFoundError as e:
            logger.error(f"热榜容器元素未找到:{str(e)}")
        except Exception as e:
            logger.error(f"获取热榜数据异常:{str(e)}")

        return news_result
class Zhihu:
    """Collector for Zhihu recommended questions and their answers, driven through Chromium."""

    def __init__(self):
        self.browser = Chromium()
        self.tab = None

    @staticmethod
    def _int_or_none(raw):
        """Return ``int(raw)``, or ``None`` when ``raw`` is missing or not numeric."""
        try:
            return int(raw)
        except (TypeError, ValueError):
            return None

    @staticmethod
    def _parse_upvote_count(text):
        """Parse an upvote label (e.g. ``'赞同 1.2 万'``) into an integer; 0 if it has no number."""
        match = re.search(r'(\d+\.?\d*)\s*万?', text)
        if not match:
            return 0
        number = float(match.group(1))
        if '万' in text:
            # round(), not int(): 2.01 * 10000 evaluates to 20099.999999999996.
            return round(number * 10000)
        return int(number)

    @staticmethod
    def _parse_comment_count(text):
        """Parse a comment label (e.g. ``'1,234 条评论'``) into an integer; 0 if it has no number."""
        # Accept digits with or without thousands separators; the previous
        # pattern stopped after three digits when no comma was present.
        match = re.search(r'(\d[\d,]*)', text)
        return int(match.group(1).replace(',', '')) if match else 0

    def _close_tab(self):
        """Close the current tab, if any, and forget it so it is never closed twice."""
        tab, self.tab = self.tab, None
        if tab:
            tab.close()

    def get_topics(self):
        """Collect question topics from the Zhihu home feed.

        Returns:
            list: ``THotTopic`` objects (source, topic text, canonical question
            URL). Feed items whose link is not a question URL are skipped.
        """
        topics_result = []
        question_url = re.compile(r'^https://www\.zhihu\.com/question/\d+')
        try:
            self.tab = self.browser.new_tab()
            # Open the Zhihu home page.
            self.tab.get('https://www.zhihu.com')

            # Wait until feed cards are displayed.
            self.tab.wait.ele_displayed('.Card TopstoryItem TopstoryItem-isRecommend')

            # All feed cards.
            hot_items = self.tab.ele('.Topstory-content').eles('.Card TopstoryItem TopstoryItem-isRecommend')

            for item in hot_items:
                try:
                    topic = THotTopic()
                    topic.source = '知乎'
                    # Title and link of the card.
                    topic.topic = item.ele('tag:h2').ele('tag:a').text
                    topic.url = item.ele('tag:h2').ele('tag:a').link
                    # Keep only the canonical question URL; skip anything else.
                    match = question_url.match(topic.url or '')
                    if not match:
                        continue
                    topic.url = match.group(0)
                    topics_result.append(topic)
                except ElementNotFoundError as e:
                    logger.error(f"元素缺失:{str(e)}")
                except ValueError as e:
                    logger.error(f"热度值转换失败:{str(e)}")

        except ElementNotFoundError as e:
            logger.error(f"热榜容器元素未找到:{str(e)}")
        except Exception as e:
            logger.error(f"获取热榜数据异常:{str(e)}")
        finally:
            self._close_tab()
        return topics_result

    def get_content(self, topic: 'THotTopic', db):
        """Load a question page, update ``topic`` with its metadata, and collect answers.

        Args:
            topic: the topic whose ``url`` is opened; its count, keyword, date
                and description fields are updated and persisted with
                ``update_hot_topic``.
            db: database session passed through to ``update_hot_topic``.

        Returns:
            list: ``THotContent`` objects, one per answer that could be parsed.
        """
        contents_result = []
        try:
            self.tab = self.browser.new_tab()
            # Open the question page.
            self.tab.get(topic.url)

            for _ in range(10):
                # Wait for answers, then scroll down and slightly back up to trigger more loading.
                self.tab.wait.ele_displayed('.List-item')
                self.tab.wait(3)
                self.tab.scroll.to_bottom()
                self.tab.wait(1)
                self.tab.scroll.up(100)

            # Question metadata lives in the first nine <meta> tags of .QuestionPage.
            question_page = self.tab.ele('.QuestionPage')
            metas = question_page.eles('tag:meta')[0:9]
            answer_count = get_content_from_meta(metas, 'answerCount')
            comment_count = get_content_from_meta(metas, 'commentCount')
            keywords = get_content_from_meta(metas, 'keywords')
            date_created = get_content_from_meta(metas, 'dateCreated')
            date_modified = get_content_from_meta(metas, 'dateModified')
            follower_count = get_content_from_meta(metas, 'zhihu:followerCount')

            # A missing or malformed counter leaves the field untouched instead
            # of aborting the whole fetch.
            for field, raw in (
                ('content_count', answer_count),
                ('comment_count', comment_count),
                ('follower_count', follower_count),
            ):
                value = self._int_or_none(raw)
                if value is not None:
                    setattr(topic, field, value)
            topic.keywords = keywords
            topic.date_created = date_created
            topic.date_modified = date_modified
            try:
                topic.topic_description = question_page.ele('.RichText ztext css-ob6uua').text
            except ElementNotFoundError as e:
                logger.error("元素缺失:不存在topic_description")
            update_hot_topic(db, topic)

            # All answer entries.
            content_items = self.tab.ele('.Question-mainColumn').eles('.List-item')

            for item in content_items:
                try:
                    content = THotContent()
                    content.topic_id = topic.id
                    content.url = item.ele('.ContentItem-time').ele('tag:a').link
                    upvote_str = item.ele('.Button VoteButton VoteButton--up FEfUrdfMIKpQDJDqkjte').text
                    content.content_upvote_count = self._parse_upvote_count(upvote_str)
                    comment_str = item.ele('.Button ContentItem-action FEfUrdfMIKpQDJDqkjte Button--plain Button--withIcon Button--withLabel fEPKGkUK5jyc4fUuT0QP B46v1Ak6Gj5sL2JTS4PY RuuQ6TOh2cRzJr6WlyQp').text
                    content.content_comment_count = self._parse_comment_count(comment_str)
                    content.content = item.ele('.RichContent-inner').text
                    contents_result.append(content)
                except ElementNotFoundError as e:
                    logger.error(f"元素缺失:{str(e)}")
                except ValueError as e:
                    logger.error(f"热度值转换失败:{str(e)}")

        except ElementNotFoundError as e:
            logger.error(f"热榜容器元素未找到:{str(e)}")
        except Exception as e:
            logger.error(f"获取热榜数据异常:{str(e)}")
        finally:
            self._close_tab()
        return contents_result
class ZhihuHot:
    """Scrape the Zhihu hot list and individual question pages.

    All page access goes through one DrissionPage ``Chromium`` browser; every
    public method opens its own tab and closes it before returning.
    """

    def __init__(self):
        # The browser is created with DrissionPage's defaults.
        self.browser = Chromium()

    def get_topic_url_list(self) -> list:
        """Return the link of every entry currently on the Zhihu hot list.

        Errors are logged rather than raised; whatever URLs were collected
        before the error (possibly none) are returned.
        """
        _topic_url_list = []
        _tab = None
        try:
            _tab = self.browser.new_tab()
            _tab.get('https://www.zhihu.com/hot')

            # Block until at least one hot-list entry is rendered.
            _tab.wait.ele_displayed('.HotItem')

            hot_items = _tab.ele('.HotList-list').eles('.HotItem')

            for item in hot_items:
                try:
                    _topic_url_list.append(item.ele('tag:a').link)
                except ElementNotFoundError as e:
                    logger.error(f"元素缺失:{str(e)}")
                except ValueError as e:
                    logger.error(f"热度值转换失败:{str(e)}")

        except ElementNotFoundError as e:
            logger.error(f"热榜容器元素未找到:{str(e)}")
        except Exception as e:
            logger.error(f"获取热榜数据异常:{str(e)}")
        finally:
            if _tab:
                _tab.close()

        return _topic_url_list

    def get_content(self, url, *, scroll_rounds=10, min_answer_chars=100,
                    max_answer_chars=1000, max_total_chars=5000):
        """Collect a question's metadata and a selection of its answers.

        Args:
            url: Address of the question page to open.
            scroll_rounds: How many wait/scroll-to-bottom cycles to run so
                that more answers get loaded.
            min_answer_chars: Answers shorter than this are skipped.
            max_answer_chars: Answers longer than this are skipped.
            max_total_chars: Collection stops once the accumulated answer
                length has exceeded this value.

        Returns:
            A dict with the keys ``title``, ``topic_description``,
            ``keywords``, ``url``, ``contents`` (list of answer texts),
            ``date_created``, ``date_modified``, ``follower_count``,
            ``answer_count`` and ``comment_count``. Errors are logged, not
            raised; fields that were not reached stay ``None``
            (``topic_description`` stays ``""`` and ``contents`` holds what
            was gathered so far).
        """
        contents_result = []
        _tab = None
        # Results live in locals with safe defaults. They used to be module
        # globals, so an early failure either raised NameError on return or
        # leaked the previous question's metadata into this result.
        title = keywords = date_created = date_modified = None
        follower_count = comment_count = answer_count = None
        topic_description = ""
        try:
            _tab = self.browser.new_tab()
            _tab.get(url)

            for _ in range(scroll_rounds):
                # Wait for answers, then scroll to the bottom to trigger
                # loading of the next batch.
                _tab.wait.ele_displayed('.List-item')
                _tab.wait(3)
                _tab.scroll.to_bottom()
                _tab.wait(1)
                _tab.scroll.up(100)

            question_page = _tab.ele('.QuestionPage')
            # Question attributes are read from the first nine <meta> tags
            # under .QuestionPage.
            metas = question_page.eles('tag:meta')[0:9]
            title = get_content_from_meta(metas, 'name')
            answer_count = get_content_from_meta(metas, 'answerCount')
            comment_count = get_content_from_meta(metas, 'commentCount')
            keywords = get_content_from_meta(metas, 'keywords')
            date_created = get_content_from_meta(metas, 'dateCreated')
            date_modified = get_content_from_meta(metas, 'dateModified')
            follower_count = get_content_from_meta(metas, 'zhihu:followerCount')

            try:
                # Expand a collapsed description before reading its text.
                unfold_topic_description = question_page.ele('.^Button QuestionRichText-more')
                if unfold_topic_description:
                    unfold_topic_description.click()
                topic_description = question_page.ele('.^QuestionRichText').text
            except ElementNotFoundError:
                logger.error(f"元素缺失:不存在topic_description")

            content_items = _tab.ele('.Question-mainColumn').eles('.List-item')

            total_characters = 0
            for item in content_items:
                try:
                    content = item.ele('.RichContent-inner').text
                    content_len = len(content)
                    if content_len > max_answer_chars or content_len < min_answer_chars:
                        logger.error(f"skip本条内容,内容长度:{content_len}")
                        continue
                    if total_characters > max_total_chars:
                        logger.error(f"contents_result长度超过{max_total_chars},跳出循环")
                        break
                    total_characters += content_len
                    contents_result.append(content)
                    logger.info(f"contents_result长度:{len(contents_result)}")
                except ElementNotFoundError as e:
                    logger.error(f"元素缺失:{str(e)}")
                except ValueError as e:
                    logger.error(f"热度值转换失败:{str(e)}")

        except ElementNotFoundError as e:
            logger.error(f"热榜容器元素未找到:{str(e)}")
        except Exception as e:
            logger.error(f"获取热榜数据异常:{str(e)}")
        finally:
            if _tab:
                _tab.close()

        # Returned after (not inside) ``finally`` so an in-flight
        # KeyboardInterrupt/SystemExit is no longer swallowed.
        return {
            'title': title,
            'topic_description': topic_description,
            'keywords': keywords,
            'url': url,
            'contents': contents_result,
            'date_created': date_created,
            'date_modified': date_modified,
            'follower_count': follower_count,
            'answer_count': answer_count,
            'comment_count': comment_count
        }
def content_spider_task():
    """Run the content fetcher of the matching source for every pending news row.

    For each news item returned by ``get_news_need_content`` the active
    information sources whose ``title`` equals ``news.source`` are looked up.
    The sibling ``content`` module of each source's configured module is
    imported and its ``content_task(news)`` is called.

    A failure for one (news, source) pair, including a failed import or a
    missing ``content_task``, is logged and does not stop the remaining work.
    """
    with get_session() as db:
        # 1. Index the active sources by title once instead of rescanning
        #    the whole list for every news item.
        sources_by_title = {}
        for information_source in get_active_information_sources(db):
            sources_by_title.setdefault(information_source.title, []).append(information_source)

        # 2. Walk the news rows that still need their content fetched.
        for news in get_news_need_content(db):
            for information_source in sources_by_title.get(news.source, ()):
                if information_source.module is None or information_source.method is None:
                    logger.error(f"{information_source.title} module or method is None")
                    continue
                news.is_static = information_source.is_static
                try:
                    # Replace the last component of the module path with
                    # ``content``. The import and lookup are inside the try
                    # so that one misconfigured source cannot abort the run.
                    module_path = information_source.module.rsplit('.', 1)[0] + '.content'
                    module = importlib.import_module(module_path)
                    task_function = getattr(module, 'content_task')
                    task_function(news)
                except Exception as e:
                    logger.error(f"{information_source.title} task error: {e}")
def main_spider_task():
    """Invoke the configured spider function of every active information source.

    Each source names a ``module`` and a ``method``; the module is imported
    dynamically and the method is called with the source object itself.
    Sources lacking either field are logged and skipped. Any error raised
    while importing, resolving or running one source's function is logged
    and the loop moves on to the next source.
    """
    with get_session() as db:
        information_sources = get_active_information_sources(db)
        for information_source in information_sources:
            if information_source.module is None or information_source.method is None:
                logger.error(f"{information_source.title} module or method is None")
                continue
            try:
                # Import and attribute lookup are inside the try: a wrong
                # module path or method name for one source must not abort
                # the sources that follow it.
                module = importlib.import_module(information_source.module)
                task_function = getattr(module, information_source.method)
                task_function(information_source)
            except Exception as e:
                logger.error(f"{information_source.title} task error: {e}")
def execute_task(task: callable):
    """Run ``task`` once, logging when it starts and when it ends.

    Args:
        task: Zero-argument callable to execute.

    Raises:
        Whatever ``task`` raises. The end/elapsed log line is still written
        before the exception propagates.
    """
    # Callables such as functools.partial objects have no __name__.
    task_name = getattr(task, '__name__', repr(task))
    start_time = time.time()
    log_task_execution(task_name, start_time)  # start record
    try:
        task()
    finally:
        # Always record the end, so a failing task leaves a start/end pair.
        log_task_execution(task_name, start_time, time.time())
def process_time(time_str):
    """Convert a Chinese relative or absolute time string into a datetime.

    Recognised forms, checked in this order:

    * ``N分钟前`` / ``N小时前`` / ``N天前``: that long before now.
    * ``昨天`` / ``前天``, optionally followed by ``HH:mm``: one / two days
      before now, pinned to that clock time when one is given.
    * ``YYYY年M月D日`` with an optional `` H:MM`` clock.
    * ``YYYY/MM/DD HH:MM:SS``.
    * ``YYYY-MM-DD``.

    Anything that cannot be parsed is logged and the current time is
    returned instead.

    Returns:
        datetime.datetime: relative forms (and the fallback) are
        timezone-aware UTC values; absolute forms are naive values exactly
        as written in the string.
        NOTE(review): callers comparing the two kinds will hit a TypeError.
        Confirm which timezone the absolute strings are meant to be in
        before unifying them.
    """
    current_time = datetime.datetime.now(datetime.timezone.utc)

    def _relative(marker, unit):
        """Return now minus the number preceding ``marker``, or None."""
        amount = re.search(r'(\d+)\s*' + marker, time_str)
        if amount is None:
            return None
        return current_time - datetime.timedelta(**{unit: int(amount.group(1))})

    def _days_back(marker, days):
        """Return ``days`` before now, pinned to a trailing HH:mm if present."""
        moment = current_time - datetime.timedelta(days=days)
        clock = re.search(r'([01]?\d|2[0-3]):([0-5]\d)', time_str.split(marker)[-1])
        if clock is None:
            return moment
        return moment.replace(
            hour=int(clock.group(1)),
            minute=int(clock.group(2)),
            second=0,
            microsecond=0
        )

    def _absolute():
        """Parse the absolute date formats; return None when none fits."""
        try:
            if '年' in time_str and '月' in time_str and '日' in time_str:
                # Try the form with a clock first, then the date-only form.
                for pattern, fmt in (
                    (r"\d{4}年\d{1,2}月\d{1,2}日 \d{1,2}:\d{2}", "%Y年%m月%d日 %H:%M"),
                    (r"\d{4}年\d{1,2}月\d{1,2}日", "%Y年%m月%d日"),
                ):
                    found = re.search(pattern, time_str)
                    if found:
                        return datetime.datetime.strptime(found.group(0), fmt)
                return None
            if '/' in time_str:
                return datetime.datetime.strptime(time_str, "%Y/%m/%d %H:%M:%S")
            return datetime.datetime.strptime(time_str, '%Y-%m-%d')
        except ValueError:
            return None

    if '分钟前' in time_str:
        occurrence_time = _relative('分钟前', 'minutes')
    elif '小时前' in time_str:
        occurrence_time = _relative('小时前', 'hours')
    elif '昨天' in time_str:
        # One branch handles both bare "昨天" and "昨天HH:mm"; the clock form
        # used to sit in a second, unreachable '昨天' branch.
        occurrence_time = _days_back('昨天', 1)
    elif '天前' in time_str:
        occurrence_time = _relative('天前', 'days')
    elif '前天' in time_str:
        occurrence_time = _days_back('前天', 2)
    else:
        occurrence_time = _absolute()

    if occurrence_time is None:
        logger.error(f"Unable to parse date: {time_str}")
        occurrence_time = current_time

    return occurrence_time