From 8c1a740f0b41a3d0f86acf55d0fe2da15e9e7a2f Mon Sep 17 00:00:00 2001 From: konjacpotato Date: Wed, 12 Nov 2025 20:42:16 +0800 Subject: [PATCH] import peter --- Readme.md | 3 + config/__init__.py | 0 config/__pycache__/__init__.cpython-312.pyc | Bin 0 -> 129 bytes config/__pycache__/config.cpython-312.pyc | Bin 0 -> 191 bytes config/config.py | 4 + database/Readme.md | 5 + database/__init__.py | 0 database/__pycache__/__init__.cpython-312.pyc | Bin 0 -> 131 bytes database/__pycache__/database.cpython-312.pyc | Bin 0 -> 1545 bytes database/database.py | 37 ++++ .../__pycache__/crud.cpython-312.pyc | Bin 0 -> 4479 bytes .../__pycache__/model.cpython-312.pyc | Bin 0 -> 2016 bytes database/thotcontent/crud.py | 77 ++++++++ database/thotcontent/model.py | 23 +++ database/thottopic/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 141 bytes .../__pycache__/crud.cpython-312.pyc | Bin 0 -> 3596 bytes .../__pycache__/model.cpython-312.pyc | Bin 0 -> 3206 bytes database/thottopic/crud.py | 90 +++++++++ database/thottopic/model.py | 34 ++++ database/tinformationsource/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 150 bytes .../__pycache__/curd.cpython-312.pyc | Bin 0 -> 2406 bytes .../__pycache__/model.cpython-312.pyc | Bin 0 -> 3154 bytes database/tinformationsource/curd.py | 31 ++++ database/tinformationsource/model.py | 34 ++++ database/tnews/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 137 bytes .../tnews/__pycache__/crud.cpython-312.pyc | Bin 0 -> 4662 bytes .../tnews/__pycache__/model.cpython-312.pyc | Bin 0 -> 2223 bytes database/tnews/crud.py | 87 +++++++++ database/tnews/model.py | 25 +++ database/tscheduler/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 142 bytes .../__pycache__/crud.cpython-312.pyc | Bin 0 -> 2623 bytes .../__pycache__/model.cpython-312.pyc | Bin 0 -> 2756 bytes database/tscheduler/crud.py | 35 ++++ database/tscheduler/model.py | 26 +++ 
database/tvideoscript/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 144 bytes .../__pycache__/video_script.cpython-312.pyc | Bin 0 -> 3296 bytes database/tvideoscript/video_script.py | 47 +++++ log/__init__.py | 0 log/__pycache__/__init__.cpython-312.pyc | Bin 0 -> 126 bytes log/__pycache__/log_manager.cpython-312.pyc | Bin 0 -> 1829 bytes log/log_manager.py | 70 +++++++ log/log_prod.config | 22 +++ peter.py | 37 ++++ requirements.txt | Bin 0 -> 1164 bytes seek/163_com/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 135 bytes .../__pycache__/content.cpython-312.pyc | Bin 0 -> 2731 bytes .../163_com/__pycache__/house.cpython-312.pyc | Bin 0 -> 3529 bytes seek/163_com/content.py | 45 +++++ seek/163_com/house.py | 59 ++++++ seek/__init__.py | 0 seek/__pycache__/__init__.cpython-312.pyc | Bin 0 -> 127 bytes seek/__pycache__/content_base.cpython-312.pyc | Bin 0 -> 2921 bytes seek/__pycache__/seek_base.cpython-312.pyc | Bin 0 -> 3304 bytes seek/anjuke_com/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 138 bytes .../__pycache__/content.cpython-312.pyc | Bin 0 -> 2754 bytes .../__pycache__/house.cpython-312.pyc | Bin 0 -> 3988 bytes seek/anjuke_com/content.py | 46 +++++ seek/anjuke_com/house.py | 62 +++++++ seek/cnn_com/__init__.py | 0 seek/cnn_com/content.py | 58 ++++++ seek/cnn_com/edition.py | 62 +++++++ seek/content_base.py | 50 +++++ seek/fang_com/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 136 bytes .../__pycache__/content.cpython-312.pyc | Bin 0 -> 2735 bytes .../__pycache__/house.cpython-312.pyc | Bin 0 -> 3898 bytes seek/fang_com/content.py | 46 +++++ seek/fang_com/house.py | 64 +++++++ seek/focus_cn/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 136 bytes .../__pycache__/content.cpython-312.pyc | Bin 0 -> 2742 bytes .../__pycache__/house.cpython-312.pyc | Bin 0 -> 4015 bytes seek/focus_cn/content.py | 46 +++++ seek/focus_cn/house.py | 62 +++++++ 
seek/leju_com/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 136 bytes .../__pycache__/content.cpython-312.pyc | Bin 0 -> 2753 bytes .../__pycache__/house.cpython-312.pyc | Bin 0 -> 3821 bytes seek/leju_com/content.py | 45 +++++ seek/leju_com/house.py | 60 ++++++ seek/mittr_com/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 137 bytes .../__pycache__/content.cpython-312.pyc | Bin 0 -> 2717 bytes .../__pycache__/mit_t_r.cpython-312.pyc | Bin 0 -> 3858 bytes seek/mittr_com/content.py | 45 +++++ seek/mittr_com/mit_t_r.py | 63 +++++++ seek/ofweek_com/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 138 bytes .../ofweek_com/__pycache__/ai.cpython-312.pyc | Bin 0 -> 3783 bytes .../__pycache__/content.cpython-312.pyc | Bin 0 -> 2763 bytes seek/ofweek_com/ai.py | 62 +++++++ seek/ofweek_com/content.py | 46 +++++ seek/seek_base.py | 57 ++++++ seek/the_paper_com/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 141 bytes .../__pycache__/base.cpython-312.pyc | Bin 0 -> 2366 bytes .../__pycache__/content.cpython-312.pyc | Bin 0 -> 3007 bytes .../__pycache__/international.cpython-312.pyc | Bin 0 -> 2306 bytes .../__pycache__/tech.cpython-312.pyc | Bin 0 -> 2207 bytes seek/the_paper_com/base.py | 32 ++++ seek/the_paper_com/content.py | 50 +++++ seek/the_paper_com/international.py | 38 ++++ seek/the_paper_com/tech.py | 38 ++++ seek/xinhuanet_com/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 141 bytes .../__pycache__/content.cpython-312.pyc | Bin 0 -> 3569 bytes .../__pycache__/information.cpython-312.pyc | Bin 0 -> 3484 bytes seek/xinhuanet_com/content.py | 58 ++++++ seek/xinhuanet_com/information.py | 59 ++++++ seek/zhihu_com/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 137 bytes .../__pycache__/zhihu.cpython-312.pyc | Bin 0 -> 9863 bytes .../__pycache__/zhihu_hot.cpython-312.pyc | Bin 0 -> 7143 bytes seek/zhihu_com/demo.py | 25 +++ seek/zhihu_com/demo2.py | 
116 ++++++++++++ seek/zhihu_com/hot.py | 85 +++++++++ seek/zhihu_com/zhihu.py | 173 ++++++++++++++++++ seek/zhihu_com/zhihu_hot.py | 156 ++++++++++++++++ task/__init__.py | 0 task/__pycache__/__init__.cpython-312.pyc | Bin 0 -> 127 bytes task/__pycache__/manager_task.cpython-312.pyc | Bin 0 -> 4567 bytes task/content/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 135 bytes .../content_spider_task.cpython-312.pyc | Bin 0 -> 1908 bytes task/content/content_spider_task.py | 38 ++++ task/default/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 135 bytes .../main_spider_task.cpython-312.pyc | Bin 0 -> 1509 bytes task/default/main_spider_task.py | 26 +++ task/hot_topic/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 137 bytes .../__pycache__/zhihu.cpython-312.pyc | Bin 0 -> 1382 bytes task/hot_topic/zhihu.py | 36 ++++ task/manager_task.py | 112 ++++++++++++ utils/__init__.py | 0 utils/__pycache__/__init__.cpython-312.pyc | Bin 0 -> 128 bytes utils/__pycache__/time_utils.cpython-312.pyc | Bin 0 -> 3113 bytes utils/__pycache__/utils.cpython-312.pyc | Bin 0 -> 491 bytes utils/time_utils.py | 50 +++++ utils/utils.py | 6 + 147 files changed, 2763 insertions(+) create mode 100644 Readme.md create mode 100644 config/__init__.py create mode 100644 config/__pycache__/__init__.cpython-312.pyc create mode 100644 config/__pycache__/config.cpython-312.pyc create mode 100644 config/config.py create mode 100644 database/Readme.md create mode 100644 database/__init__.py create mode 100644 database/__pycache__/__init__.cpython-312.pyc create mode 100644 database/__pycache__/database.cpython-312.pyc create mode 100644 database/database.py create mode 100644 database/thotcontent/__pycache__/crud.cpython-312.pyc create mode 100644 database/thotcontent/__pycache__/model.cpython-312.pyc create mode 100644 database/thotcontent/crud.py create mode 100644 database/thotcontent/model.py create mode 100644 database/thottopic/__init__.py 
create mode 100644 database/thottopic/__pycache__/__init__.cpython-312.pyc create mode 100644 database/thottopic/__pycache__/crud.cpython-312.pyc create mode 100644 database/thottopic/__pycache__/model.cpython-312.pyc create mode 100644 database/thottopic/crud.py create mode 100644 database/thottopic/model.py create mode 100644 database/tinformationsource/__init__.py create mode 100644 database/tinformationsource/__pycache__/__init__.cpython-312.pyc create mode 100644 database/tinformationsource/__pycache__/curd.cpython-312.pyc create mode 100644 database/tinformationsource/__pycache__/model.cpython-312.pyc create mode 100644 database/tinformationsource/curd.py create mode 100644 database/tinformationsource/model.py create mode 100644 database/tnews/__init__.py create mode 100644 database/tnews/__pycache__/__init__.cpython-312.pyc create mode 100644 database/tnews/__pycache__/crud.cpython-312.pyc create mode 100644 database/tnews/__pycache__/model.cpython-312.pyc create mode 100644 database/tnews/crud.py create mode 100644 database/tnews/model.py create mode 100644 database/tscheduler/__init__.py create mode 100644 database/tscheduler/__pycache__/__init__.cpython-312.pyc create mode 100644 database/tscheduler/__pycache__/crud.cpython-312.pyc create mode 100644 database/tscheduler/__pycache__/model.cpython-312.pyc create mode 100644 database/tscheduler/crud.py create mode 100644 database/tscheduler/model.py create mode 100644 database/tvideoscript/__init__.py create mode 100644 database/tvideoscript/__pycache__/__init__.cpython-312.pyc create mode 100644 database/tvideoscript/__pycache__/video_script.cpython-312.pyc create mode 100644 database/tvideoscript/video_script.py create mode 100644 log/__init__.py create mode 100644 log/__pycache__/__init__.cpython-312.pyc create mode 100644 log/__pycache__/log_manager.cpython-312.pyc create mode 100644 log/log_manager.py create mode 100644 log/log_prod.config create mode 100644 peter.py create mode 100644 requirements.txt 
create mode 100644 seek/163_com/__init__.py create mode 100644 seek/163_com/__pycache__/__init__.cpython-312.pyc create mode 100644 seek/163_com/__pycache__/content.cpython-312.pyc create mode 100644 seek/163_com/__pycache__/house.cpython-312.pyc create mode 100644 seek/163_com/content.py create mode 100644 seek/163_com/house.py create mode 100644 seek/__init__.py create mode 100644 seek/__pycache__/__init__.cpython-312.pyc create mode 100644 seek/__pycache__/content_base.cpython-312.pyc create mode 100644 seek/__pycache__/seek_base.cpython-312.pyc create mode 100644 seek/anjuke_com/__init__.py create mode 100644 seek/anjuke_com/__pycache__/__init__.cpython-312.pyc create mode 100644 seek/anjuke_com/__pycache__/content.cpython-312.pyc create mode 100644 seek/anjuke_com/__pycache__/house.cpython-312.pyc create mode 100644 seek/anjuke_com/content.py create mode 100644 seek/anjuke_com/house.py create mode 100644 seek/cnn_com/__init__.py create mode 100644 seek/cnn_com/content.py create mode 100644 seek/cnn_com/edition.py create mode 100644 seek/content_base.py create mode 100644 seek/fang_com/__init__.py create mode 100644 seek/fang_com/__pycache__/__init__.cpython-312.pyc create mode 100644 seek/fang_com/__pycache__/content.cpython-312.pyc create mode 100644 seek/fang_com/__pycache__/house.cpython-312.pyc create mode 100644 seek/fang_com/content.py create mode 100644 seek/fang_com/house.py create mode 100644 seek/focus_cn/__init__.py create mode 100644 seek/focus_cn/__pycache__/__init__.cpython-312.pyc create mode 100644 seek/focus_cn/__pycache__/content.cpython-312.pyc create mode 100644 seek/focus_cn/__pycache__/house.cpython-312.pyc create mode 100644 seek/focus_cn/content.py create mode 100644 seek/focus_cn/house.py create mode 100644 seek/leju_com/__init__.py create mode 100644 seek/leju_com/__pycache__/__init__.cpython-312.pyc create mode 100644 seek/leju_com/__pycache__/content.cpython-312.pyc create mode 100644 seek/leju_com/__pycache__/house.cpython-312.pyc 
create mode 100644 seek/leju_com/content.py create mode 100644 seek/leju_com/house.py create mode 100644 seek/mittr_com/__init__.py create mode 100644 seek/mittr_com/__pycache__/__init__.cpython-312.pyc create mode 100644 seek/mittr_com/__pycache__/content.cpython-312.pyc create mode 100644 seek/mittr_com/__pycache__/mit_t_r.cpython-312.pyc create mode 100644 seek/mittr_com/content.py create mode 100644 seek/mittr_com/mit_t_r.py create mode 100644 seek/ofweek_com/__init__.py create mode 100644 seek/ofweek_com/__pycache__/__init__.cpython-312.pyc create mode 100644 seek/ofweek_com/__pycache__/ai.cpython-312.pyc create mode 100644 seek/ofweek_com/__pycache__/content.cpython-312.pyc create mode 100644 seek/ofweek_com/ai.py create mode 100644 seek/ofweek_com/content.py create mode 100644 seek/seek_base.py create mode 100644 seek/the_paper_com/__init__.py create mode 100644 seek/the_paper_com/__pycache__/__init__.cpython-312.pyc create mode 100644 seek/the_paper_com/__pycache__/base.cpython-312.pyc create mode 100644 seek/the_paper_com/__pycache__/content.cpython-312.pyc create mode 100644 seek/the_paper_com/__pycache__/international.cpython-312.pyc create mode 100644 seek/the_paper_com/__pycache__/tech.cpython-312.pyc create mode 100644 seek/the_paper_com/base.py create mode 100644 seek/the_paper_com/content.py create mode 100644 seek/the_paper_com/international.py create mode 100644 seek/the_paper_com/tech.py create mode 100644 seek/xinhuanet_com/__init__.py create mode 100644 seek/xinhuanet_com/__pycache__/__init__.cpython-312.pyc create mode 100644 seek/xinhuanet_com/__pycache__/content.cpython-312.pyc create mode 100644 seek/xinhuanet_com/__pycache__/information.cpython-312.pyc create mode 100644 seek/xinhuanet_com/content.py create mode 100644 seek/xinhuanet_com/information.py create mode 100644 seek/zhihu_com/__init__.py create mode 100644 seek/zhihu_com/__pycache__/__init__.cpython-312.pyc create mode 100644 seek/zhihu_com/__pycache__/zhihu.cpython-312.pyc 
create mode 100644 seek/zhihu_com/__pycache__/zhihu_hot.cpython-312.pyc create mode 100644 seek/zhihu_com/demo.py create mode 100644 seek/zhihu_com/demo2.py create mode 100644 seek/zhihu_com/hot.py create mode 100644 seek/zhihu_com/zhihu.py create mode 100644 seek/zhihu_com/zhihu_hot.py create mode 100644 task/__init__.py create mode 100644 task/__pycache__/__init__.cpython-312.pyc create mode 100644 task/__pycache__/manager_task.cpython-312.pyc create mode 100644 task/content/__init__.py create mode 100644 task/content/__pycache__/__init__.cpython-312.pyc create mode 100644 task/content/__pycache__/content_spider_task.cpython-312.pyc create mode 100644 task/content/content_spider_task.py create mode 100644 task/default/__init__.py create mode 100644 task/default/__pycache__/__init__.cpython-312.pyc create mode 100644 task/default/__pycache__/main_spider_task.cpython-312.pyc create mode 100644 task/default/main_spider_task.py create mode 100644 task/hot_topic/__init__.py create mode 100644 task/hot_topic/__pycache__/__init__.cpython-312.pyc create mode 100644 task/hot_topic/__pycache__/zhihu.cpython-312.pyc create mode 100644 task/hot_topic/zhihu.py create mode 100644 task/manager_task.py create mode 100644 utils/__init__.py create mode 100644 utils/__pycache__/__init__.cpython-312.pyc create mode 100644 utils/__pycache__/time_utils.cpython-312.pyc create mode 100644 utils/__pycache__/utils.cpython-312.pyc create mode 100644 utils/time_utils.py create mode 100644 utils/utils.py diff --git a/Readme.md b/Readme.md new file mode 100644 index 0000000..0a4df64 --- /dev/null +++ b/Readme.md @@ -0,0 +1,3 @@ +# Peter + +电影里面的蜘蛛侠叫Peter Parker. 
diff --git a/config/__init__.py b/config/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/config/__pycache__/__init__.cpython-312.pyc b/config/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bbef6004ce0fb1ef7cd62fec2ec9b81b021b1400 GIT binary patch literal 129 zcmX@j%ge<81g&@C(n0iN5P=Rpvj9b=GgLBYGWxA#C}INgK7-W!Qi!mMNzPA6jVVYi zNi700^U^ZYW8&j8^D;}~^MJV3^Dh7^VlIY~;;_lhPbtkwwJTx;YGDN8Vi4maGb1Bo5i^hl E0D&nSPyhe` literal 0 HcmV?d00001 diff --git a/database/__pycache__/database.cpython-312.pyc b/database/__pycache__/database.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..22615c98b096c8d2fa0bd18ab444d8eda1c551cf GIT binary patch literal 1545 zcmaJ>O>7%Q6rS1ju6OPA+G+FSCKOCkR4jxOrzmYrL`akhiJ~5WNMJ>>HoFtE$*k9z z-TXL8P^gel5GXea0&ZMTS`-Py6}49`tr}D_0uh8Nk(*0;s3-^C+TJwkp`)GI_uhOn z@6CJP%+HBL3_*K+YNtF9LFlhQ37ggy4t@mT3bK%eDROWT<3JXu;D|-BDT_4XNJYt! zi*i$s(5Ryn6^sN#*=hE$E zM}BjCN0hcm;@h|-Vnnc&{16@b(`%`0$9^!G#auVticeO2$EX-(!oZR+88LiPB9*dT zAz6WI9`QWetvJR!VLW9KlN!wM?Ilv0F+2iGWa^e-@#eWoQS}qI@ zhQ`9hbBkwY%R}y~FP+Wh^XGH<;d8nCNWL&KG&GbS^cEIv#t7XHGQZy$3{S&gD^ZY= zfjSxt5)9c0PeR-kV)`ZDc8I&^^W;1sRf9rQUdgs7;m3~};EZg^pgeWNXu69PKPz#y z>bkV#*{g(WjLWxA!tmSFLFg1> zXV)?8sy=Cr9}j2eLYjvL_*;S>6r$PI4x|4PeJzA1!RRtvf_3~pUdGF4MSK@6F*RUB|I~ zu3lI%Ni~3Sg}Ic@80I_+YQiPLn9I26`HV{@b%DA9kG9FpG4X=RWHSHuzf+htU5iXt z39x3`YR7OWmS@DFEp z7QP?ZdH!bC%e%^GT}eD_L;?J;5j))DKQ9l?FLzJ$;4eDI(03gm{G^VLiPx3k@q&20 zAO-T6I3b`L!m)`?@kXZzkW1MlkF}owYR@ncHsLY2zXmnmBTkLMBt1sjBAtQZ;wxmzjdKB_2wJwL;zPD;u}Fi!D;K2F(_^3?^|>L z0L1#6uu;dg7Abj!)Pu(S%>eLXX{U$yr)XX(Zm;VEq;e7f4 literal 0 HcmV?d00001 diff --git a/database/database.py b/database/database.py new file mode 100644 index 0000000..c80c549 --- /dev/null +++ b/database/database.py @@ -0,0 +1,37 @@ +from contextlib import contextmanager + +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker, declarative_base + +from log.log_manager import logger + 
+Base = declarative_base() + +DATABASE_URL = 'postgresql+psycopg://postgres:K8u3fg0o@47.119.128.161:60001/squirrel' +engine = create_engine( + DATABASE_URL, + pool_size=10, + max_overflow=20, + pool_timeout=30, + pool_recycle=1800, # 防止数据库端连接过期 + connect_args={ + 'connect_timeout': 15, + 'keepalives_idle': 60, + 'keepalives_interval': 10, + 'keepalives_count': 5 + } +) +Base.metadata.create_all(engine) + +@contextmanager +def get_session(): + session = sessionmaker(bind=engine)() + try: + yield session + session.commit() # 自动提交成功的事务 + except Exception as e: + session.rollback() # 异常时回滚 + logger.error(f"Database operation failed: {str(e)}") + raise # 重新抛出异常 + finally: + session.close() # 确保会话关闭 \ No newline at end of file diff --git a/database/thotcontent/__pycache__/crud.cpython-312.pyc b/database/thotcontent/__pycache__/crud.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fee1d836020d3db7ffa24bd3496744118ae73a49 GIT binary patch literal 4479 zcmdT{Z%i9U7N7C1|G*kdjEPg6KW^1FI2TM+SAhnK66J~#5Fvl6q>4b6<6WqYvANlG z1Q*kSL<&emp}IRIytKZ!>u1du1j;EoBs3FTNKu1IkOrj}S@ zWJDsG@Jd1nh6~GK6Xn$#2P1v*QRuFt6pCW~P05hLEBY#?9F)JJt>|<1-cVBsPniUK z7Tv-peVoRHL{YPZ6Jui$MY9rVm`L&~K9gn=FA^$5raSd|zdi8s;lXf1lm^EoMIwV@ zNC{mG$IL>zV19?_A^Els++Y zVz!SGy)(V@m)CgDy0A0N@5}=;DpM8@3rtydP{+fRr4-d`C@oq=A=_M03;j=dOW|s( zqb&#(W!98`L1%^$rc5alf#)nWmhd27nK4k5YNw(Zl9)Li6)$00CIQSztgz^_Y35%f zB{E5`K%|C zyMS+NZf@(X*Ei?yY`%VF^Zk#$`puoKxocnF{VjX>_1(qIKit^7@_ShR>gxNOfBx0x z!qxw*E`*E52MLs>$ILfF#~=j8%4P%<)?9|aC&w=(0M){YB*S$)AyY|uaM-$Hp|Oi% zsQZ^4)s#S-CAOzsAv@a0~RCR1OSMmKTRd*~DIA?DvN0UUFq(jx~qL2nyW#Lb##fg)4^tdeYHA`rGT#Ab&2w*NC zz)oP%c0}T`L=;IZB7kPqk562{Ak!cLKrkTT=|?*&c?v%B(CepR$f?cOFw@)-Ed}&KfA%(=lW;*)t!g#@zecl{EL7E4ZCibZtR#(O&{Ob)tu&= zzHIc&TQ@v=-tBm^H3KXj1Ng}Q4U3mw1i%Lv}AuwNDS z-xfZozg55LS6hx}gcDg|Kotfu!s#@BTE}gOj7a$ZSasw8oE%E1Sj^BJT`1F?r;6^3 zW0O|kK1{!O2D<7R%(ir?EnW9)h7S8xp+6%Gr1^n@4taSrGOn3p5vbq7)Z>gsY36~m zK>RQd+)jwq(qGC{Ok3a2`!q{pcvzMc(gr&u2ca{n5$aHEwY&gnsL-WMo(0Ai;CZM5 
zm-&p)niT@75V-SVMmU@mx>cb&BOFQdM+yqGCdZ+emEEpx0P#OedEZrKK5FRcT0KRcDoop)@n6 zI0I5@Dj-4Km|Lw~kyn5@^dwbQkfsFH1)_G2K zp3692p6>g%)w$mM_`9udw!YK0&^v!@sp(UD<5J(v#+%Z8`$52a&w&j{05uCWi-M5} zI@fIb3O5txJk=rp6WGa5#AoO&h}xl!!2zQr5L>% zQV=i#nWmD$HIn*Xv21V_Sj!CA2@EITk$a*0-i9m~YUyWdw!PF=V6>}3&Wvo0%uXi( zo@sHuB|_(OXwAm;)n9Ub^^d{zHdeSeIpEIjhbK4|iihCx&PpwIhh=vhjgP@aiB&Zk z7l|t@6*4quh@|miB&;ykB=lIf5Zgr|ncBMGKgpQXO^(4UT0zQJpv!R>Da(30iKn}_=j^WK zj-uDrwDC_3v|>aovC!6%V6OB-O`+sx#2+5qL7Rnur6eZ*loSg6)pzzjx^)iTd1szy zX5V?{editpgMI{U*36BjIfVYCgG%Ge&UzM{>qtg2qac+L7>4?+!m6ym+P+J1shq&s zKBu@=#)i3zffDo{8w^BprY@b(xYDfq%2s12-BH42p$zB|qc9p(7 zT@FB~P5UDLH9CqgmQR)pDXvJm4g)U?u#r$P^^c{DgqD<)2ygMdnqsO+%QKKP@F*r0 zf6^d{EuG#VlWJ;OL=~vA2&Nx402A8ILf;SK6ODAn7<>{#(h=X59|F7Jb-I( zumx7bE`k5CY8d5*f+@R%!X+=Ic+fB zcy>q0ciPHT$-KOACR|ma0XIVY4RklT@iG#c(oKmR=5Sn-@o)+om<-c*;WX6lumK5G-g#IBTy7G8hc{jhtXBOg1LS5ki?cnbU2=cVo$ zLJAXQp^LHuw5uDW);Gwm!h-{($W@>0X*@iCB=fI4iC*k(&(LX}riFfaW@5dIpi=kNdz__cx z#TLUaP42mKD8Kh`F?=N3KOQe|M;2Q)Pw~0%{fotxp=|GX^F;fV)&e)Q*xEkTmy6Bs zFSee@M#oQ0#IL+p;7%-d?8-IHZ_6K-iyb)6HBauCdhN!m1r9HE?#^wWKbSu;QtTYf zb1jqIQ@uBO3f$=8rmeT%$jQa_y$k&d2Osx8?0+x9;ct$@WF=@k4y5L-aMeqNE3_!+FGOSfQ4Xsb-C%c0{d|1rDd zgrwlOp?9S;-54bpAgjKTm0cBw(8s5wZwK|Nnp@s|rM%crb^$" \ No newline at end of file diff --git a/database/thottopic/__init__.py b/database/thottopic/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/database/thottopic/__pycache__/__init__.cpython-312.pyc b/database/thottopic/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..044710df0ced49e023e649af435305038432e0b2 GIT binary patch literal 141 zcmX@j%ge<81b<^Tq=V?kAOanHW&w&!XQ*V*Wb|9fP{ah}eFmxdr4wNllboND8dH#3 zl3Em#l30?Mlvtb^Q<9NiQj%YgnH&=zpP83g5+AQuPXQ$B$#x@`E)>%GlsGg!n9?98__ z-+nvu`+oC%-%o)+4TAQtePwLGhtNy%;xx_{^4CFFL=uviBpPRi7zRl$nHhCS?pb!o zWy>tcZi$mwnF)bDtnx6!NgkO8*$c8q5+$F^OEoaVO2RBRp*_nG z4WeJs9@_78-GQJ)r_2J=kCvEd#BI7qB+2CCsqt||Gd)-y#j<*J$z{5vE0~3^RkG98 
z*1yA-KOByyBzZV3Ycd{|MzoPDBdR>C!I|ihhU0irYEMs@VjRmOnj9+}76-rtwG{zj z8m+VZ&90fQxf5$_kjOnVJ+r+;I5%@{ZfcDUtqTpSY(pNDQ5jIg1_jCwP`nN(tpKe9 zSfUCTMXdx+{!`vEOtsl-Vc?tXgamM|vN;a8;mWu$JpIH`LZuc$WsX1zf=)$qEKyh1 zRV>15nFVkqCTECvP44qa8Bbw?k;#uLN$`~Ep2SI$8&$BXVGk@|FX_nkBW_G~N8N1J z%3mm|rc8`k1XYQR*49?+htX`YT}mAtQ$}N8Gd2NarXQ&|2uegzjYBt$HiFw1FWrMu$Wose+^yS z&c1*Su4;72lxQqmX9?}9kfwxGE2tfH0K3xYkMvPCf;7D1O143iIWJ?X#{ zqTDk1k#t&~kV-hlSsdF2DrTKBp~_g3C7WX=M?HueU{NKQ1K}LX?JA|5T7`2>pidt4 zCUg)~wlz*)c)4Z!9Jdi_`nK`w#`_T?bTAt_s)vp~?s?L;()V=W`QWob{aDlpUC4$8 z^w5A2x-?h&dtLK}zxGzweAn%cg}!xvQ`Wyn_wTva@xmX;)uH;vTmxYx4N`d?)$g9; z*1a{iYUXQhcf9b1a&DLc;`*Cb*&P-y{e{REQ=~F>76RBD?D^dWus01>j%T?F&zdep zqK=KYED!D^SX;@!3JP1tWUUzFV^^l&w0IY29U+?S(1BNkz^&u+$G_|`gqEz(rVDNN zmG6_^C6|->duI&cqpZ-Q3$WO`%Jy2|vFbG?ZE{Hk@==`JWTz3C-t1Wr-VVEB_)L(9 zSkW>r=e}(gG=Mw;96mtk9_oorZ8~l|4n_i`(f5zPC~?MeG}Rc{{|4F|t~m+YF}ymQ@m= z6OkM&`635RZwgB?23DEEWE!dsIi@M&vdK!a8n<}KP+qF!#CDo$Ab2K0S1lnX-VX{V z;8Bl3SBdIAUD)@?mkpoP!zZ)hGkW-p5k6}OA7_PLUFbE0zE!r*Lbgy=fRzFl|KEzu ziotG^8<+8zTnw_9l)`0qQ?xNFL?Zk?sGKBf#Tu-8rh9I1jorRU4fgM&v-a`xKZEct zNtI#VG=$N5YxirCXW=r_T59uJtB~eAMIu^ht_2QDS!eQ+oFo-6siK_gO+uqpEE8fB zQkZC|7GoqP_xc<>YAWQcpS;(lG+|LDXIolBiRcIpQ{F~shy7}dq7 zA$~gD^NU9$S;sf;yB%5c?zH6v!77!ho1ze@2}xf`&_(=aqf?`|7i#qOaj4*vR{ADT z(8A4RlTi6#l1ojuGNI8JPGb?hI+Qtd=SztG$BO>mPOvqZluzLg;6ri=)f99&H^VT$ zqV^omGQoR2c?3gF^fSTbNFKqEYuU;)EwjsmD|LAUV(uWroM#^F%_B1Y;boZ9U-jqU d<@AGoGI#nt*Gx2*Y^if;XAjS>ZYiF;6d?m7L@HbK)FHj6mJtK(H61U zuucoKNJnU*+XcJGL>QtofYMq5uc4h zHk#_Ec-J+GcO%~&yXo~#!Fi-|+7l?-!lQuA@bH|V@8*I-;lQ%E*Ep~2xZ=Rzljziu#-$#@jzIZqDu4~~RDV@`4k ziWa<*S1JE5Xq-QyA~v3i(4)XZ#7 zhf@-kj+*(56YPMNW`vB!C_-`+v{DA7Pp!Qo(2a2fOnZda6 z)njIsR$1(F;JS19V2SHyR0s9%`af3dwyf4-;UrL8lW_8sC687MCxXFkIG0y zan*1|C6Op}jPE1UHt*fq`uReDHr(-~AaEB1Bv->~hJ!4fEC=0$6D+*~}GaexKx-Q5pZoVUE^sDNu^ z#(%uD`?|><=$&!)v_g$~XB<7PF}`=k)6;6Uc<)SAPiwJ?q#(UBbl~}bL&kk4*I}^- z3tT37KL~vBDM*-9Fd4011kN8Di%L8iOCS}=W56SK0j|X{74|&wJqC@0gkWOYaI>t8 z5;DuqRF~ol7QeN@YRW4hQq*7k?Tb(U;@JDg7JHT^Gog>qe|SDKw1QSo|Gxis{j1Ne 
z4Qk=YpTpdGnA1itXcuHnx%@ARQf+UWPp` zU6woX@E{glSR4Xzo05C5{uCCzqQDU=8Wd}m`ddv~YWVNInsj)nW+k+CRQH8aC+F;0 zCY1BnEjX8ISKPY)MC#1kSF_BCT-V{H$rW{NT<;pwnA-G}Mf+Rd&N5@UhL%NVre@_$ zy`na>1uF;Z#r2@1F^vls7nS!QU@3RxnatE0qlFXt zkxLrWv~Xp~{vPmg2@-$**$l6DJhwKmcKVM4e;D|5NaH58#G88K)zpjW`Ydxbcd+O7 zH&+?`@EI*UPP%t|Jv^QbPiU_tpfSH?8>By{sIDOz;}^Qu{TiIbwepVND7Lv)(i Qrto{)4xc_FpUBGp2CiGKQUCw| literal 0 HcmV?d00001 diff --git a/database/thottopic/crud.py b/database/thottopic/crud.py new file mode 100644 index 0000000..58f4759 --- /dev/null +++ b/database/thottopic/crud.py @@ -0,0 +1,90 @@ +from database.thottopic.model import THotTopic + + +def create_hot_topic(db, hot_topic: THotTopic): + db.add(hot_topic) + db.commit() + db.refresh(hot_topic) + return hot_topic + + +# 插入数据库之前判断数据库中是否已经存在,根据news.url 判断 +def create_topic_if_url_not_exists(db, hot_topic: THotTopic): + # 检查是否已经存在具有相同 URL 的记录 + existing_topic = db.query(THotTopic).filter(THotTopic.url == hot_topic.url).first() + + if existing_topic: + # 如果记录已存在,直接返回已有的记录 + return existing_topic + + # 如果记录不存在,插入新的记录 + db.add(hot_topic) + db.commit() + db.refresh(hot_topic) + return hot_topic + + +def create_topics_if_url_not_exists(db, topics: list[THotTopic]): + inserted_topics = [] # 用于保存实际插入的新闻记录 + + for topic in topics: + # 检查是否已经存在具有相同 URL 的记录 + existing_topic = db.query(THotTopic).filter(THotTopic.url == topic.url).first() + + if not existing_topic: + # 如果记录不存在,插入新的记录 + db.add(topic) + inserted_topics.append(topic) + + # 批量提交所有插入的记录 + db.commit() + + # 刷新所有新插入的记录 + for topic in inserted_topics: + db.refresh(topic) + + return inserted_topics + +def hot_topic_not_exists(db, url_list: list) -> list: + """ + url如果在数据库中已经存在,则去除掉 + :param db: + :param url_list: + :return: + """ + hot_topics = db.query(THotTopic).filter(THotTopic.url.in_(url_list)).all() + for hot_topic in hot_topics: + url_list.remove(hot_topic.url) + return url_list + +def get_hot_topic_by_id(db, hot_topic_id: int): + return db.query(THotTopic).filter(THotTopic.id == 
hot_topic_id).first() + + +def get_hot_topics(db, skip: int = 0, limit: int = 100): + return db.query(THotTopic).offset(skip).limit(limit).all() + +# 根据THotTopic.update_time排序,获取最新的THotTopic +def get_latest_hot_topic(db): + return db.query(THotTopic).order_by(THotTopic.update_time.desc()).first() + + +def update_hot_topic(db, hot_topic: THotTopic): + db.merge(hot_topic) + db.commit() + db.refresh(hot_topic) + return hot_topic + + +# def update_hot_topic(db, hot_topic_id: int, updates: dict): +# db.query(THotTopic).filter(THotTopic.id == hot_topic_id).update(updates) +# db.commit() +# return db.query(THotTopic).filter(THotTopic.id == hot_topic_id).first() + + +def delete_hot_topic(db, hot_topic_id: int): + hot_topic = db.query(THotTopic).filter(THotTopic.id == hot_topic_id).first() + if hot_topic: + db.delete(hot_topic) + db.commit() + return hot_topic diff --git a/database/thottopic/model.py b/database/thottopic/model.py new file mode 100644 index 0000000..a2dbf51 --- /dev/null +++ b/database/thottopic/model.py @@ -0,0 +1,34 @@ +from dataclasses import dataclass +from datetime import datetime +from typing import Optional + +from sqlalchemy import Column, String, Integer, TIMESTAMP, func +from sqlalchemy.dialects.postgresql import BIGINT + +from database.database import Base + + +@dataclass +class THotTopic(Base): + __tablename__ = 't_hot_topic' + + id: int = Column(BIGINT, primary_key=True, autoincrement=True, comment='序号') + topic: str = Column(String, nullable=False, comment='话题') + topic_description: Optional[str] = Column(String, nullable=True, comment='话题描述') + url: Optional[str] = Column(String, nullable=True, comment='话题链接') + source: Optional[str] = Column(String, nullable=True, comment='话题来源') + keywords: Optional[str] = Column(String, nullable=True, comment='话题关键词') + content_count: int = Column(Integer, default=0, nullable=False, comment='话题内容数量') + comment_count: int = Column(Integer, default=0, nullable=False, comment='话题评论数量') + follower_count: int = 
Column(Integer, default=0, nullable=False, comment='话题关注者数量') + date_created: Optional[datetime] = Column(TIMESTAMP(timezone=True), nullable=True, comment='话题创建时间') + date_modified: Optional[datetime] = Column(TIMESTAMP(timezone=True), nullable=True, comment='话题修改时间') + top_content_url: Optional[str] = Column(String, nullable=True, comment='热内内容链接') + top_content_upvote_count: Optional[int] = Column(BIGINT, nullable=True, comment='热门内容点赞数量') + top_content_comment_count: Optional[int] = Column(Integer, nullable=True, comment='热门内容评论数量') + create_time: datetime = Column(TIMESTAMP(timezone=True), server_default=func.now(), nullable=False, comment='创建时间') + update_time: Optional[datetime] = Column(TIMESTAMP(timezone=True), server_default=func.now(), nullable=False, comment='更新时间') + ai_script: Optional[str] = Column(String, nullable=True, comment='内容脚本') + + def __repr__(self): + return f"" \ No newline at end of file diff --git a/database/tinformationsource/__init__.py b/database/tinformationsource/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/database/tinformationsource/__pycache__/__init__.cpython-312.pyc b/database/tinformationsource/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..08d9c63f4959857cf5323497787bac82b38d3447 GIT binary patch literal 150 zcmX@j%ge<81al-)(n0iN5P=Rpvj9b=GgLBYGWxA#C}INgK7-W!GL5i`NzPA6jVVYi zNiB*=Ni0cBN-R!|Dap)B%P-1JEXmBzE6y)1N=}W5kI&4@EQycTE2#X%VUwGmQks)$ XSHuc5g%OC0L5z>gjEsy$%s>_ZxJV?g literal 0 HcmV?d00001 diff --git a/database/tinformationsource/__pycache__/curd.cpython-312.pyc b/database/tinformationsource/__pycache__/curd.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dbae38ac115ceda8fad8c0a81e68fd5f46d6bbdf GIT binary patch literal 2406 zcmd5;&1)M+6rb5IeXQN2vQn~1Y^>n8-IQ3g=|?DbX^Anm3vO!1jR|s%tR1;4OSUt+ z3K-T$e%cfObVU$EpVO?gBY|-D)cU;#1I%Q3WoSVx+@qG6Tz3k 
zmkfo;;K#sMaNdZsSWz+(Fph_DN$?Y!s^0S#rq`*Lx;?d4T<4X%ZLY7)tyg)0c`5U7 zYRU5F3PM;8c4GL~E`jreIHW{z`lvZ{FPx7BmL!SAf=&;X=r@8cUgHyY{Ty832Q9jU1!jqIn*?079Z?oKY&vWs=~tEOty zRHLpIU8%@>@i!FC>?&U?bSm(F!_xQ{Sn9&eZC&PPp=cjS{<>d&zF?d88K}Bk)~S!9 zgE%NZhgz{_q4|%z2DG`Up+@`)QWtROzgIqOGkVg?L=Wc3@dtCH0`F=$xW({c^h9&b z_V05~Dw~$=UrU~3m<5}o4*jB^6F;}V!I$w-NKq}bkNH(t#|zKGe{)(QrSy05NlTIF zfcwE@n?T>{p>&15xY;Jyw-ibj=JFj8lIT==@*&2a|l$ AQ~&?~ literal 0 HcmV?d00001 diff --git a/database/tinformationsource/__pycache__/model.cpython-312.pyc b/database/tinformationsource/__pycache__/model.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e54d6365ba8de3febf8113baecd8214b593383f0 GIT binary patch literal 3154 zcmbVOe@q+K9Y5P=V`F|}FiC)tK--uw)+9?*tsyFe(y=WJ0%WOU<#zEs;M%i~ygLK7 zrqz=Y)vdaPYF%-fMxv9px`Qm9R&8ELQ?~uFN&CkUmNK7IY13jm{MAhKul_OZ``&SI zPWQ*Ov-I_Q-_Pg0@6UJd{#spK$-omR${Fnur&jNB}QNbi^PO1ev5^S zt&%ll^V>qKpAFgl_K?Hx2vztiLQcPv;x@?@s`OXVF)O)4RsJdqgDh6&MMkh+W&{VS zTCwRCR<;fKtD_Y@=O&r?ESgK;RX!;3isEx-t)%!q7-IYuf$>`>Awa*) z${-fm1uOh)tLzG^bNvofft-T<1WYNzB{+nNRi_bxzY@6tcbRajP-()eDDE!9t4w$` z#jDHk8WUba@!B%H&V<)eyuJ)?FyVC+Z!E(-CcK{FO=Wnq32&gdw+wGF;f)mEQ-<#~ z;U0?bGvN-QRcKpn-^p+jWjwLNXod`T2>VS8FRj6W9YzZ<975-6*Ulb$&|av*lRFF- zRPia37yGDX&#jmca@w%wG$EYMkw8Rr*QRmKAYyA#bSuZQSKEK)vEc3DvRMDMj;ed zwThq|q64ca4AEu9<<~ZUmh@RQXE-8B{6z`jde}{~tD-6)opSZ+!%xzhTR=(>i?hU? 
zhzmG{E5F%%`&WgVOPUi*&C6I&B2~nbkKZb-T-*G~-QtgbuGu13qUWlz_yo#n}9Mvhe-`u4%z?(x33na>NQs(ttQoV|> zDwc6bq9EeZ&AXZ%f<=;Mjm^*?AKtq8FctUNa3hJnLO@e|L{jx!uo54h-V^|!kvzDQ zEL=%|9nA@z%2-sRk#4ea@>XH-7lp-JrMFX`-FbJ4mlRZd@0W(+O%+FwJ^?cwuz=dq zO*r_VD$b!uJE$M5trgyQe>1-J@a|j1*W-ovQpME!g&RwS#UGVENNbLeEJUC*`yQi~ z-o0FS?c?H4(>pw*&d371bDgRfLNPgvXd(-V>xH%3#iieFE`0>;QllU|hcIVEOybvl zazocZ1vs*rt}I^vsQ6BLdnJnx0#kEBGt9z`fK0)-b&bC88DQy|0l9hOS~31Zy+Mo= z72ZgJ?z1~IKRQ8ki3+Ev(D}jO_7R6&7>0Bmxtq3TLBP#t`9AY9j2?fk!z~C1Tq2J@ zfY<=pzC6q*AS@up-D-jWvEOiwGd{;i%zNU?hw`8(^u?S#9rVET#oRp|Moad^Ts<9n zVf$iM-$@^f8(?GX0iih!7Vd&6REbrPG^LPmJH9w_?(2b|ETF(Fg3JcUje8Mp%YbS+ z{rce#KsO+%cQ&dyIS!*)%yF@OUv{$B0QbZGN*{Q$5XWhi z9H%!XjH@{AQiPYbk!p?u_a&rHF-blqUpF)vnoWfFQZ2{v;jm09E{7HHmApLlJlPZ# z<0dj~Cc;ZZ3lV#XAcp|A646EkIX;+#h)Jv9P9jK2X^u7aB7B_SC&~ONBFOg!hOa9| zvxVh(>?6}dMD!4G7{rGRK0?N%iE*_d$ll5yp;A|vzk9mlr~gsoSv;HaWG4RdY_4V^ zK9I0&uoL-uZ_<%!$~bfN{qYwPM>p91ytgeml~Ob3a^9i%w-y^W*rB{*oUVE<-+EyAczSTXH`h9r zWj%|q*;u~4Yx!(iSRcu?k7wEDBzQKSZ|hthN>8kx%(b1#vQ3N1275;L){|by)jzL$ z`+UBxd2udfPpj+aa&;r|!9?cP2*_`UiK6@_L@Dk`wZLlxx=q7UwXLa3K$jUV_?a8$tUtd^P|Ge;rg+Ilz z0Y3XmH0O=!X^Q=Kj*ty}?|wkI7Z5fAQ`t*%j~J%k`W@@%4AU}dB`cv#?W5Lwb8GUu zDIqhN{q_%X&9BCX6Y2*0>K1FUM=g4JNo7ZTHE_ilpSDQB list: + return db.query(TInformationSource).filter(TInformationSource.active == True).all() + +def update_information_source(db, information_source_id: int, updates: dict): + update_information = db.query(TInformationSource).filter(TInformationSource.id == information_source_id).first() + if update_information: + for key, value in updates.items(): + setattr(update_information, key, value) + db.commit() + db.refresh(update_information) + return update_information + + +def delete_update_information(db, information_source_id: int): + update_information = db.query(TInformationSource).filter(TInformationSource.id == information_source_id).first() + if update_information: + db.delete(update_information) + db.commit() + return update_information diff --git 
a/database/tinformationsource/model.py b/database/tinformationsource/model.py new file mode 100644 index 0000000..34ee9e7 --- /dev/null +++ b/database/tinformationsource/model.py @@ -0,0 +1,34 @@ +from dataclasses import dataclass + +from sqlalchemy import Column, String, Boolean, TIMESTAMP, func, INT +from sqlalchemy.dialects.postgresql import BIGINT + +from database.database import Base + + +@dataclass +class TInformationSource(Base): + __tablename__ = 't_information_source' + + id: int = Column(BIGINT, primary_key=True, autoincrement=True, comment='编号') + title: str = Column(String, nullable=False, comment='标题') + description: str = Column(String, nullable=True, comment='描述') + keywords: str = Column(String, nullable=True, comment='关键字') + url: str = Column(String, nullable=True, comment='网站链接') + rss: str = Column(String, nullable=True, comment='RSS链接') + api: str = Column(String, nullable=True, comment='API') + primary_category: str = Column(String, nullable=True, comment='一级类别') + secondary_category: str = Column(String, nullable=True, comment='二级类别') + tertiary_category: str = Column(String, nullable=True, comment='三级类别') + label: str = Column(String, nullable=True, comment='标签') + lang: str = Column(String, nullable=False, default='zh', comment='语言') + priority: int = Column(INT, nullable=False, default=100, comment='优先级') + active: bool = Column(Boolean, default=False, nullable=False, comment='是否启用:false未启用,true启用') + module: str = Column(String, nullable=True, comment='任务逻辑所在模块名称') + method: str = Column(String, nullable=True, comment='任务逻辑的函数名称') + create_time: str = Column(TIMESTAMP(timezone=True), server_default=func.now(), nullable=False, comment='创建时间') + update_time: str = Column(TIMESTAMP(timezone=True), server_default=func.now(), onupdate=func.now(), nullable=False, comment='更新时间') + is_static: bool = Column(Boolean, default=True, nullable=False, comment='是否是静态网站:false动态,true静态') + + def __repr__(self): + return f"" diff --git 
a/database/tnews/__init__.py b/database/tnews/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/database/tnews/__pycache__/__init__.cpython-312.pyc b/database/tnews/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..237c90d6c80d90ecf7c1698ec7c43901f9e9c1ea GIT binary patch literal 137 zcmX@j%ge<81a{T&=^*+sh(HIQS%4zb87dhx8U0o=6fpsLpFwJVX+&7XB4<0CU8 KBV!RWkOcsW_#Y<# literal 0 HcmV?d00001 diff --git a/database/tnews/__pycache__/crud.cpython-312.pyc b/database/tnews/__pycache__/crud.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3c38feb16ac10cbe3a4a20358edaf884481045ed GIT binary patch literal 4662 zcmds4Uu+vi8lUy<+8eJOJJhM1Qc4nZQ39akWiJAalMmZ*LHHV zYrv8#B~pMy7pdWo)Kr{~A|cVJNG)&WiMzLx?xl;V99AIZj;DLG@JT1}#P`kaI=1T+ zDg+NV($0Ll^Udtc_x@&`+Sq!OZk~ZG?P}FK&??p}GRX6`~PM$dIgXUJ!`p zp~93$6BnfO9!Hiy7B!hlROkYI==H)ES@TgZ$O_0lt!;svS2RD=ZJJ8`TA-#aX+h9X zow?dU4&X>FM1xb37KZnBXQmF2LxqQ8+x~;giwTze(l~uXx1=1@Ne_N>iU=(})ihLB zNRgxkr>Cwd3OAgu0&1XsQ@P>1^M2PRbvg|*2oq#Y7>|jTn9wxKo1DpJQ-+*v(%t$QcD<#3kjW08n~UwB+F}qvjs~{GMX@G z96z!kjO0^14#FH+m%Q(c%#SP%Eu2O1^!(|CF%-_spIJ1%mLltFXl|?us-(d>I0Ls% z0QSF?b&Q%lT`Q$qSYT5t?Zy8zmZH$=>%~2U)aC@_H5b?FmpnxegJ%N0qc&?1(qMg( zM%y$McP8$?P*W$oHp~G2W4MQyV#$BXQ&wPTBg;FL%7A+;G0!rVJe6X)!FeKVySsoXSon2u9)cXzV@wALeau8gOq;xJ=8H)F*45S?{`4(jQzLN!M= zBHOQwzB~Gtm*0P(9NB9|_I`Ho=Ha!&x1}$YFBJ2MKa?UT%8}>I$n&Mh>A7?N@SJ^-=} zur?M3T@EjX1KC4Z!Bm7J%km~>XK7BWLzrwMY&)n}!BkGC%%GYb67Vpy2vj=S8?ff0 zt+yU+=;9WT+W|u!#<6ch1@R=ZeeT@%;jTq#Bf8@w<*M>Y=ZB$kbkK|rewM#kSS#Ef z{&Mt-QFGsLDSE0L9W$e2rRcfE;J-sVHv;XKN0vrbhL+E+2fE6EZZpt*Epj)|UkQ

;UB|6#2Yx#kv`_)ZpbqONTx< z_2=VdwZ~L@KI!{(&&PYNPn*35OX|~Q^{A;HEvd)uNXNLvlJyDhcQcemezl=|$(fu% za|Wc%OlD2s*3xk^PSEVE*(pZ>^Nu!q3^a!BZ?1ul2W}0Sz0Z`?L;pYK@+VSpJ)g}c zAV2lM{#h>?VQYK2y8WQ`465#f3QqdH@GAH2H+%Qr(#+mxOX`ubdfZfxm(&w?q!abI z!YIsA%gd;dXSs&MiK&z4jDA?mZ+|#9^GzBPaZ>?Y!e;AqIMr=b(gcoOTr-qb8KG`G zX3M6MZV~g{?BM2_GzRsi;_6J0m}h(>=5LN15}KIx!6-{f>G8atn5LF8!!*huR9K;z zWHQef%_V6ZC?mBbjp|8`1vl0pIdn*-9cv!1ouR227N4d$%8*h5HKq1MS*qi27=q&n zj@F=Rfs${PAdlK9Q+X8RvJy3w=<2YkbYoBZQu}Hj^i+LP!23Ui_(JoqE$7Rj-DYTa zITSNPu~O)XlK07*6KfMUUaquB@&P+`)u<-?75%|+Gy=j_IBFnnKSSLMPS*8Ua+WgK zBqweNhz1*VG^&xpb=$u47%)2(*9-SXud}(3aVcTzwIoZk{t0x^WfjS|z1F`dW zIS@6--H%a=MUjmF{XxFkR9sjBsR24N2`;%LU6PF&t1gN~sVLuYb|{eL$SBUTNz$`Y&Cy3 z&HCI14!?t4{rVA+P`#}XS zUp-rJX^*=CezB@S#GVyv7fi^tT?hM`0v+3QuveoQyaVIObZ=8gnC$B^TBgQoRa@2O zzDJ?wMR;^{;)D9n_Rq@h2xqHTAVJqU^$Ork2d@u`l^^DEham)O^W=Y7b z8S-dLOyvwt<2X5C{rnqr@~Fx%m_V#fB@GTchHGCV{AWB!|EL^=ALKLi7&{4F=puay zs){HG!gr*<;+2H(YG0K=Qwaxz@byzw0!?MtHlgd5cKhJx7pnxs%9Da{{F+!L(EQ+a zs^^4?A`1s@AE*#${xm F`V*L$8{hx{ literal 0 HcmV?d00001 diff --git a/database/tnews/__pycache__/model.cpython-312.pyc b/database/tnews/__pycache__/model.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e266a04618e6a673bed290c43e049d8f6ca696b0 GIT binary patch literal 2223 zcmZWqU2M}<6t*4br%99kkp7XRK+9ivV6Y*E5M!)#Q2ts$$`+X-W8zDj%|GSZfrh6{ z8zYsb5yS+QwrSlaSf`c|_Ci-Cl*dVXX%$OTu2hL>B!M@k^2F2bwUd%)j&yuZ&UfxP z_x$-!uh&iC=Tcc6eQBqte~3_h92I6|0yDQMkrHW%l4*{n3C~E3%y5j!+a#OJa;(X- zl3jLi4%x{$Wf$j?8@LA9&ADX{=aCz^MiXO~yt0q;(G;K=>M$ibKBGh@@XgvRkFyoW zTvO5&Zg@_lFpU~SK^J0@plMieVFBoI84!MGT#u`YAcdXCaZr^KvVt5D9mbVWu^<#r!Y=GQ-@zEoS4n_e)tPUo0WIHlAjL%{F1P#P0zRj1bBCPCUNCp7K(IUk$ zr*S4Y8$$sW*hL2aY;)`^YhgJDZ~~WT--ksL(SY@iIcN25oEvyRV;9z9l&cQaP=oT; zpxkvRPYudfgKDfpd23KjH7H*dg=1?LnYpG}+KL-T*<54Of^AwWQMOcV0J%$R4Qfrz z$!qFR{u-1YG~&Cq)#=-7^lhfTqfXyhqi;9$U3L1kHTn)yAE?m>#gN!Nw{Ep2ohD-a zDxymaib0%#wc>`=`}fTC&f*Cm4;KJI9P!2~{(rf$Y&WgiZl2q+>MI0VvF%o3n+%7y zA^Y$kxU3~GDl>duBZw~G!oeL8^!rgOSkWozWEL{sj*lB 
zLZHL|PbSZD_U)b0LIycBH34Im(bM@gGa^mENZS^E`}{eRU8gE_vykHuOY2? z;8aB<8K?)%Pre4V=m6^RIt)o7PDlCPV`Rsf0Ma4ETp2T?dUpTbvzb)b2HP+OIYn?m zNJzSsDOM8jPOx7PB+ZJqGZ)orI7< zh7E)S2qCKq1_=of(oM)ZLe>+~O9(Ob~k>uTi)Kvp-WSm{PAcMfZ-xR1u7f z16-aFvYXH2^*N%GE+4@+1Jd^KBy7T1laK?ovv}EDrJneMsS|(u{26X$IOqRk!tg~? zeQ8^sjTQnMrZ;DIodb)J4;Ka1*fyRT%0#Ee zmfE)D+3`ZKXL@(GKeyEg9!nic4`d>fC-UsE>fhUq;K@bSpNZw!lZEb$(;sBT+@R4N zS!CO%aO9Dy)BQ&1)FRuK(emu6LQBV#W5%DIFj@}a+q{)$4-}f)GnZ!USv?mung>(; z>E1j$SZE7O9i9R6rRrX8_k~? zdx5h_zsJ0!s8Ao1I+p3goV|~MJFEE~3^yYWsfzi<1R5mYR list[TNews]: + return db.query(TNews).filter( + TNews.type == news_type, + TNews.ai_summary != None, + TNews.is_usage == False + ).order_by(TNews.occurrence_date.desc()).all() + + +def update_news_by_id(db, news: TNews): + db.merge(news) + db.commit() + + +def update_news(db, news_id: int, updates: dict): + news = db.query(TNews).filter(TNews.id == news_id).first() + if news: + for key, value in updates.items(): + setattr(news, key, value) + db.commit() + db.refresh(news) + return news + + +def delete_news(db, news_id: int): + news = db.query(TNews).filter(TNews.id == news_id).first() + if news: + db.delete(news) + db.commit() + return news diff --git a/database/tnews/model.py b/database/tnews/model.py new file mode 100644 index 0000000..7c683d6 --- /dev/null +++ b/database/tnews/model.py @@ -0,0 +1,25 @@ +from dataclasses import dataclass +from datetime import datetime +from typing import Optional + +from sqlalchemy import Column, String, Boolean, DateTime, BigInteger, text, INT +from database.database import Base + +@dataclass +class TNews(Base): + __tablename__ = 't_news' + + id: int = Column(BigInteger, primary_key=True, autoincrement=True, comment='编号') + title: Optional[str] = Column(String, nullable=True, comment='标题') + summary: Optional[str] = Column(String, nullable=True, comment='摘要') + url: Optional[str] = Column(String, nullable=True, comment='链接') + content: Optional[str] = Column(String, nullable=True, comment='内容/正文') + 
occurrence_date: Optional[datetime] = Column(DateTime(timezone=True), nullable=True, comment='发布日期') + source: Optional[str] = Column(String, nullable=True, comment='来源') + primary_category: str = Column(String, nullable=True, comment='一级类别') + secondary_category: str = Column(String, nullable=True, comment='二级类别') + tertiary_category: str = Column(String, nullable=True, comment='三级类别') + label: str = Column(String, nullable=True, comment='标签') + lang: str = Column(String, nullable=False, default='zh', comment='语言') + is_usage: bool = Column(Boolean, nullable=False, default=False, server_default=text('false'), comment='是否已用') + create_time: datetime = Column(DateTime(timezone=True), nullable=False, server_default=text('now()'), comment='创建日期') \ No newline at end of file diff --git a/database/tscheduler/__init__.py b/database/tscheduler/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/database/tscheduler/__pycache__/__init__.cpython-312.pyc b/database/tscheduler/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1ebf8e8cad1fb2b1998f59bfd01e6fd32923c0cb GIT binary patch literal 142 zcmX@j%ge<81V5+8rGx0lAOanHW&w&!XQ*V*Wb|9fP{ah}eFmxdr5j-tlboND8dH#3 zl3Em#l30?Mlvtb^Q&OCqk(yGP0~Cvo&&ShGuVi4ma MGb1Bo5i^hl0K-Ef1ONa4 literal 0 HcmV?d00001 diff --git a/database/tscheduler/__pycache__/crud.cpython-312.pyc b/database/tscheduler/__pycache__/crud.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..52e72603d91f442b500256b303ef391d92e7a380 GIT binary patch literal 2623 zcmd5;-Ai0Y6rZ`@yLVTu(Kg1kRnoe>p|N@JK}u;;VvL&Ni&Bbgk@em&*>$aU?%fc} z6_g595aL52+n0Vop%MEBKjga556ASdE)INmf4)KUbi)4|`P)a<7(RsyFH?$eW)0hgj z4z}hQ8)QcJOlH7{1tTVUy#%x7H7^PML_OLDJGqi_(vNWphl=F%lzW4Dr6S`(3v&K~ ztH{YCr9y*GtpMr{2}yxQjz%5Qz4|D{2#jx$y?S5z^%f^|o4!D&$S$39RH5cPPZ;jf z;$l7!CTH`U`8OOzDBc`LjC@>aU*8kwqqno}lE<>kEMPqA<$~N?&S%-ck9U@JdC41G zUJU$mVTDVQSZ#uF5>~+xx-n*jWAJzyRWNYEW(n!S8-yIapJ?`CFUEU{aD3m< 
zh5l)Y@fD8i6UKbL2tWxX?+JZA&;7trIMR`~fvTg*Z47O@&H@S6?A!|Mmv_M+&gTz; z0?6$2?cuHAFDLfxR}bugiaqfCXFsfdw|dX4WZv7iFCEyU6?=5w9=orN@ie^RHqz%m z1I&8{z;>0%X-DTBuvm!}e*r9EClK~}*MTbhTeB4T31XBmX6op8%eBJ|2*M%%y$%{R8_-#lEs{U%jtgl~4(b-C}MjSmI4AL$*kk{RqbMXITRi zEts%H>cG}HT|mmTNCOjJ1eas9Aj{9dD1QmnY4q2Lh@(wJ?2d`(BNJm}_*sZ+$JooD zj)VD;)p=m`R;=Fgt%{YAy`GBIQ=YC^{WV=p8GjkXOs>11v#P1;sR((qvo+^9TOl9Pd>%RQYJA{1fma@7UY6}b0ikWssijdDntBb z9K{kL)CIO8^qaY2iTRj?J}J@3v!hf+1^pa2yW#QiwmOatCse1kkzY*v@J#+)C~YJ+w~CJ0YrUM}-~c z#$`*cJOJZ=!WAV{ct2$Lm_r&W4H+bbBaXo0>(amhCC;#Ua)u3$74a^{%sOgAQV|#* zk%1J}6Bva?2^;{=D;P2k>X{Q5C1~fh?RHF2lqRtXOxbSB1r$ALz zO6jlUVpY>KvPXoI!?b;YXsXr3`)mn)zE0_E(}3aO_>loRVC2e9jakd?bLDT JPXdjc^lw3wzfS-F literal 0 HcmV?d00001 diff --git a/database/tscheduler/__pycache__/model.cpython-312.pyc b/database/tscheduler/__pycache__/model.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..688770d04afa614153e5502ce7e395a91f9d4d49 GIT binary patch literal 2756 zcma)8Yitx%6rP=(?rwLxTWGgfEG>OeSY9Tg2_a|aR*QAkEb+vpupfP!4aH`Cj=Mma^W$_lLAST=Ge`< z1&XH3F~ye&9@-;#X|Ld;eL^{1F8FD`5TFC5zKpLBf^^VDAQwq=5S-^Dg7c!_l-pWu zsE#@&uu-0sWtjM2f|Nl4;c-`78H|Z6AMxm(?J-^zMBTevR8T*X zbkANz8Wj6=@3vTsN35t*-RQ8Q`*r}o+u~E(SQ$l2YnnfcQU6Bbs1ZTCID#hkLG-kn zBoKwlI1>Kc)6^7Y5or(dA|F?_6{Z*g-*V0~<+8M#m-9`R+wswU6acLMF>Jtrt#Du~ zOl;7`nk$DK`XCA+?{ZiMELPembM+;Q7D;ZYgHZ{LK#5UhGtd%DtBYECbwAC2PxVoayD$4lnB*mTMiBS3HqX?_exS;jPaWy`MYI30J9H{0;C{*?CW@|^C5@Ky}#9EK4U@uz#2Wyps)nKyPY}UVa zaJ9p`m5xZ)JjRGPvbom5Xml{vVaD+K$Ohfty*E05IF*Nk5>Ob~o^-4A09iQtjd3hl z)IKr3`f~o9qb)Ts6JbM6^uL7lBuCVeDBeDW2;*O-Q09;A?YY9)ABtzM;{S7} z?_C?2zx2tyYolOc{&e#0%*Tzkp~m^|W{oe8>r3nh4D^DS$jQaOu?sCAw!hKU*|;#7 zT)2MSa-w7cJ7Ci*@q|H#;}Vi(Xhv!i5DTN17A8+vjyzf!J8aWJ=rD?^xO*}9vDyGE zD@XGaY2(QEIPIrCD14eKjGV#pmj z6L!hVp*~jS6{!ZGR-Qdm{%VQ8|f()bJdbXw# zzDFduQVFRVgzknK(<>Q<6~&msV&gKyNItAzX6j`{l4|i3cS@-Y3%tQn0E-n^;M+#R zheN_AR4T_}B?z550QZ}Ot4?ah0vE2-iZvcpiHfoJBz&5rHY`?S5dm?TcmR_4Yi-Y# 
z-e`SG`s~1ws9)LTsS6S=`y$~?sH^zsh5GG@RloRarTX%1>yvM#mHPzI>Doa)B#6~J$qW@v42;WT0ytZ?lgzJLcey>aX==}0YvwgI zx9X7AmFO9dj_uDQt3RlF&&)1G$!{$vJ#8)R?8B zPgbmGKEFBD^CR;e^NW(%Ka^X+YkLy4IC)RjG_AhuNy<|JoNhkXBw)sP%UFGu5@4ar zZK>+1b#qjvb12usYp*6Yk8dA)AxH5NZjk2obw(=n0N!cyl?urce)%%%OZOAk z3sCpV2YHrBksFhF4rxh=@wCci>UjZ==hcBxJ5MIcKIKYRW8Am2tOe3?*lhM2L3c3nMVHs Dq%d?p literal 0 HcmV?d00001 diff --git a/database/tscheduler/crud.py b/database/tscheduler/crud.py new file mode 100644 index 0000000..6f11fce --- /dev/null +++ b/database/tscheduler/crud.py @@ -0,0 +1,35 @@ +from database.tscheduler.model import TScheduler + +def create_task(db, task: TScheduler): + db.add(task) + db.commit() + db.refresh(task) + return task + +def get_task_by_id(db, task_id: int): + return db.query(TScheduler).filter(TScheduler.id == task_id).first() + +def get_active_tasks(db): + return db.query(TScheduler).filter(TScheduler.active == True).all() + +def get_tasks_by_executor(db, executor: str): + return db.query(TScheduler).filter( + TScheduler.executor == executor, + TScheduler.active == True + ).all() + +def update_task(db, task_id: int, updates: dict): + task = db.query(TScheduler).filter(TScheduler.id == task_id).first() + if task: + for key, value in updates.items(): + setattr(task, key, value) + db.commit() + db.refresh(task) + return task + +def delete_task(db, task_id: int): + task = db.query(TScheduler).filter(TScheduler.id == task_id).first() + if task: + db.delete(task) + db.commit() + return task \ No newline at end of file diff --git a/database/tscheduler/model.py b/database/tscheduler/model.py new file mode 100644 index 0000000..9cc29fc --- /dev/null +++ b/database/tscheduler/model.py @@ -0,0 +1,26 @@ +from dataclasses import dataclass +from datetime import datetime +from typing import Optional +from sqlalchemy import Column, Integer, String, Boolean, Text, DateTime +from database.database import Base + +@dataclass +class TScheduler(Base): + __tablename__ = 't_scheduler' + + id: int = 
Column(Integer, primary_key=True, autoincrement=True, comment='自动递增的唯一任务ID') + task_name: str = Column(String(64), nullable=False, comment='任务名称') + trigger: str = Column(String(10), nullable=False, comment='调度方式,interval、cron、date') + interval_seconds: Optional[int] = Column(Integer, nullable=True, comment='固定时间间隔(秒),用于 interval 类型') + cron_expression: Optional[str] = Column(String(255), nullable=True, comment='CRON 表达式,用于 cron 类型') + execution_date: Optional[datetime] = Column(DateTime, nullable=True, comment='执行时间,用于 date 类型') + task_payload: Optional[str] = Column(Text, nullable=True, comment='任务相关的参数或数据') + active: Optional[bool] = Column(Boolean, default=False, nullable=True, comment='任务状态,是否启用') + executor: Optional[str] = Column(String(32), nullable=True, comment='任务执行者') + handler: Optional[str] = Column(String(32), nullable=True, comment='任务执行程序') + last_run: Optional[datetime] = Column(DateTime, nullable=True, comment='上一次执行时间') + next_run: Optional[datetime] = Column(DateTime, nullable=True, comment='下一次执行时间') + create_time: datetime = Column(DateTime, default=datetime.utcnow, nullable=True, comment='创建时间') + update_time: datetime = Column(DateTime, default=datetime.utcnow, nullable=True, comment='更新时间') + module_path: Optional[str] = Column(String(255), nullable=True, comment='任务逻辑所在模块名称') + function_name: Optional[str] = Column(String(256), nullable=True, comment='任务逻辑的函数名称') \ No newline at end of file diff --git a/database/tvideoscript/__init__.py b/database/tvideoscript/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/database/tvideoscript/__pycache__/__init__.cpython-312.pyc b/database/tvideoscript/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3574ff610c3613d1df6bc64d253a53dac7491bff GIT binary patch literal 144 zcmX@j%ge<81TFPv(?RrO5P=Rpvj9b=GgLBYGWxA#C}INgK7-W!(vPr;NzPA6jVVYi zNiB*=Ni0cBN-R!|2?@(gNzD&VF3Kz@iHVQT%*!l^kJl@xyv1RYo1apelWJGQ3e?XC 
R#Kj=SM`lJw#v*1Q3jp+IB2@qY literal 0 HcmV?d00001 diff --git a/database/tvideoscript/__pycache__/video_script.cpython-312.pyc b/database/tvideoscript/__pycache__/video_script.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df234861e34b72e0fb84ed5661c2b933bcb1561b GIT binary patch literal 3296 zcmaJ^YfK#16}~e&J3ITtn8mDZAlRm@H^r_}LllUKgdad?jIBWYSQCxbI~RNHnO%Hm z)&_**ZRN+35JMD4#*NdEC~c^4Y|=(GjjP7#pSp^wf_5v+w3brUi{V$Pa1|-PdhX0F z86)I7+H)V@nfo~BJLk@R<@LG{ls7K^_2~b2g#JxC`2u~TEMA7{3X+h-$Vg#A3`5(j z%qnb%HQF}Wrf?z7Xmhe%;X}OQ2ssoXBq+|1(-^nQF2x;kE1rHN;PoKRvq-=>jh?5OT+e{s+5w)TcS`*f}|5sL&_o!b#s zvF<*KHBrT?3U9(3_JkId)ktnbMzv&`ml`&I4?P_3ro1uz?~Kxj%>kQ*o_@);cb}3{g{@=qUEt@c`aJJ!GK z?jlhoOnSr5vn{Vm@tZ+I`B-Pl4iO#?M$JtB2)NKhOo1-6J znE&vtxpP0Cy>?ya&G{_q*$d~hH-1mI8Pi~!l&F-jz3~_}&z5>Od;9a*E5Dn+@>$TP z`&3L$VIoSnGn|k$QUtp)x9^S+3~U$a+mW380Yu0RKN?DluY-W#W%|%@Xe}@ALLdXe zE+0n-5E~#lFIjOwI6(3-V1WR!10q<4a!o+^_uOqL=xFUN+}_?C*V^M<(a2_v%KBcu zssb!m-)pa`h)VUnzN!j~6Y6{2RTX)g*Y~o)7lL;18-0_M!2D!26~$Dn0RhJ3R4jL@ z)EpJxZaY}l5s6Et%Y9B;pM@Y93iEfe_8u0J<@*>tD&ZtLV>ar*Fyslz8GL-M42 zda}Fk8-#k8M)tpEwSZ7J)5Jp6#J)(YRV`FHTi8Wd0JR3!(A~ts3Km7(C5lR1O31YB z5yg{CZlY9Q<^AGJG^^d=n z@iiuQq-<%fai*|jm>(^k5Hf`g$=#`KX|7>r!w*LJ3ICn$%!Yl*p9}?tH@#Js=Jw5$ zSB~r-mnL_;k}2PxY)%~;Qu~!Ow|}Ofc&KwEF&@hl>`v}X6{We|Gphr`Yep+4e3{iv z$;Q;qG}km!w08LWqtz4TnWE-oQ>rb^HP4i8{GfJR%4~Z6Zu8w;e{23r^H(jC;m*mf z*D|GN>B4F5>>|h5TUhgK^jNyb%8$pwjM~`0&%hwi@GgU)o1eqLX{K9N9-9G0Sh)4!8OtRL z2DPD0=ozFkr{OE$5MtozLEW}P=rj{#T7!Zamo!!D4tk?1QjJM3x>?;8mL#30c{-}; z4uU%gR*wZ)@0^})Cj;h+qG}vs=K%=S0D0vg-+_Xf4 zJdTjIlwtGkI!@zuS@}+DrR@4-_b<)+1PYvC``BIR)iQ+6(5$}_8Eh-9oS&OP15a^4 ze;^4%pC^)p$&rLjNN`M71(dv1v%mV;>@TimuZ_;WGd}y~jqK-dKbZI=d+Rq3Zx3ca z8HVxy{9z!Qx|+Rod;aQ2vzPjfm+QL1B&;mWQD+VktR+ay>`*outh1j?VA5kGE1mC* z%5WtZoYZa6m8}e)ES#kw_eR6E z9fIvof_+D!qcyl1zH0Jc$XF$mhDI9Kd99}lzJ&?B0CbmnQVz?JV_502Tp-_F0fqU? 
zb8Egq=)VWUvV<;bUIH{Ob&ke}ad`4HMV2lKUD$43VK0yd0BEGB`#>z%7>2ozHrz*L z_tE-?=-D)S_96PweN?i*Bi8+h^)fB&P~{?`^`Z@i7L1{Va)BuwIkAYKTJXD=z=-q+ IL1nD?e@Z1IJOBUy literal 0 HcmV?d00001 diff --git a/database/tvideoscript/video_script.py b/database/tvideoscript/video_script.py new file mode 100644 index 0000000..0bf6722 --- /dev/null +++ b/database/tvideoscript/video_script.py @@ -0,0 +1,47 @@ +from dataclasses import dataclass +from datetime import datetime +from typing import Optional + +from sqlalchemy import Column, String, TIMESTAMP, func + +from database.database import Base, get_session +from utils import utils + + +@dataclass +class VideoScript(Base): + __tablename__ = 't_video_script' + + id: str = Column(String, primary_key=True, comment='唯一标识') + title: str = Column(String, nullable=False, comment='标题') + description: Optional[str] = Column(String, nullable=True, comment='描述') + keywords: Optional[str] = Column(String, nullable=True, comment='话题关键词') + url: str = Column(String, nullable=False, comment='话题链接') + script: str = Column(String, nullable=True, comment='视频脚本') + content: str = Column(String, nullable=True, comment='话题内容') + create_time: datetime = Column(TIMESTAMP(timezone=True), server_default=func.now(), nullable=False, comment='创建时间') + + def __repr__(self): + return f"" + +def create_video_script(video_script: VideoScript): + if video_script.id is None: + video_script.id = utils.get_md5(video_script.url) + + with get_session() as db: + db.add(video_script) + db.commit() + db.refresh(video_script) + return video_script + +def video_script_not_exists(url_list: list): + """ + url_list如果在数据库中已经存在,则去除掉 + :param url_list: + :return: + """ + with get_session() as db: + video_scripts = db.query(VideoScript).filter(VideoScript.url.in_(url_list)).all() + for video_script in video_scripts: + url_list.remove(video_script.url) + return url_list \ No newline at end of file diff --git a/log/__init__.py b/log/__init__.py new file mode 100644 index 0000000..e69de29 diff 
--git a/log/__pycache__/__init__.cpython-312.pyc b/log/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9b69d315df810c93d222cd5adac2adaa07f2863e GIT binary patch literal 126 zcmX@j%ge<81Sh&9(?RrO5P=Rpvj9b=GgLBYGWxA#C}INgK7-W!l8vy6NzPA6jVVYi zNiB-W$xn}okI&4@EQycTE2#X%VUwGmQks)$SHucb%?QNBAjU^#Mn=XWW*`dyH%=Qc literal 0 HcmV?d00001 diff --git a/log/__pycache__/log_manager.cpython-312.pyc b/log/__pycache__/log_manager.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7588fedf20cb53b0293d04ea2add78a8387cecd4 GIT binary patch literal 1829 zcmZ`(U2Ib~6t?fZx#>;Pv`w0{X$RcFA_!Uo+9oD-lx?E?sVqoCnpo5+y2Nfsl%MMQ zwri^*Qp8k{I&Da7MWRSUnx;-@sM_ns#`|8<71Ut%(1}s#(^7R08&5mFNg%*x$;bBb zckJWOpZ!a7vkO7le!@)MRS~)`2KA!VIvd|$gk}*z1ZPnk+pQFr>{gB|8X^*ri9#Gi zO=04klhT-|qzT3vl`{7F^(CtLX^A@_qV-v{7{xUSQ5R`F zhp5(%uGatrUze)gxSQa(hq}!!>dti6rW+d4b{)SomQO^~0Rf;?F67j>n z9`hxNGfh=gqs?q-998?&Q(q6GeE0vvz5E|>>eDzl5zcIV5kr(ZChnuY%=Q-%)LBPh zMA~WlkVFFKlz3nqM}lht9$OQqJ;)WaP_VdiBzEu^_a+N@R>;!BMxJD;8IianOGjyz zH=U|_*hC;(NTt&GRP?xIQX^MGa_1;DPZk)BDBO|E8jP7vSew9+{mVS0}q-jO$+bnVR=#yQHEkpkdETlmACaIPmz^zh-{@O!=C->kuyi5MJ4u=uC$ls`$I{G#gcBlhYBKJubxd)O-1Q_w#!<2+0GL2XhPn;v z#!fH;2!p&Q(MN8CPJ_HB@hD<=rfCv?iDxlJGtCMb$KS~x<fD9Z3qL(Lzo0jQ9(;Co^^1i+f1LT_=Fbl=&9D7%`O%H9A+5DfK7Dxe26xtO z;~A11u%IAn;}n=#WPz=gjPO2JET!(qJn7Xv2xZ`JdA?kM|6~%u816;GhHpDidDbD51Z-dy$8 zCQTh;g!V~c#Nytg#||a>4}6f=e=K(J$RX~7l@e8aqQ)*)KqjNMWn2LTSiA*LHPEnO z8o5S?eX-oZhO=pl;{?b#ou&J2<#88DC#_mvOHu2nO_sSCGBU+~1^HU0O@kQ_J|T9S zpT|vma|J=*K64NF1aT|~wjyJU@1ejwEvCT}^=T@FP`ojcbZ za+|9nJ6hF>7ipn#;8lBSYdIL2&0Nf^YmRQ1cDAV%AJTl~;L&o2h-nuwesz6YhfgYc zE3ctoX9@XMoUZAysj;H6t|EWOq7eR4NB=Fdf z`l1pnzFRy~?vBo%DMjBdh28NR0}K?bjpZVg0DinWv)iRXb8-T7XJ=-&yZn5Q z@Pr9cykmhm_9*zd#SKq_CfYnrv~Z0p++xbe0*|~Y?~W%&$*b?mXc61t9U5Mkt;OGn zQM^M$2VFxn$?v?<7W0ny5)G`d=C38670mlGmhhw8$Wk=LhcOTNTuuj-@|w9~nHo=c zQ+B^X$h{O*Z%SQZ#328!FlKhi`(g;O%`d$44Xw_Msdb=lX-=U 
zXGf~q`*Im?U`8H!-!d!w*JxG#Ni|Jr$s2wX>tkYm{$d*c`dwG&bGoOuk`6U#&N;CD zXcE0Y6FlP{BgTEZmX*7Bdr7q?B@vpeBl$G3F^%fBRNZ08?2Xl#aV8wC_o*G!EV&1t zA}MsNu9EvRC#UAv>y8bq<32aqHX)?yu|3@sJURQDoVzA&cXXhO8FhW?e0NjB%U8Wo Ubw>{NFQc84lKw5arsp&I3k8CzegFUf literal 0 HcmV?d00001 diff --git a/seek/163_com/__init__.py b/seek/163_com/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/seek/163_com/__pycache__/__init__.cpython-312.pyc b/seek/163_com/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..abbde1f7c1dbb6167390637262e5ddbeb964fe7c GIT binary patch literal 135 zcmX@j%ge<81oc}p(n0iN5P=Rpvj9b=GgLBYGWxA#C}INgK7-W!Qj4&PNzPA6jVVYi zNiB*gPEE~@F*GxdPtMPciI30B%PfhH*DI*}#bJ}1pHiBWYFESx)W!(J#UREz>`$Rk2Z8b2kA>+^e1!al3va;M8SDEgA~Wrgbi`R7eYQ ze^$~$z<2Sm)^&9+A!1S@JTRlUXIiNiz76-of%duow!0vys21T-wY#|_S`=2>ycKAM zc+@+Y>iIY8yvNgVtUvt*-J3e|be<-*{f@sGPf)vOOy_xCu*MDRxKS)&e(m@tYW!(rDqInL)yVEnoFeXPa!{? zq^L$xl190ll@>{sYSKb5#WZ<=q`;Onq*?G|g>ZRC|w@!LL34+j! z0C`j)_X43CZPh^hX3yO~-#YA(OcNT&69%8-)`!3>lXg-jGq}HH%%IMep>4Q3gI2bT z>$FTK$cx1CdxCxg3OU?@+C4y6Y_YeBROlBmU`LH=1b9nVJTTehQeK> zK!Vy#Zkm7#2&f9XHnaFOOW>^%k_f|O2b$4QO-!4jh$!xMPIC)l=fthlyCq&NGBjZx z1yUgmNp63s9(#7V{6)DIOH^Zt@Aj|84%cEws<9)BCmVhei7$p9J@k=C{~v^+?!Fr* z>*3g9xZ#84V-w^5@$Lk<)j1@8Plx108v>w^&U00@J5)7qXhj|QpsIdQOzW;hU~_R2 zE8@V}p(Z#oT-O0ORA5F;Oy28`Z=xObk#wS<2gsuZxIOhffAj4>2~2g^m`Vb8djJ9-jyT;?@a53fylE!c1#h z03{qyHBYI5ui$yu#Q)4jCAeph&S3hs3@=1qk*{Sh+>}_DWM^ziLde_SoIMK=WIBV$ z-ZD1jo2mUzUP=F#SM!kl@zPEO+pTJ)&8v!OLCO_=jc1A$SJIYJ>TQv7YzmvQL_lCr zZHB1m$i_->m{54Lwf%+r_QJx*FJJ;rS@Nw^x<1=fe%v#EzEZ;G%_*kO3z zlkkCC1u!q%%tkS5a95{+tM~`e2HI{`nVvH(xB4+VVh56bH0E;vWw+-{6#xZKyDb-m zZnrfJ-6C$$;}h{XaKw$>YvM30vDca30BOjS(qBpV0U3NiUcXQF+$S&GCy9stb{b!L z{~>{?5ofe}rT*z!@bBeAlnCQfWuuoc0zEek=F0&)R|P<=l0R`N+l zQr$a3GC>szD!@vS0yf&hErOuUQylmuKnv6;P_$_B);=6d8DuW@Q=1fhDp9$G;kCm_(P$fj_Nym1mjWS#{!AIKiFIPPmqX|t(vZd(C`Cu! 
z;4*xQ=P2znd>J7nFj_Fgj6daPv}gn}!BmjZej}7=OSN%`M>9wZY#}X(+aCB_`>C)N z!h+U@Ba4C--V#z#$}jR-1n73$p+#555F)cY!og+PbFMjR?GL~|+NgajJa8G5>eM=L zr_xntktPAx-QYTb>+v}FuN`;pv2*ZRT}_=^+$}EA{Wof2F2OlH7Y&@jS!*t5UCQON z+C@Tg1m?T0%w`vJB%@k-E<2yglQf1|@yZ-tGlB3Mxuqpc6@&8_ubfv+oc6q;V||m* z?pJ`h4aa~{iqlYv*EsB3s-n|j}n3-Ue4uB>~!hPj@Bj6sO^&=T6^A-=d-M-X?7DMg9W%_BKKrz7VqLnyh(=w;>dL2io^}77mmO4QbG` z*c^+3((vg&Sa1GMcWhsS{~Lb7j!65&z%DZ&*gFq@0Vuew$Sub_x;Yjbt8(a_?+cJB z^JoNFy`IKVL`F75tKZ|72Ry1dYGP4*7is?Y8vDbcHvw8U#P5pl{L&rQIO~x22iP3` zuQmhJ=6j7ZYq9vmv$%%VeQ%;QE)kq7gp*mgk0%y&Vp@fGQd2EeA-Oe~&MGRg^t6Fx z_*Al}ufHtU^%YCE3|tT`b!k#9^d5u&lX78HJ|SOHbpva%m6I3stR`a@D`Y*(R*-cI zXOe}%Z)NfNDo$IVi79-@xpX>D2-YU$M28bFv1tN`z}-{8fa7O_NoZm?f`!*DM`R8; zf}v+u9ABOo?sfFL_ulcVtE)Jx5t{tOPk$5HI!@@~dK#}%gi82H2Ni@_T^OEB6A5Yr_%Y*I@yYlPlYC%A+yCiJed$5YGZ-=W(?)!x4Ca5dI{Z{hAjB{o`$ zjoy!zW2Y*ybEVihdrB?GZfv)G7CW+ivBrm^)7;~MBkzvh2~_1{AB5iz-~UEgo~+2z zC3*UHFO=okiab}6=k5f46s-=8{A}f?EB8msLoZi`PM3yG!)&8A`O?M;h&#=B#qZPQLkJZK+>Qc9C-FLgnGvQfw<@m?y=N+FLF0xA9?j7 z^`m+FjA9?YUXj$2q<;Ch`w-Ou?>wE!hc7)Gu@6sGq}NK)YhTuURQlYc*!qj};`y`u z@6U!Vbc!E~{a+suKOTutdZO<_MEoQo0-Xp@M9r!htSGSSN+ze}4N6B8RHoLvuW&v8rfN|HnJNw{fq6;Fc9WAbWlM$dhFO}xnpee za#iZ9Nb!;sFH0kvvsEE-`;E8W*b!pYaE~qYu+7*LI!$vHdJ2MW4Sl(_o`Mo>0J4Hh zwe|pWoJH-hw%nk+8Gzvi;5O&HjR8Pwfk-RtY3$qjs5jpnJqxh4+%mm|`&v`pJkb9s zD*pc!)h#&lxxyhAcrDdjk>QYuCF6CR&RbYkEx9nzA_X{Wl_i2Mt_keb(+6QXK>%}X z=^5{oF};&ikIKIX9i)=bcKh|W zUbnlCRl3JZ-Q#xGar^k?KlW8-Q>EEdW%gQW_F8%Nx_w!-r5if}ZcRNQQW5$~LjR61 z*i5_w4PN?Wy)tvPG;_5wvrw8@D9>E8FDbTk{gI%u6s3TXQT42%6h^Mx)J@sN3>m0w z&XVEjDQh}`q9UhN!-!ug^xw3sRdaG;!Ym}+hZj7oVeSzq#1btIo!bw&-Ojb;sIIWmB;PINYTHc#(*-3&&*kGXL z^fdM8rLBut1?zy9a1uUQ7$93F)Q+sRWgr&^HL7x$z-#A&*wA=fSh?&P|aiU@5Md??VR=&T*fij?d96pQGs~ z=*1^!_z4<$>hIwOw=X|MFw~L)7qjChYY4`N!Y& list: + article_content = ArticleContent(information_source) + result = article_content.get_content() + article_content.finish() + return result + + +def content_task(news: TNews): + logger.info(f'{news.title} news_task start execute at 
{datetime.datetime.now()}', ) + ofweek_com_ai = ArticleContent(news) + ofweek_com_ai.do_seek_task() + ofweek_com_ai.finish() + logger.info(f'{news.title} news_task end execute at {datetime.datetime.now()}') + + +if __name__ == '__main__': + logger.info('This module is not for direct call!') + news_ = TNews() + news_.is_static = True + news_.url = 'https://www.163.com/dy/article/JKC1V4E70519DDQ2.html' + content = get_content(news_) + logger.info(content) + logger.info('Done.') diff --git a/seek/163_com/house.py b/seek/163_com/house.py new file mode 100644 index 0000000..2524884 --- /dev/null +++ b/seek/163_com/house.py @@ -0,0 +1,59 @@ +import datetime + +from DrissionPage.errors import ElementNotFoundError + +from database.tinformationsource.model import TInformationSource +from database.tnews.model import TNews +from log.log_manager import logger +from seek.seek_base import SeekBase + + +class House(SeekBase): + def __init__(self, information_source: TInformationSource): + super().__init__(information_source) + + def get_news(self): + news_result = [] + _news_list = self.session.s_ele('.news-first').s_eles('.data_row news_article clearfix2 ') + for _news in _news_list: + try: + rs_news = TNews() + rs_news.title = _news.s_ele('.news_title').s_ele('tag:a').text + rs_news.url = _news.s_ele('tag:a').link + # rs_news.summary = _news.s_ele('tag:p').text + # rs_news.occurrence_date = self.process_time(tmp.s_eles('tag:span')[1].text) + rs_news.source = self.information_source.title + news_result.append(rs_news) + except ElementNotFoundError as e: + logger.error(f"ElementNotFoundError: {e} - Failed to find element in news item.") + except Exception as e: + logger.error(f'Unexpected error occurred: {e}') + return news_result + + +def get_news(information_source: TInformationSource) -> list: + instance = House(information_source) + news_list = instance.get_news() + instance.finish() + return news_list + + +def news_task(information_source: TInformationSource): + 
logger.info(f'{information_source.title} news_task start execute at {datetime.datetime.now()}', ) + instance = House(information_source) + instance.do_seek_task() + instance.finish() + logger.info(f'{information_source.title} news_task end execute at {datetime.datetime.now()}') + + +if __name__ == '__main__': + logger.info('This module is not for direct call!') + information_source_ = TInformationSource() + information_source_.is_static = True + information_source_.url = 'https://sz.house.163.com/' + information_source_.title = '房产_网易' + news_task(information_source_) + # news_list_ = get_news(information_source_) + # for news in news_list_: + # print(news) + logger.info('Done.') diff --git a/seek/__init__.py b/seek/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/seek/__pycache__/__init__.cpython-312.pyc b/seek/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9105fb5d2467e6532a4d42580a339134d4afd759 GIT binary patch literal 127 zcmX@j%ge<81b)S-=^*+sh(HIQS%4zb87dhx8U0o=6fpsLpFwJV$wgSjB17d#K&jmWtPOp>lIY~;;_lhPbtkwwJTx;DrW@ZVi4maGb1Bo5i^hl030nF A5dZ)H literal 0 HcmV?d00001 diff --git a/seek/__pycache__/content_base.cpython-312.pyc b/seek/__pycache__/content_base.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a6a8d973ad5b5f8d25cd1067713ede6ad5749db9 GIT binary patch literal 2921 zcmbUjU27XhaPLl^*4c_O5fbLC6K4iuI?m7F)4H6g~l76fBu@W-+50to#^5pVciV7Cas64H^*8K}TzxB{QyIg0a!Q1E4Z1%Jk0 z2xJ0`CK$m&C=+6^&j=SW*4wMZtAk&goek7S;&_PPUH%f=H>XAR2w1H;H zoD&(twqjLz0BtW#>Z*;EBA&98Y+1?cK=fZt0}6m(m}6SijOxt0$nHr1mJmi6PDdGD z=dh6Db$%9ZkNvtZi&k95OhEU|qD)Zt!yM8BFo(-QIpjo!&7zHq_K<2}$4Av`o^X1U zUY`DTiyT!n%}pbP(%H}vU_fucYG_H%3 z^x*os;#$L@GP0u{-P3f3x7$71H2*#4af1altzd!Aa(zss|w^b)1q#cBETspHW}Y-2KNVSEd479>g* zq9iq0c0!7hFXnAUsdP5PPs*cG&sb%cX3%zL*PL%5yd8_fB6>?Cx_J8T>8J7D`=896 zUl`dI_brahk8FvFH8HV#^}+Rf*Ehx97wyt|d-vSX(|FGh`#0hrQ_knH_|o*((_69r zwOIdp>yPzCtwb-N>3>L}My-TRf$$Vw-oSVQ>HG?JF 
zlycbC#wD5vQVw!iQYC8=Yx!JWlPt4DG@MF5K~Ff+1!63!Kq3cVt|qkmPL){WtI$k$ zHz=n?Ju-uK0w~-{5&*x`OeAw@auB#klAdi)*g@r*npPqN7d5QV`cX+XD{f+7LTEIA zXQCZ%tEa)kUTG9GLB6Aj<~Ia}dgSkwt#9N~EwA91JNBvzGfB`?s|+6Je`=zZvSHUM zglF3zy2IJ6-af5cd)d0;+6U#Iqn|onuO2$T_t}Ykp7i>9AL+t3f>&3d3UNK?7?T)k#rLd z6ir)F4T4p@EaBU5GL^2hTP00{Zjvh*ZUk4Psx&w9qE&0&*IQ%I5IQl5MnmoFC^wO&M z`DWCIUWD3(oqc z7n~eTELw+}28r)Yf48hM$BJ2U*f60luvl^1f}|F8$?X-Xv@DPqD0KW9WhAr}zExBL zIjHN_cx8X1)2g(7;I{^00(XIDj^91*72foGdP7V=L4!hk>h7spH9xo}9^4eA9RZ15 ztkk6C7TP+BY5^;X)1oK^Q!g1578T`-l4?|U2<@jNPAB?E)hK0Q*hf)nNVO!=>dC4v zp_$Gm8lKiiFwLSJUi9}YU)x2n>?Ba2^DmzFx9$o6?NI1ti1T;83!+;Ivi9n!>ocK>rJFygFInXRN!)Rw#8hK_% zy)&x4%Zj=n$|?x#&=v*U_Bp|M%tN6sg_ct2%T8=F%Z*J6O(}g-tCT>VdhXqwU0Fzd z=w0^SbIv{YyXW3HKld-4ogE0;=hE*MI}!-}jWS`2G>M%e5L-w`I%gr9%W*cJ<2lOn zmSBrH(T?OIb~G1dGQo=3@m!qIqLr{aavhA0SV>#TNsNwKopx8Qi$gs60n%gZNRJ!b z6QQwRE>%h>9si_^!V}@rv!JeG zOD#H{YTO3_g%T`hG2bw<(eq7aXA+1lWS|_Uqa3euhEU*jejV<{i0HyPdeT(PMRjo< z!oRY-!fF#!RDiT}3&OO(_ zMDVl)k}zG>^}xUKnC?mha6lq}GvOGpm_`+efupIQ44y2yhE-?)@_EOlM$5AN-QW+W zZ|5D|xV>U{2EOfrldw)*09VPblt^4v&7$e4YPl!$plm3T0L1+nbPrJd_+(Xl79Xtl z_E*JCDI5;>0vE3*GePpy7s*pE`iCFCRh`+qUYEKbUf;MLNNHb6*D|Mr%vnEk_PI3q zb&veC=XmwhQ`+cXMOK<<+A6<1;fiP$uuhE0>rzl$?l4gHw$K7ZUjhn z?o;$BhkF6Wr5nh&4|)=(K~IjT#a~ClwH&I5 z^XLRR11@_H3h4)kgWu-#ec=Oik5eM$YxCNV4OjM-44Kv<*(o$a00M%_bPX~GWC7VM zF#i3U*Ja&-1yHPH&4Nm%mto6tkm(t=E1ShVfLTStgJ3o3 zUcsiV;gXt%qPXbbQX@Yda*e!G)OSyi9>`9f`HCoFX$yu$L`y3!V(Lejetw*!$bw}Q zNyIZf%TOXLY)PV3bcskw!s`p9(`=f}qS&RL0$EXRhPR|UP|}UNkV4(N<)f|qw_8jP z;))Qs571wa9NU`xX!hstRTEzx$y8_Rsl!3)4L|ipEj3;r84pG-_#+pBksJQVjYs0M z?vreO*iQ}DQrY@wHW=*f6`B#tfeOE%0!@C^p%T&a?@9C()nSS ze^}lq*HT0E%utY-^fQw|X4=n8KN5e@K_^QaB|jzC({hkL>!;5K>2rSi+#~UqiS484 z=)gCxp{~BoJ0CB+NWcAC{QK`eP0v9kKFzv}O;=t4QE!_B^jQBsXN*t>bgI6pEDlc>WX5p2! 
zqAoY9adu7-@FB2;MDkP-cLR+Fpxa#_q6H|P?ov6`@|}j@dsJ@fyN$&xJK@ zZe#A5l!l4{HSYX}=Uatxz?TM|OY*jWq+_g<%qa<4oQs-ms4D4DRol^5ElNwO`lD6N zYRq7GZX-;K5vEy{1sg;aFs-fkZ*(lx`wu3wcbhn(a)a8Em$m(CmG}9xm*;LVCh% zlMM+L(qt%Pv)-;}YqKcBi4BV#kQ{F7(csSJ@v6@90ZGAYItzc*){5|U$1FiK4*Kfh zN&I}{3g5=lU^K18?l5%Q0>^P*p%ec=7kqT#??`!xF1|#UU!uWpqFHXJ>i!48z>fZ3 Dp;o(b literal 0 HcmV?d00001 diff --git a/seek/anjuke_com/__init__.py b/seek/anjuke_com/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/seek/anjuke_com/__pycache__/__init__.cpython-312.pyc b/seek/anjuke_com/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..588fec9fe05c0659e82eed24a6c2637cc3d52838 GIT binary patch literal 138 zcmX@j%ge<81SSVE(n0iN5P=Rpvj9b=GgLBYGWxA#C}INgK7-W!(u}Z*NzPA6jVVYi zNiB*gPEE~@NzBVC%}$L^&d-gBkI&4@EQycTE2#X%VUwGmQks)$SHud`$q2;7AjU^# LMn=XWW*`dy(_gU?Nt=J5KGccf;&j zX)RYWQY2C#q;Np3gg_j)p#B3O@gH!pNh+I#azPxpS)`z2$-ciZ-1O2}2B5sm63hohv^ijjuh!)~Q*JaOqZde=SOdI0iBGZP~ST3T4 zd9WyH5s*iCR2#W4O^8^O2oIGN?@WKMMX$sCu%Ue(yzWg%F0RFRTpjIAi57?1m_LJ^ z5gzwX=EnccIv@Abb&lV?4YlhA_A*HzD(bSh%Jy+a#4PptkM*lcSvm{{cZ_RaHQvFEpC{$ zQbd?2+=C5dYrx``1t5Xws<7%Xn_sjA-Yq4GC=5=hcV4wHhl(QNxH`PdZHS^9!CG=x z)x7R$1oDlWXt5V0C(^@9f$KV8i3-fHg~>eOdME~QKX${Y7>Dlm8tFq?+tvIKeMUZ){cuxaW0Kvr zD+v*QeP{G7K#*w}@x80<=yy{4pS+U(FRv~^_UCK+6l7r4&RZ80%Z8LI{30(@ZLZ{P zrIzf|a-@bGT`D9ns18G5I{~|713`qeyv^-Wg*&ogu6oE1Yo?0q>i$FQhciGPE-OdO z;wmbDZ`FM19tH{fyn+`Khv3jd@L6@}0AFnA!neP(q z*|V+Lv+ddW*6jS|?73#M-lBIZ@qJZR)s8F<$;00SL zH+Z>IPQ$SAWAMDpUW@vG{@)Ma@p9~4{^`G2hVic?&BGcX3}syt_!RY zXMAU$dl65CT;W9F*-yg{GJgRm?Ga|4bGut?HM*?(_ZdA7kj@=kx;q}@N^4<=M zHclaGnz3VB?D&u2_(!K3Cp&wRjgxDyc4B8?IP>Gk-fIUoN0i3tySq0AwpgNXc7r9l KQNoh${r?6h8EupR literal 0 HcmV?d00001 diff --git a/seek/anjuke_com/__pycache__/house.cpython-312.pyc b/seek/anjuke_com/__pycache__/house.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..45dc55af6ed8ce405177b3e84e4b53b3ba172e3c GIT binary patch literal 3988 
zcmb_fZ)_9E6`!?tZO5^bC5iJF$l3vd32}7%36SGB+LDCmq})k?>ds187VjikXZ`19 z*CO#bRZgWMNLMvTt)l95YOksUC!`bisi^(XR)SCM2OEdlw+EkED&4oJp%uFO(l=wT zZBS5EsUz*on|W{E%zN|Z_h$d>_j?hPS^w|n7n>0J8|~Ods1n;D4xtUCB9+rol(r6? zOE{tqj?zxunc$;5qj_COxS}pb3%WbuiFz3A(wh?AsF%@hy*c5F`Z(l3lSuVkLuwQD z-f>peiuzSA=GA80GRLdFYkaho_U9a`A7~NpQd<@eB1EPf2z%oD?2Xo2we=3{l@_WW z^LHv86m3^U+%9)iyGY#yy+Ezk2E9(Z2lGJo1wA{j)gZIXEpy?Huc?aZFlTj)>o|dv z=5)%ONM(}hI3Xzl<82qFl5;7NP)seAoJnO!48y2!VH#gGfN<%l`FTtQJu?_DepfLl z+m|M(7&Z*q)Dk#m?~=Avr=jgb(76srgi(}JQPiPw*g5A|Lvx(!xaN!Us`DC(3TtSI zi@Ih|n74%Usf>ZGHm$xL*NnpRxoBWwj4Y;-aV3^c znTnYjIhTrMs1uEmSW3ksX>4LLVt`*plw>@!h~-!+F|t7Ih@_WEE2zr0FtZ}|x_%fl zc0u)g6+QO&Z@3Gd?tIVCBhTS&7(}sZs8AZY;bW${px!|9XoWLt{2Yomx`jO^Ho<548LL*465iXv&)|Q ziv752``N)Bn+N-rvNQ0;4TAF)9RaCepIS zVS`>gsZvNARpFV(CX=zj&hFC%j4^QbDk^t&uQdbUt*JlNZ|J3#{z6Otrbj9VdNyWn z&Xxj$g}~r#UomjJ6gXW7oX)>-p%}Qh=KU(rzc#+*@cGYjk9+$+9$a^qyLNAUaPx!w zz_DW2@pZl|?Yrgu)O-76Q94$VP8Fn6>+YZV%X>q=jQ=8jyQ{eOwbI_v!roCBZ7zon z+#33H==OXuGC!CuTDG?hql^Lr?j;jiG6ub@AGF(sN>*25$C_c zJr+ARd^dgh{^NI*yEFNda(>U{lBhfsm2V%n@1`R++P>F*F6_7;Zkj;C{Ubf&yMzb3d_X_g(>?yW@Zfa;=&+k+x1^H5vJ3|) zCsJxgr?gL&-_Iy|g+pk9V3~;w(X;bcGhYnrI%GwT(A;Fu`*D^XE};O&DvC=6eV;tP zhPE7>Fq|KH<6j8cf4ka*KGJT>RU?sLGeeRuiM$RT@r%@F<2A_R;S9m@A}&xzP-r@%FP{lzJodA1=Hyr zWTr>K=tk3bO5yQ7q?l5+w}A@?*~klf zDp1L@7wB#<)ACSgrEgrznM_?}@nrebluVy|Yz1-z#KI01Rpcnp27Q!Cl_07H7u^XN zgWE}0|e(~d{pZ_{5oJ%EfB-}+#0M}|iM>LiL-%;jqgry~eB`PZj z-d3)_A4bG%_z9?_vkPfDj=CJ%M1{qYXDj@!MBoFnlAxYcPPB!dSA-oJRZnOJ zv&4+4=>{wXfXPUfSSx1?Y1c5>^=Ai1=*f{$w)=*iRew)pt4T9t2sKlSk-b!s#?EJ` z0*RFjdAj1+Gh_l-6!(nZKm{1jabKanAwnW7H*0K+sJ!QUib%KSKczlx|{j&FPu{V2Q{sRZ@b list: + article_content = ArticleContent(information_source) + result = article_content.get_content() + article_content.finish() + return result + + +def content_task(news: TNews): + logger.info(f'{news.title} news_task start execute at {datetime.datetime.now()}', ) + ofweek_com_ai = ArticleContent(news) + ofweek_com_ai.do_seek_task() + ofweek_com_ai.finish() + 
logger.info(f'{news.title} news_task end execute at {datetime.datetime.now()}') + + +if __name__ == '__main__': + logger.info('This module is not for direct call!') + news_ = TNews() + news_.is_static = True + news_.url = 'https://sz.news.anjuke.com/louping-965203-pan528488.html' + content = get_content(news_) + logger.info(content) + logger.info('Done.') diff --git a/seek/anjuke_com/house.py b/seek/anjuke_com/house.py new file mode 100644 index 0000000..37caa93 --- /dev/null +++ b/seek/anjuke_com/house.py @@ -0,0 +1,62 @@ +import datetime + +from DrissionPage.errors import ElementNotFoundError + +from database.tinformationsource.model import TInformationSource +from database.tnews.model import TNews +from log.log_manager import logger +from seek.seek_base import SeekBase +from utils.time_utils import process_time + + +class House(SeekBase): + def __init__(self, information_source: TInformationSource): + super().__init__(information_source) + + def get_news(self): + news_result = [] + print(self.session.html) + _news_list = self.session.s_ele('.main-list').s_eles('.m-list-item clearfix') + for _news in _news_list: + try: + rs_news = TNews() + tmp_ = _news.s_ele('.item-col-right') + rs_news.title = tmp_.s_ele('tag:h3').text + rs_news.url = tmp_.s_ele('tag:a').link + rs_news.summary = tmp_.s_eles('tag:a')[1].text + rs_news.occurrence_date = process_time(tmp_.s_ele('.info__time').text) + rs_news.source = self.information_source.title + news_result.append(rs_news) + except ElementNotFoundError as e: + logger.error(f"ElementNotFoundError: {e} - Failed to find element in news item.") + except Exception as e: + logger.error(f'Unexpected error occurred: {e}') + return news_result + + +def get_news(information_source: TInformationSource) -> list: + instance = House(information_source) + news_list = instance.get_news() + instance.finish() + return news_list + + +def news_task(information_source: TInformationSource): + logger.info(f'{information_source.title} news_task start 
execute at {datetime.datetime.now()}', ) + instance = House(information_source) + instance.do_seek_task() + instance.finish() + logger.info(f'{information_source.title} news_task end execute at {datetime.datetime.now()}') + + +if __name__ == '__main__': + logger.info('This module is not for direct call!') + information_source_ = TInformationSource() + information_source_.is_static = True + information_source_.url = 'https://sz.news.anjuke.com/hot/' + information_source_.title = '房产_安居客' + news_task(information_source_) + # news_list_ = get_news(information_source_) + # for news in news_list_: + # print(news) + logger.info('Done.') diff --git a/seek/cnn_com/__init__.py b/seek/cnn_com/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/seek/cnn_com/content.py b/seek/cnn_com/content.py new file mode 100644 index 0000000..db7fe87 --- /dev/null +++ b/seek/cnn_com/content.py @@ -0,0 +1,58 @@ +import datetime + +from DrissionPage.errors import ElementNotFoundError + +from database.tinformationsource.model import TInformationSource +from database.tnews.model import TNews +from log.log_manager import logger +from seek.content_base import ContentBase + + +class ArticleContent(ContentBase): + def __init__(self, news: TNews): + super().__init__(news) + + def get_content(self): + content_ = '' + try: + content_ = self.session.s_ele('#detailContent').text + except ElementNotFoundError: + content_ = 'not found element' + return content_ + + def get_occurrence_date(self): + try: + header_time = self.session.s_ele('.header-time left') + year = header_time.s_ele('.year').text # 2023 + day = header_time.s_ele('.day').text # 12/27 + time = header_time.s_ele('.time').text # 08:05:11 + occurrence_date_ = f'{year}/{day} {time}' + print(occurrence_date_) + except ElementNotFoundError: + occurrence_date_ = None + return occurrence_date_ + +def get_content(information_source: TInformationSource) -> list: + article_content = ArticleContent(information_source) + result = 
article_content.get_content() + article_content.get_occurrence_date() + article_content.finish() + return result + + +def content_task(news: TNews): + logger.info(f'{news.title} news_task start execute at {datetime.datetime.now()}', ) + article_content = ArticleContent(news) + article_content.do_seek_task() + article_content.finish() + logger.info(f'{news.title} news_task end execute at {datetime.datetime.now()}') + + +if __name__ == '__main__': + logger.info('This module is not for direct call!') + news_ = TNews() + news_.is_static = True + news_.url = 'https://www.news.cn/politics/leaders/20241227/90e76f85ad4a43ba94802b07c5736e00/c.html' + content = get_content(news_) + logger.info(content) + logger.info('Done.') diff --git a/seek/cnn_com/edition.py b/seek/cnn_com/edition.py new file mode 100644 index 0000000..cd6a416 --- /dev/null +++ b/seek/cnn_com/edition.py @@ -0,0 +1,62 @@ +import datetime + +from DrissionPage.errors import ElementNotFoundError + +from database.tinformationsource.model import TInformationSource +from database.tnews.model import TNews +from log.log_manager import logger +from seek.seek_base import SeekBase + + +class Edition(SeekBase): + def __init__(self, information_source: TInformationSource): + super().__init__(information_source) + + def get_news(self): + news_result = [] + # _news_list = self.tab.s_ele('.zone zone--t-light zone-2-observer').s_eles('.stack') + # _news_list = self.tab.s_ele('.zone zone--t-light zone-2-observer').s_eles('.stack__items ') + _news_list = self.tab.s_ele('.zone zone--t-light zone-2-observer').s_eles('tag:a') + for _news in _news_list: + print(_news.html) + try: + rs_news = TNews() + rs_news.title = _news.text + rs_news.url = _news.link + # rs_news.summary = tmp_.s_eles('tag:a')[1].text + # rs_news.occurrence_date = self.process_time(tmp_.s_ele('.info__time').text) + rs_news.source = self.information_source.title + news_result.append(rs_news) + except ElementNotFoundError as e: + 
logger.error(f"ElementNotFoundError: {e} - Failed to find element in news item.") + except Exception as e: + logger.error(f'Unexpected error occurred: {e}') + return news_result + + +def get_news(information_source: TInformationSource) -> list: + instance = Edition(information_source) + news_list = instance.get_news() + instance.finish() + return news_list + + +def news_task(information_source: TInformationSource): + logger.info(f'{information_source.title} news_task start execute at {datetime.datetime.now()}', ) + instance = Edition(information_source) + instance.do_seek_task() + instance.finish() + logger.info(f'{information_source.title} news_task end execute at {datetime.datetime.now()}') + + +if __name__ == '__main__': + logger.info('This module is not for direct call!') + information_source_ = TInformationSource() + information_source_.is_static = False + information_source_.url = 'https://edition.cnn.com/' + information_source_.title = 'edition_CNN' + # news_task(information_source_) + news_list_ = get_news(information_source_) + for news in news_list_: + print(news) + logger.info('Done.') diff --git a/seek/content_base.py b/seek/content_base.py new file mode 100644 index 0000000..f3e1827 --- /dev/null +++ b/seek/content_base.py @@ -0,0 +1,50 @@ +from abc import ABC, abstractmethod + +from DrissionPage import Chromium, SessionPage, ChromiumOptions + +from database.database import get_session +from database.tnews.crud import update_news_by_id +from database.tnews.model import TNews +from log.log_manager import log + + +class ContentBase(ABC): + def __init__(self, news: TNews): + self.news = news + self.session = None # 初始化为 None + self.browser = None # 初始化为 None + if news.is_static: + self.session = SessionPage() + self.session.get(news.url) + else: + co = ChromiumOptions() + self.browser = Chromium(addr_or_opts=co) + # self.tab = self.browser.latest_tab + self.tab = self.browser.new_tab() + self.tab.get(news.url) + + @abstractmethod + def get_content(self): 
+ """Abstract method to fetch news from a specific source.""" + pass + + def get_occurrence_date(self): + return None + + def do_seek_task(self): + """Saves the list of news to the database if the URL does not already exist.""" + self.news.content = self.get_content() + if self.news.occurrence_date is None: + self.news.occurrence_date = self.get_occurrence_date() + with get_session() as db: + update_news_by_id(db, self.news) + log(f'successful fetch {self.news.title} news content into the database.') + + def finish(self): + """Closes the browser and session.""" + if self.tab: + self.tab.close() + # if self.browser: + # self.browser.quit() + if self.session: + self.session.close() \ No newline at end of file diff --git a/seek/fang_com/__init__.py b/seek/fang_com/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/seek/fang_com/__pycache__/__init__.cpython-312.pyc b/seek/fang_com/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f1d04b13a7d666a501efc2c5bea22528dd4b7c6a GIT binary patch literal 136 zcmX@j%ge<81Rw5Zq=V?kAOanHW&w&!XQ*V*Wb|9fP{ah}eFmxdr5<4wlboND8dH#3 zl3El~oSK>)la`p59-o|_8xtR&nU`4-AFo$X`HRCQH$SB`C)KWq6{wF9h>JmtkIamW Jj77{q769!8AHM(q literal 0 HcmV?d00001 diff --git a/seek/fang_com/__pycache__/content.cpython-312.pyc b/seek/fang_com/__pycache__/content.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..93410dd4752928345c28285f23d3193e2f6e6c30 GIT binary patch literal 2735 zcmb_d-ER{|5Z}F9J5FN9b`pqjQ8*w_Ou$i9p!p~jZTbEPl%%4#QBSUK9URWSbnmQy zWh7UML{L>JNUcCZt@>ERKhR3N_n|Lw6J&FRr;3NZSq-faPo25TB~D5oDs`mWo!yz8 z+nM>z?9ZW48v*+1QhxF)A0fZt!XsEaw7icJa))R{qdH09Ea@~ar6h{Htjl?pVh+!A zU*4bcJG@VC$p=yahxhBP`L^M2t&>2c{JFO!KIPAHe;P&_4G+a2F&M)gnBqcCIXm7KK%pw*t)& zk9sFl-T!8tcY8XH^;h4Zb4}+uy-pKdzvFL46V&cLrt>^6Sfhq@+$a{bV?r1L_B+Q< z6|#oNr>&e(7&D3@!(rDqKFViIVEnoAifQsJN%`i;HJb8|kp#0l-W66Zqw^KZHY>nSGg6arw2j|# 
z9}Hj9JH%QQ-jd4XOF%}MHu+5M)6s-v`^@4r7j{5ZbA_CxstMUFP4x*7$!Tg5+v2c^eKvn;%@sSw;+U0_)75&VQZ41 z3G*Zj6;kioG#6+{a%+FR=e0ZK+vQqMqS}-Ae*a?6V6EqHwde5M$%daqVsqga&wV7) z_XnY{r24CSK|>&94H7*LH45Aop4) zSnlCWY-hNX@3@q#=HW1y&$r~X}3tdJmPiDqHe9!j+<#GT+R@KGv+=R zpawSgE$n)0ar0p1d_A$buJ*cg{);y((LqJPg+{)#-Ef~#>d4vJ$c5_2 zg~gGJ3&Udzk?|#V=}GsF?+z?>@2Q-tvuKU=R$1>7Qye~4Ww9j||A)1~hq2nw>FUtw z+R$iqXmoMt!ouJO3z3VD*qDP?Y$*A3uAr)=o#R(>rsDP#1vs~d3PsBap0SinTGzLa zm-??**0ec1FkqIF80{p!FiG%wz?kNRfrD=h9(wcO(4piNE3cP)M~wncCN_!J!B@NM zsK_}T_(FP;Cpo^|rqh4z?GWnpr|>2xt)|xEc9taEi0cB&!tP$vbN=m$uszsec;l1s zfj0#(FWiJiFDcSv$?08BN&;4yQHvjQ+ z0#hT-Xy-!Tp$38ZhXczvH9CBBoX!`Q3C@iOVZpi5FY=Z;3s;UKP!=Kwm)N1l!RQw! uD#z-ZYa2FAihxbLs7r9A3&C{YpV}IyZK`>ecg-DKk2}3!X~j1n$mN!p|WG8bzw-GE%+R zciUOpE77X@Ft4`YwkclqU*;15I-YWG z9M6~|nOs^OCL}{(zWvNtdMZOwim7GN6PX-IVwe@qjN>^22$!Cjp2k$rGlB8!4-|v4 z{aKPpV#AP4ErpY|m9*bHG_sOy0-9m0mp?O9>XoL}{hYDyMU#Z;uuoypQE;3>Klukji%u^x z>7K0FnYhRooLlO5)-?zPVM__#mYDI_S{r?Vm+N-DcD=u`&Zb~}-PZ3|FT7^|x>aVd zu2tCbv|P3()!hZ6Tlr&4fL#oss78oZ{g&E0!0I}`F$b&aJ{Gu{o(9LJ3iZ?E(A6pr zc|3u7k-6RG??z;3(e)j%^4Qi9;UmDnMc2<zO0Gk84W|(=G zsZ0;)TAq)a^JZS4Bc;}5^By`fvP$}EdWG^rTr+Vh-?0_24N3Vv>3}q%XgXFUGb2rD zX;s2C2$Zz6M0b?v`gp$QY#PsJanb}yOi@wFB$GKpusS5g+AX&MKnK_e5F&#D%f&i_ z(A2hg6VIELU}`3CcwI}+TFxBNVNouXQpf_K31|g26{eF|RuzB~20E@9x>+tIo5g9B z&>K!%R9;|aSl;3JB+gP?j=70Q87z-Y@&j2huIQ(23{2E9L0 zt&w)B!846bCSs$OJ*N7N0XWztRPE?oZmEX5Rwu7bR>FOyaNkXTIXqAaA1{TE-#u3j zk1YGX3h!DTUUvjqPjZjCcU|vWaaTKfR%frxe)eX$BevqKO1(F{pL=iim8C-!X|N;> zuDE~ITHP7>o%UPprn|iJt;)`U(#`>xZK+20+=zW1yE$5p9m7>RBHt=ZQkvqm8 z^S{skWnX#lhksQ{gCAA~&z1(ymiPT=#l0r>RU>;Uk)x%^(Mse*DRKhVv{l8PiWn`4 z(a#1Siv8ZC0*bocGMy~Q0TD&o5j#CN}4cT(;PgObKLWJhX;jw!EM7H;a;o@zV|(Tn7JS796lu6KO_JhbJI$cR#I4&A&zn?qvmu< z`(^p3IYqB=2oFkGi6Jo>yVlJrmBhLZ6^tXa$Qbl4oMgcz6m{5#0*yi663;KAbq6OL zDE5E<-w68uxI#h?p;%ZGt|cPDW{#v^DoivPSz($40*Y+CXcdqRG*GIzt%^dutT=#Q zF94usv)OzO6&w?Y&~h1b*34mUy5O#A#wD0z)_JtFVJdKSEN?yEH>MAJ1b(OBu2_&@6uS6_h0$o;}6#OaJ3~=S1lBPT4b1A-oORrtE_O^lU<8lb!-=uN*x+xL2C3 
z0gjfNp>LB{T7%yR=>O!Epegzn$T*Y%0dA1VJXIgy(TGXJSb)rF?f27XY%E z7xUDRl4q~QZgA7`P-La=QOlXm%&~m3{Axy~FE_RWc?-m14whBqZJ-SrPpOtfo5@8j zAxB|4PWeBF52O-+?;y6zruHw_^vg+r;&JDDjc~tUl~1D8a-DToh*$`mPgMQN93Y-;UTZE6s3TX zQna)z=X=jw(hRAF84^(Gj48nrUQ#szRZL1Mx*j=`@494~S!3wnK_k!VYn;Abj8F_Fhne+FmTmrtkS(uf$dF{FmL&8^MQGOCLNzd? zwef}LcUv xjq2{b@Ll60)t$!}(NmQUF=EG;d~2~Svc^Z(JxCC1t5@6)`Oft);=7p_{{@#?UHkw5 literal 0 HcmV?d00001 diff --git a/seek/fang_com/content.py b/seek/fang_com/content.py new file mode 100644 index 0000000..8d2cf1b --- /dev/null +++ b/seek/fang_com/content.py @@ -0,0 +1,46 @@ +import datetime + +from DrissionPage.errors import ElementNotFoundError + +from database.tinformationsource.model import TInformationSource +from database.tnews.model import TNews +from log.log_manager import logger +from seek.content_base import ContentBase + + +class ArticleContent(ContentBase): + def __init__(self, news: TNews): + super().__init__(news) + + def get_content(self): + content_ = '' + try: + content_ = self.session.s_ele('.^news-text').text + except ElementNotFoundError: + content_ = 'not found element' + return content_ + + +def get_content(information_source: TInformationSource) -> list: + article_content = ArticleContent(information_source) + result = article_content.get_content() + article_content.finish() + return result + + +def content_task(news: TNews): + logger.info(f'{news.title} news_task start execute at {datetime.datetime.now()}', ) + ofweek_com_ai = ArticleContent(news) + ofweek_com_ai.do_seek_task() + ofweek_com_ai.finish() + logger.info(f'{news.title} news_task end execute at {datetime.datetime.now()}') + + +if __name__ == '__main__': + logger.info('This module is not for direct call!') + news_ = TNews() + news_.is_static = True + news_.url = 'https://sz.news.fang.com/open/51863596.html' + content = get_content(news_) + logger.info(content) + logger.info('Done.') diff --git a/seek/fang_com/house.py b/seek/fang_com/house.py new file mode 100644 index 0000000..3f50cc1 
--- /dev/null +++ b/seek/fang_com/house.py @@ -0,0 +1,64 @@ +import datetime + +from DrissionPage.errors import ElementNotFoundError + +from database.tinformationsource.model import TInformationSource +from database.tnews.model import TNews +from log.log_manager import logger +from seek.seek_base import SeekBase +from utils.time_utils import process_time + + +class House(SeekBase): + def __init__(self, information_source: TInformationSource): + super().__init__(information_source) + + def get_news(self): + news_result = [] + _news_list = self.session.s_ele('.news-list').s_eles('tag:li') + for _news in _news_list: + try: + rs_news = TNews() + tmp = _news.s_ele('.txt') + rs_news.title = tmp.s_ele('tag:a').text + rs_news.url = tmp.s_ele('tag:a').link + rs_news.summary = tmp.s_ele('tag:p').text + rs_news.occurrence_date = process_time(tmp.s_eles('tag:span')[1].text) + rs_news.source = self.information_source.title + news_result.append(rs_news) + except ElementNotFoundError as e: + if _news.s_ele('.item'): + # 此为视频内容,跳过 + continue + logger.error(f"ElementNotFoundError: {e} - Failed to find element in news item.") + except Exception as e: + logger.error(f'Unexpected error occurred: {e}') + return news_result + + +def get_news(information_source: TInformationSource) -> list: + instance = House(information_source) + news_list = instance.get_news() + instance.finish() + return news_list + + +def news_task(information_source: TInformationSource): + logger.info(f'{information_source.title} news_task start execute at {datetime.datetime.now()}', ) + instance = House(information_source) + instance.do_seek_task() + instance.finish() + logger.info(f'{information_source.title} news_task end execute at {datetime.datetime.now()}') + + +if __name__ == '__main__': + logger.info('This module is not for direct call!') + information_source_ = TInformationSource() + information_source_.is_static = True + information_source_.url = 'https://sz.news.fang.com/' + information_source_.title = 
'房产_房天下' + news_task(information_source_) + # news_list_ = get_news(information_source_) + # for news in news_list_: + # print(news) + logger.info('Done.') diff --git a/seek/focus_cn/__init__.py b/seek/focus_cn/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/seek/focus_cn/__pycache__/__init__.cpython-312.pyc b/seek/focus_cn/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..158038b57c5c45ef9a831a5ff0750efbd951c173 GIT binary patch literal 136 zcmX@j%ge<81g?iO(n0iN5P=Rpvj9b=GgLBYGWxA#C}INgK7-W!Qjf5TNzPA6jVVYi zNiB*gPEE~@Ny|?zEsjsli;0iV%*!l^kJl@x{Ka9Do1apelWJGQ3e?94#Kj=SM`lJw J#v*1Q3jmKL9}oZl literal 0 HcmV?d00001 diff --git a/seek/focus_cn/__pycache__/content.cpython-312.pyc b/seek/focus_cn/__pycache__/content.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1e7e6883efb94c7eb759b1e46245d5cd723b566e GIT binary patch literal 2742 zcmb_dTW=Fb6h6BvOu6XMNUBha;Yj>2zLx6sao7rE7v;?4r?!+T{mDE z$*m$0)K&@)C`hPPAB*?_?F0XyFLo1TvxTRMhrU@2tq@N=v+K>R^r2FZv}exDIdf*_ zT)zD|5apmK3n!`?fH&WhX5j+0JUuvR3GhlAUW<+ooYKJY6taaiP||!QUTRZy;)HW zpd6$jHF#w&0Gkkj`le-P&swgA9^m~rasSl&z?ncQqK0Wi>1s_ujo_@?ouOrbM%SuzwQ6%+xGu!If9J;}+b0rYJY;JgOH=bnsr-M3rVC<(Y$PLdrW1abgAEWo9m;(Uxb66!6fDxW+m6&A;UX$nFT3 zd!Z~SD+cC@?Jcnk-?Rgig*6d?xR{)XOICZ%Pz)1an6W%Y@hWAOPf>D(oT(_NB?i^9 zj8~u4t|%ETZ5WDT@Ka^E@^a5dBa<0jrIS~wN!g@9>6OW>o+%nirhp1NB#EonSrF|@ z(e)2XxTpA0X+)8|s6b=$);aQZ@QE*QtF7jX-R*hg+qR4<$h3-w%Run6AO1NC3(yW_ znC9ynXqww>Aax}K)7;Ajx`6>4hh1QLT}2---YDV?;Sz>#9A@B}bO~mJxRfmU5@{zz zrAR?Hj7U8R%Q`A&i zno{qEpsmq`(bzbOtO89@YJam4eSM*PyIhaPYti@*{fp6q_2@_~Ix=^%>4k7?F7)EL z2f}@SfWU40Z=Gy}qI0392d6I$o`f%V$KihGko2Q4B*oi!85Giaswh^6qU3e8sBzh^ zD4!M6nxkPHeC+J97)n;4RU#RxX;?l2v+u}Lcf@`h!y*Kmb)sPOAbT;3w--Iz?|$$n z;P98v_b+8C!6}JyALmJ8mgRc1h&)zduxHci-G>i@doLxlinKXT_Rf=osr5pCp^3-*M)%B ztQ~?BK7MTordNOfEn0*f^NRYvhY+$_`_n63;O+vL<{8{TZh-E=w~`xd0Zg7^n>M5% z;I6NZt_KXUFwI+I1Fh;;6Z@ZDCH`Npx(>Yax5ylf6*FyIkr6wYDbs5-Q#7fZHsw<9 
z8ZG6Ju&Rsu7!RsNcv)K>GiM?&_|&vX&0L;ZQbC__>fW#F3Wrns3?}0q1PvR81A2Cb zD>8asN$1ufhu_5$elgaMo3`U;l#yY5k&c^ZzBn^KxG-{iWPV?D+sMPA`smr(=-K+{ zh1%$a#nFq^k@0GHVu^hCq-W>%gNr?TE9V*{QYYJLWZM#vZ8=sWu_e;`4{t*cWA)+F zwc*qC;bd(%xj1~GI`n=ueDM((x9gR|Nj{w`C`xJ9#O0hJJBT8qyI zKl}uK_>uy;XHGIBJW&ZZ_WT6ahSp9ZiIy`=r{Q@**k;c9xHFGkLyRM@B^8-wwY6N_ z2dBgN-{7V1Z6{FgutPZIux5ORtSJeC@GFF#!QeAE@)Y(wg`H0!{@mLk#O6PK4mdP> ziO^N;JJbXme;QooLo?_RCWQIoGVrk(2I8M9{UU8{kWl3~FOO>Y;1W6X*dO`oL}je8 rskbsVd%h7K#^KcC;O4u#7lU%;#ESKN+Y*VbnLQ%0W(Y{HegD4!1*T#` literal 0 HcmV?d00001 diff --git a/seek/focus_cn/__pycache__/house.cpython-312.pyc b/seek/focus_cn/__pycache__/house.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9fc7899ccec6f382b2f67ee5188f31149e99237b GIT binary patch literal 4015 zcmb_fZ)_9E6`!@&_ByeXC5iKwKWir(m?RFp1A-zBP!AF!P^c8HB*a%IPRddxy>? z98m{HX{YW?@KK)8yzWZ4qi#mKbWcKv3XFE^-b7urj?o^yKH-b{IOIShNEMcm>cw?; zou$2^ezgwsYCUe4;#J==A8n-kltc9cE#hrz!W2LcGT|wA!nUcfnTLsD8}f zEiowCtctiva##g?n7Juyek4s-+hY|^9tDYu5q%H(y&0{RhT$O2bAsd@A%Hx_nS zE+ajIi6zLgmefpH4m&O0!1@%ahvk-6mSegC{>TQ^ElEo+dd?3`#8N7rn879{69zax zF_ns?4LO#axJ*rr%*>NUkj^wP%OkdGKa3gM;Pre7Jr?}eJvpI0+tK?-=-Y%r6f0)~ zrI807=1mj4*U&UtT=8a` ztM;nPUZuJ_U>Npc$U-GvvyaDK5#QR*ipM5Z$l7Z(3I9oxJqH%iY$@qM@$EKU?c)tq z+NNL?%{$f>xOmW}tNlXdyg2k>k%t>Gj=GTJb%f)=NcwI|g0kIh!7CX~Kw>(BVkx9TZ8$w-a^~{T-*LtzPK~=i}=stcbade z?kKsP`$4Y07}|5Y_mkdiWGo*#TL_)Yh0einOv ze&58ZXIrgT|LaUlV>(_A>Uqa!O zy(n55^yB6EGTLx(uKsNA+y6w+|I6Lv>LL`JOTwjGB-l)o2KM9|_b#0(itPn4m=l9}v3qH>$osCHegEt_A1Kzh zWce0$7%#a_Zy|F%3QpI$zE#nOqE4gqN&}$gOsa>qc!jb@5ZbD+-z`rAAlHC=wXDB# z^p#Ro)oN>iqvhh2qO{d~HK6~KSC0ReSMw0_vCMV`(Xy!+S0o5BF(o{QV`&phiYa9} zYi0o>Yv;m32`qW`c}felB~WmsAHbF~nVMz!Wck#TOg|Ob4&+S`3p-d=k+*<0XfcvX zNmNcQY6%&F;XXS5LwG?d@pacuzkfR0yt~lcn``dPHuYuuPW`#PFdEH`Mhl}CbE6mY zqnEO$l&mNP zg{3G3jD(^kWjWJ*?y_b`CCre3N~TN+{uv}ygIa7#F-6yd!Viw z#F7IUZ7!W0NK*j#YUTHT``vB%tJR-B{b2R!@~@u$;I|pqiBu9t!fj*_1gz#0L^BL1 z$lp??afFp6gB2<(2xqNK!p}# list: + article_content = ArticleContent(information_source) 
+ result = article_content.get_content() + article_content.finish() + return result + + +def content_task(news: TNews): + logger.info(f'{news.title} news_task start execute at {datetime.datetime.now()}', ) + ofweek_com_ai = ArticleContent(news) + ofweek_com_ai.do_seek_task() + ofweek_com_ai.finish() + logger.info(f'{news.title} news_task end execute at {datetime.datetime.now()}') + + +if __name__ == '__main__': + logger.info('This module is not for direct call!') + news_ = TNews() + news_.is_static = True + news_.url = 'https://www.focus.cn/a/842171870_124752' + content = get_content(news_) + logger.info(content) + logger.info('Done.') diff --git a/seek/focus_cn/house.py b/seek/focus_cn/house.py new file mode 100644 index 0000000..92c9032 --- /dev/null +++ b/seek/focus_cn/house.py @@ -0,0 +1,62 @@ +import datetime + +from DrissionPage.errors import ElementNotFoundError + +from database.tinformationsource.model import TInformationSource +from database.tnews.model import TNews +from log.log_manager import logger +from seek.seek_base import SeekBase +from utils.time_utils import process_time + + +class House(SeekBase): + def __init__(self, information_source: TInformationSource): + super().__init__(information_source) + + def get_news(self): + news_result = [] + self.tab.wait.ele_displayed('.FeedList') + _news_list = self.tab.s_ele('.cbd-recommend').s_eles('.FeedList') + for _news in _news_list: + try: + rs_news = TNews() + rs_news.title = _news.s_ele('.item-text-content-title').text + link = _news.s_ele('tag:a').link + rs_news.url = link.split('?')[0] + rs_news.summary = _news.s_ele('.item-text-content-description').text + rs_news.occurrence_date = process_time(_news.s_ele('.extra-info-item').text) + rs_news.source = self.information_source.title + news_result.append(rs_news) + except ElementNotFoundError as e: + logger.error(f"ElementNotFoundError: {e} - Failed to find element in news item.") + except Exception as e: + logger.error(f'Unexpected error occurred: {e}') 
+ return news_result + + +def get_news(information_source: TInformationSource) -> list: + instance = House(information_source) + news_list = instance.get_news() + instance.finish() + return news_list + + +def news_task(information_source: TInformationSource): + logger.info(f'{information_source.title} news_task start execute at {datetime.datetime.now()}', ) + instance = House(information_source) + instance.do_seek_task() + instance.finish() + logger.info(f'{information_source.title} news_task end execute at {datetime.datetime.now()}') + + +if __name__ == '__main__': + logger.info('This module is not for direct call!') + information_source_ = TInformationSource() + information_source_.is_static = False + information_source_.url = 'https://sz.focus.cn/zixun/' + information_source_.title = '房产_搜狐焦点' + news_task(information_source_) + # news_list_ = get_news(information_source_) + # for news in news_list_: + # print(news) + logger.info('Done.') diff --git a/seek/leju_com/__init__.py b/seek/leju_com/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/seek/leju_com/__pycache__/__init__.cpython-312.pyc b/seek/leju_com/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1c197b5f5e5e715923a9d4e1de706d7fbd899af7 GIT binary patch literal 136 zcmX@j%ge<81b!zn(n0iN5P=Rpvj9b=GgLBYGWxA#C}INgK7-W!Qjf5TNzPA6jVVYi zNiB*gPEE~@$w|#BjZeFDH(l4MTElm*AM4XIoo!ULyW$#Q{ z%asg@)Kmy5eLyLyK)hD{1K@%GfESyjvN_r()Cb-yQc%?=X6}568_EL`Yk78dXLfF9 z<~OrH1p-|J#(U>-3txK(`3)EDfVDj~4p2f?iAFT4lLXF^PIFR1qR7j-oMQ=Q^Gx^T zya}((d-RT+FX6L!uilyKN_0^okrPDgSR$H_ciom9^NDV)lQXT0`_oM8USf%W=I7qD zqy>QQ;X$qE!hS-;oJ6=Ut2k%csTRBq_rrnqx%alSAc?RR;$gM7wIo^?Ry*7kXa;!L zJ(=kHH|xC5)v>KV`3AjPI+y8X8r$(Zer6^{t-fPA&+)uDQ!tMgih1pr5CsAIy>oBn z(*==Bnwdg=wonu)4!fSY8NO%$;7V$A9}EX+(w=Pk3Vw zsv2lpQ7Vu2eK0kjDrkKEA~(61H#omAuk&nCO%-w=upN(IyexV^zZ7}=+HvLck!K_8HZOK^SadG^M8e-fDf@_IZ`fn{&2 zS-qL9N>dHH)GV*yW>MrH0|&;{2FE-w0QpVel!*&D(fcUk8kO24FgOF|S 
z51@nts^j%lgX80qV9hh&mDq)35RG|k zbXlDlLj`ofgKx=2p<5j-L$`?20r0_izPr+adg7)2IGWWR`p}*0A)5JQlae_nW8q$!)87NMB z-r!PvoaZR*GF&MkE-+d!#FRVkX0&K{Qr@_i(Qd<+YKgZn+GF@rfp~yJJQ_!u_cGFa zxaFp+c2+#7wO~Q>@h;$W<1UB8p{oIb zp7Lx>8WK3P#4T~r&cD+GrlOqPIcDG#PFqtM>r^J2*2W0Q5SVX2Gnt;vkd$iane=of zOA;7n#WPd*k_m*{$jr@Qy3jj~@%)RbN!ft~l1X6GR4hG(6OIkEuU{$J*bjp%;6RMx zoQC4O#$neizlvr#jlUd-3!3XPii@jgk&C;hQB<(SiA>hScDvqGM=_Z`34YTF^Z@$X zppN({cMVw$<{Nw-y$SJ?=bE>*gg)S=qP#7d*#%5&ucGK_-BOgO%N9&*%o0Crw|$C| zFjTNdF{x}>Uio6r*^!wajv;*-IDuG-SH zhMh?#Yc{RWc;^!Qp~+Y57ry5FhE`UGBmXNTiT_EG!vl>NE>T(Fg+^{VvJpx)_{}-k)vzwlC3~A3 zhsrnXfb6{^KzvT4eq`-%_fDH- z=S148wQeCN#9)})_AIy=k#hs`LHU%b8(5RAj6AESH5u1{N!HVH8bC+ZEu4zwdcT*( ziwihm!AeZAO3ow_SwgTjB1hY8j|uPthy{tQfB@UgI+M_lv;_+mfHah?-R#0VI z*Q&n~?!Gqj)=W7(Pz(>;Xf1_@%i+;tc=YySDLk>-axc7lb*#$!gD1HAJ-go-So2im zz1RKk`)`bwx1tP-k2-( z4VC*|DE7Sov!VO@hi+wm{o04G{eGx4{8D-NbaD7}Y5yy0o^@%U64_gh94$tUmLsFZ z$S5pttw_CPsjn#Y{bKm8G+1pz+vI8wl6ni?o+poesB_;x5f{9`-Iumq3%nI5>^^c! zy)|7psuXsfFH34sQop?4v4gGw6Pz_8x4(J2zp!((EFJ$uI{szVMY&&?6#IYm;_>ec z@gEQQ#scEU$GcDN6z}W|0DULgHRcij=n;XAdMM(j)f83~@QjkmXjy~O0Y&*iRyArI zB7iss&xr>MgWvuag8p;2 zB=!=DO0~tc&?4B%lJrvrfQA<<05d>9F>4fuJaS+ON&@#Rcy7Qq5BvrcgP3}A_#Ddf z(}>Wr6m`|qVP?7^`*ian%rWh}dfK#9NCw;2h~RoPlxt?wUF?SXfiMT*0|hSBSqKl5 zLi<)GDpFTjiWH?tN$OvjtO$WCr(Zw4E`%%o&Vta%F5@ZFX(}+&<6!h=)03O)JuIRP z@R)jOO9MpYEGmb!WP@^M0I_{d+M%3_0qom^rWU;g*0x+dy|^vSkpNfACFvVrOKb9* z4*j33;{RV(ErHLcayuBZDwb-_%iv^U$#@YbvKE$AOV0Id;sWGt=0&|V>=W3ltQ*X< zy%a0yyU%u|GnZIA*#Rx1&=(Rrfy7{0lxI;z4gqaaf6BEWss|UTFHfE;PM#}Go-a(O1?j@NfH#Mp5GV`X zMWK6L=xqkxGXc`6;3Gy>HJ+mWg$udBc&l~_-Fo!VFF*Uu+QScjoD)xG(l{3FA_;> zZF&{g3RDdA!UlsbyR*5D9$aU!QW%2wdklVHQyRpRnmb#mvtrC+R6L;%FD8WwPdAD59pta-VsP!2cmco@3f?(yaboS8ShdDiCLuo*?u+L|EXoG+;%s1xnlE z0@2gf2dD;=wR-ri+I^3bG1y4a(EJEK0Q?+x4}~70!w=Dk2k09QP~QX8{{TfFyF0nw z)rrRlx@t_|!iC85RRsNQVS{$nR*}2F75WZuAWBpvBm`D-pSZd!LTKd_3pnY>x-j%< z&w)};tRQT!?AZt3bs7LZHDtzh list: + article_content = ArticleContent(information_source) + result = article_content.get_content() + 
article_content.finish() + return result + + +def content_task(news: TNews): + logger.info(f'{news.title} news_task start execute at {datetime.datetime.now()}', ) + ofweek_com_ai = ArticleContent(news) + ofweek_com_ai.do_seek_task() + ofweek_com_ai.finish() + logger.info(f'{news.title} news_task end execute at {datetime.datetime.now()}') + + +if __name__ == '__main__': + logger.info('This module is not for direct call!') + news_ = TNews() + news_.is_static = True + news_.url = 'https://sz.leju.com/news/2024-12-18/18427272536617796292963.shtml' + content = get_content(news_) + logger.info(content) + logger.info('Done.') diff --git a/seek/leju_com/house.py b/seek/leju_com/house.py new file mode 100644 index 0000000..1f9fc90 --- /dev/null +++ b/seek/leju_com/house.py @@ -0,0 +1,60 @@ +import datetime + +from DrissionPage.errors import ElementNotFoundError + +from database.tinformationsource.model import TInformationSource +from database.tnews.model import TNews +from log.log_manager import logger +from seek.seek_base import SeekBase +from utils.time_utils import process_time + + +class House(SeekBase): + def __init__(self, information_source: TInformationSource): + super().__init__(information_source) + + def get_news(self): + news_result = [] + _news_list = self.session.s_ele('.sf_listPage').s_eles('tag:li') + for _news in _news_list: + try: + rs_news = TNews() + rs_news.title = _news.s_ele('tag:a').text + rs_news.url = _news.s_ele('tag:a').link + rs_news.summary = _news.s_ele('tag:p').text + rs_news.occurrence_date = process_time(_news.s_ele('.tag').text) + rs_news.source = self.information_source.title + news_result.append(rs_news) + except ElementNotFoundError as e: + logger.error(f"ElementNotFoundError: {e} - Failed to find element in news item.") + except Exception as e: + logger.error(f'Unexpected error occurred: {e}') + return news_result + + +def get_news(information_source: TInformationSource) -> list: + instance = House(information_source) + news_list = 
instance.get_news() + instance.finish() + return news_list + + +def news_task(information_source: TInformationSource): + logger.info(f'{information_source.title} news_task start execute at {datetime.datetime.now()}', ) + instance = House(information_source) + instance.do_seek_task() + instance.finish() + logger.info(f'{information_source.title} news_task end execute at {datetime.datetime.now()}') + + +if __name__ == '__main__': + logger.info('This module is not for direct call!') + information_source_ = TInformationSource() + information_source_.is_static = True + information_source_.url = 'https://sz.leju.com/news/' + information_source_.title = '房产_新浪乐居' + # news_task(information_source_) + news_list_ = get_news(information_source_) + for news in news_list_: + print(news) + logger.info('Done.') diff --git a/seek/mittr_com/__init__.py b/seek/mittr_com/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/seek/mittr_com/__pycache__/__init__.cpython-312.pyc b/seek/mittr_com/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1bb918beec3b6a3433fd9958dc8f6eb9ade85ca3 GIT binary patch literal 137 zcmX@j%ge<81d{hM(n0iN5P=Rpvj9b=GgLBYGWxA#C}INgK7-W!(ulB%NzPA6jVVYi zNiB*gPEE~@$;~V&DT+_d&y9(X&&%kivWGA7iZ@~g#3z&fMD&><`GKB3NeXEZIZ=Vv1w7sDinFuR*NjlJf7L1 zVmKT2_>kREjASDoAGSM-UD+;56mp7~9gD<_@UB~`Z$8^?c5-HR@o1iz-HR+6Gow75 zSIijjJv?sqTs%aGm{tgn6g2-#`)I~*!TpfXJ`dmW7bKfB6FjN+wwA<9!fIEr0?imt z1}C$9|7M-{1v;MfN8g}#Tjw&pOw+r5$KOn*soQtL=0#qTlVy3bTq&6+geVKx@0~tf z%9lmakXE@gRj!B}huzThB%gDD3ESn_S&oX`uarv?47_YOJQrLZ=T`K{=3!V|h5T@m zr6$QLCgo~gSt5CADvQx9Gu1_s4K0y*nhj5pG;@273u)zS-m>hn68to$v>8X+_-zis z@C98Va!YtkSsoK!r=29`v{m7(n%Ib0rz)M0d;nex#e&Ht?ZyH zbb(Hh14M=c!4MHx2+9@u9t3NO%#nHZJu*kr>SQ&NY57)7mdaAgBQiAZBLxzqO5+0v zrVcwUlYCwZyiZyYahTjl>#Xiz$}~+RakqPxONg2mvFgB%NVUn(gmWB*8fhwO=QEA| z$5-aB&#!!Z{o`*6YyC$z`j6H7k1f5{43k7^DSrQ6h$M#oAQUwZUwf?)?_Y{HL$G|{ zVB9}Fm?k&7N7e7>sG9D;)R&AR*LAl`*NbJdVj~~b^$#nC?Mno<6)&M81)Lje!6L_P 
z8z4gkX41i=z2tc&I#C~oZWQ#vaDNf*PD4-Lc>7NR(_i6GC?&AYJvF!2+Ab~K@oLKr z3oa|7^bi;@s2&(*fB}qe0cC*%*v7Oz3JgJyxuSqP=#yY2pg>N^f-*$}Mk=lPo_J1& z^H3qq1=xE6L~h*hD>6tRUe~GEvQ;R4ioI}oLqD8xj=%tT-#xT?@cFgfqqR30vAr9y zL-p99wb$p+iC!3OGVV4dx4bSG=C`vJmAd?*NrS1$-zaKAkWh4zL3N=Q&GkHG$y z@GxxSzOYpR{uvw#n7$npgy>81l^TSb5{XInXh#xa!T$CfSb!kY0^)iHZOgY)`=7j0 z{x7dCL-r@Ddo^sdx-^`Nnj<0Q8lUI6isYIhwdz2dlw&*CmZc*CgX%H_wi}X`gvJmt z4aucdT1`x^vv+>!+yBkT zTHm4C>kXFNV1sowxXv_>Pt{p!oeliM+UV`n#@Lzq*qM#7$@(v9(7c58fdx{2}-#CS4T7u`KmNRU7-^uEc3sPQk#)pUJ=H@cqQ>&#bJ#8u*s!i*~3Rio;^BR4ZT_}@l5(L@f0}g?s`>Njsx%Ud1IDmIKI@5*LmH2 z5NqQDc!4vrt+)8CB?DLDw!pe@`?mG)XDt!-2R9BcdImmlrv&DOpV1^{D|ifa{DbHK zZ9kjLwjAkKJ!VDh!C?rEg*+g+otC2mkl-i5Foy{Zr4L d8wb~VwA!gH?d6VjmTFtwVX0=EumSJ>e*;c`Vaosj literal 0 HcmV?d00001 diff --git a/seek/mittr_com/__pycache__/mit_t_r.cpython-312.pyc b/seek/mittr_com/__pycache__/mit_t_r.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..35a5f9b65866d6118e97ea771b3b9102ac29a3ef GIT binary patch literal 3858 zcmb_fZ)_9E6`!?topo#{3;Bb|-%SDsF2NK^xHeFZ-jRd|npO&@pz`c zD~aV)p^8d4QC*N)MTnl@L#uK^YEf0G==!M=l`6F#>`Rb++t4p3%H21sp%vo3^v&37 zn<)J!btLb+nfK<+dvD(S-uRD!Ks|zX?;SF=&x_Efbl^0hOlp=DuJ=^b9BQwvXEzTSIELfa zSi%}kq~h8TAqfKWJI{~CCle&5T6!Wro=A}>hFRhK7@jqO@ED1yDNF^$ag1l)R87kE zCrKiTO;fS-7>+tt(!LxtwD~0%T!SORD8gwd;?g+mo^&mvNltSu`6ImMUP2LJ8O?DK z&o~P4cHo=3Ws$e8=yCO}M9ki)SItsPW>?fzvlHEj{x)gIf6T2StHQkQ%Ag-YrewHk z#OBe5+*rtE3uY>bi7hIM9@i~J3At_F#Kt7?!E)QHC{aTNzZ8?|mgU8l`z{PlL=zgG zNMZ|<2@`@a5u=tW(L{_=ilvZna-KB8f^_REPK2FZ0>CqOz~{vx+7JWR>vCc+yZgXn zv40aLQM8;Olty*%V~+g+++9UeXo1V1X~uf>GhG{wGHcqoAT^>|O&I=#Cc5XArn#Kp zO~WPJ+NwGiTp8DC^qpTJwBYt4i-wUkC(Rf5jC*=J{qBl>zq6*=XLzNu=0C2Ns_PsI z)>o{C^_6So@DXzC+txPz-`TbxSi2lO8Nrc+9V>o7l-nGMYCqjjr5y@ZRc15Xbhk5K z?K2fyICSIJJS6ov>O+?7NbEtRZ^2{rIpb|1NXU3@BhB+cCFUG@7l3lX^F7avzm)n} z&U($!@n7kzqdGsRWJy)y3U@EA>!5DoSUSLH3`0XVt+ZgN zQ-f++2va;sw{8bjgK~PGd{7=%bpva%m5?X(xF+KgsL6Uh5^mWZHJQY5jnHdNJXBs_X4&4M zxhPIj_Z) 
z&9@xSwH$wVZmnf>x&G6Z?&YDP%NIDsZ3H`S*R2SJuD&}nw`T6`TkATK?|MDg_4dhdUliSx`_kMCar#~G%|oYsu17xaP>=AadH3lf!lNU8N}mi4bqK%j5P%NV z(RvbBV^~q(%#>I{OF>#GwqH@cn^KJuhwz|}RS80Cqa7&MnkY65s6!l~^}(c-;1oMZ zLSI4bLvh8V5cXmT6&!_y?RlO{*5oOA?RDH4CBrqIM=HDv+_D3z6Z zlR*xCL6zaAsmzw<;3VKYw_SB{_#Dc(#u1^FF61uR&fN6CJ=M)CFvqThz-p)Mtt582 z(8?vB=@J$mibW>8s~|}?{JXPl{cHY%i$eveWwqgE!wln}u&Z2r)o32pK3_{x#M<*lzk2Qc>Esa}(PD8e( zIT|3S+%$byY-v?~^(g-*uU!8xujb*j$I=}PeHBYJXJpe-i6!GX98FnRRxLT*R>K7t ztmTEo5+3sGwbu@A+9JhIhHmb7VwPo+?bi|teR;7R$YBr*xmZ4tBS4$NPhHUZuH{X=%wr# zH7i|S=W%V`@!mY&n&Vs7`CujQw$(TMXf8i;Avba%KQfUUnOGaSm>pKK(xu0|$}*H9 zMNHM>ijwX-e?>Rt5?IJU#S@kcFL7DZ36w55sv1VmaJv19WhKqQLx*N(XT$9BMClVb z46n~aDf(V`_Vb&6eQ#0u{G;DK`^nX3D?fPpo1c}YKKtd;XTSI;Eu2ooaX7S#90Osy z=`_(z6AI3^)hQfiRmfy@$?k!;tC!(@4@J28^e$DfFj$EVhG2GcRgPZbGK-bUM);!tD5{dO|CfEu}2oFj>7<7|H5r>9nE9nifL=c7)^rEFc5S`Wnm1 ze+m@atD6cWoux-fCzYgG^dTz2BCUuo#S^6OToChdw10(Y6q_8#3tlqgDw_b;bE zcJD6mjf=x9$)tgG{@5n~TW#SizpJn(1mAVOr_gnr5$y%}8;t1ugb!pJde-^gqKE{k Rw0foPF&`|pAij-x@lV@UT$TU; literal 0 HcmV?d00001 diff --git a/seek/mittr_com/content.py b/seek/mittr_com/content.py new file mode 100644 index 0000000..5b3a4bd --- /dev/null +++ b/seek/mittr_com/content.py @@ -0,0 +1,45 @@ +import datetime + +from DrissionPage.errors import ElementNotFoundError + +from database.tinformationsource.model import TInformationSource +from database.tnews.model import TNews +from log.log_manager import logger +from seek.content_base import ContentBase + + +class ArticleContent(ContentBase): + def __init__(self, news: TNews): + super().__init__(news) + + def get_content(self): + try: + content_ = self.tab.s_ele('.content').text + except ElementNotFoundError: + content_ = 'not found element' + return content_ + + +def get_content(information_source: TInformationSource) -> list: + article_content = ArticleContent(information_source) + result = article_content.get_content() + article_content.finish() + return result + + +def content_task(news: TNews): + logger.info(f'{news.title} 
news_task start execute at {datetime.datetime.now()}', ) + ofweek_com_ai = ArticleContent(news) + ofweek_com_ai.do_seek_task() + ofweek_com_ai.finish() + logger.info(f'{news.title} news_task end execute at {datetime.datetime.now()}') + + +if __name__ == '__main__': + logger.info('This module is not for direct call!') + news_ = TNews() + news_.is_static = False + news_.url = 'https://www.mittrchina.com/news/detail/14218' + content = get_content(news_) + logger.info(content) + logger.info('Done.') diff --git a/seek/mittr_com/mit_t_r.py b/seek/mittr_com/mit_t_r.py new file mode 100644 index 0000000..50ce8fc --- /dev/null +++ b/seek/mittr_com/mit_t_r.py @@ -0,0 +1,63 @@ +import datetime + +from DrissionPage.errors import ElementNotFoundError + +from database.tinformationsource.model import TInformationSource +from database.tnews.model import TNews +from log.log_manager import logger +from seek.seek_base import SeekBase +from utils.time_utils import process_time + + +class MittrChinaCom(SeekBase): + def __init__(self, information_source: TInformationSource): + super().__init__(information_source) + + def get_news(self): + news_result = [] + self.tab.wait.ele_displayed('.last-item') + _news_list = self.tab.s_ele('.lastest-list').s_eles('.last-item') + + for _news in _news_list: + try: + tnews = TNews() + tnews.title = _news.s_ele('tag:a').text + tnews.url = _news.s_ele('tag:a').link + _time = _news.parent().s_ele('.time').text + tnews.occurrence_date = process_time(_time) + tnews.source = self.information_source.title + news_result.append(tnews) + except ElementNotFoundError as e: + logger.error(f"ElementNotFoundError: {e} - Failed to find element in news item.") + except Exception as e: + logger.error(f'Unexpected error occurred: {e}') + + return news_result + + +def get_news(information_source: TInformationSource) -> list: + mittr = MittrChinaCom(information_source) + news_list = mittr.get_news() + mittr.finish() + return news_list + + +def 
news_task(information_source: TInformationSource): + logger.info(f'{information_source.title} news_task start execute at {datetime.datetime.now()}', ) + mittr = MittrChinaCom(information_source) + mittr.do_seek_task() + mittr.finish() + logger.info(f'{information_source.title} news_task end execute at {datetime.datetime.now()}') + + +if __name__ == '__main__': + logger.info('This module is not for direct call!') + information_source_ = TInformationSource() + information_source_.is_static = False + information_source_.url = 'https://www.mittrchina.com/' + information_source_.title = '科技_麻省理工科技评论' + news_task(information_source_) + # news_list_ = get_news(information_source_) + # for news in news_list_: + # print(news) + logger.info('Done.') diff --git a/seek/ofweek_com/__init__.py b/seek/ofweek_com/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/seek/ofweek_com/__pycache__/__init__.cpython-312.pyc b/seek/ofweek_com/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5703bb4ec5ec6cad662c00288a47161b27b3340e GIT binary patch literal 138 zcmX@j%ge<81T6P5(n0iN5P=Rpvj9b=GgLBYGWxA#C}INgK7-W!(u}Z*NzPA6jVVYi zNiB*gPEE~@$xkZ>Qt`?8xiRtanR%Hd@$q^EmA^P_a`RJ4b5iY!Sb;hjfw&mN_{hx2 K$XLV-WB~xm;UBvI literal 0 HcmV?d00001 diff --git a/seek/ofweek_com/__pycache__/ai.cpython-312.pyc b/seek/ofweek_com/__pycache__/ai.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..567d03c0e9426aa8e5deb7220744a204b638f47c GIT binary patch literal 3783 zcmb_eZ)_9E6`!?t9Xqy@C9#8(5VA=)h#@h@e;UC7cOkjx)U;AKmGh-6i+2(>tp9X& ztpuM_rE03nNmL3_D}>OAerQ!rd_iifLsTbKI=;0Z>~oNPTli32DZOu2Lo396>6_WC zZK6nRrH-^aZ|1#u^X9!bzc+sj2AdJIJKrO-Lq3GQq64Q18^n4)htLhAB9+rooc1oA zOSHcIO9^jA*jU&~&j8q?P zzU#L4iU-wZ%&UIfGQ+EZWj@|Y$1^T92(*Z|sV!Fq5h4>VguQdS97F59+IkmkrH$&x z{9Sv1;_a%4+vU*45UJZ>6t0chVASD^U>(T5qG#*98f30=SGj2DYpP;0%2}baI!@x0 zIhi)c(%F=HmXI`o`L=TtshKoMDyEiBO{KFWfnioSH;LyBAUt|{b{11X?-a&Y-c$_A 
z4rE9=fek}8wIohBR?>dMX=r^f3_gSqVHD?76nCi{cF(w0&T`qhy0wm2freL${&dI(7KZxf{r=F+XzU(N7^y@?70t zSJ5ZjWYlE|MmB?q<&|YErJ1rEbz8iF^%>%a<(5yD6S@MYk`1a`l9pZ$oFAD^q*Xkf z!6qit1{gST%{vdFSZSZ@sgdTf?A2k)dkwWLbN8X`z zm_&(<#Go{4f{z_(84h{_&7wsvkLDQb)!!VI+SJtAVdtDH(jcL_99Jy5@~%AJ+%!>l_N!*DQhc)M1;NW8api;~m>n zp%d0^W#OV=wmT~Gf+G+6GN_>(@OACFIg+5O#;?!8s@fUy++25^<52mUMI5^JDG#YQ zg$9th!{P5jWMI)__B!J^$)W1K>dAX{&g^$q{8u`? zROiRFY^ZBo<*xZ1o%3kH{T`aw=_fmjwMv-h^+W4;m#Y=*Qd+jJ;m^z!r&XF@TqcmtQa2q!(u7??n?7l;ogBgo_iw&`XM3sVK)L5= zvF9kvHdi7&w*xl=w_3`P;bLU?&Ri+-t^0k&$cb{~Ofhn1Rd^gdau1iHCssXc;_iyH zt1KNWO2^95$)a=;6a*_`cUkN&iv7Ph`bgYc-G(}(>JB7!7rb3Bp8HW~@E?c^Ug92$ z9XA5k1BKor_mq26h2wIe=VDn@ilXw(~qOQ-QzR^>{=Lb7SMd6_s0Q%vM$modha6|w)+C&RLN=af_hIq-zw3^i^ z9gyW8WEI`!5Sqpe+DVj#CAd)>5?I%vaBzg8ltBUfG>aRdNWp#-91L2>Uo4}lixY+m z`%eB7LH{pLo6t=t4%xzX!Vzp{N$QnSK$DJ@f@vV2w$;i+9y!nj#ev%js2!jLpy##% zONY;)ylV;(TA-qC+Z=XG4_ryjxD0d5x?sK7$%MckIOQAzc->!%~WpQ^=++7j}mnJIwwlW_n@{u*ZtKtt8_z?5SD;}UXj(H#sXK(bt zT;nSMglGfkror0O06aO9>S1lJMmaM8?!G$hP>z29CL4g)q|RV%Qv|60HZ?~Bc$k}` zPlHXZ&acPye>#=x|8=UX5c^- zz`4tsA=%&|0hLOd5ryJW2_Dt ztTI`B;Be&kP-NP+g8*;SClmmaXu@kHH7$N`ZP}2epNUEXLwS z)e~B#EHP_px&d1ObTX1vQaj90W(|{J0E->j3k%3$W_^QY8*h2p@@a+)*=A}9LZ3y1 zrq3tH21i;3UwTj2u6=`y!jK}M@k{st*mK;UQR`E5_$fO51nqu;`k$bICn);d)4_GG z4n9ZFRS)u9xX^c~ilD#GuhXvDB5;?uLjU1)M2V`1_`pi;3wLLQZ(SN=*(DxX=Sx0VXz5$y`>S3gi1zB$rbm3F8b*8@ literal 0 HcmV?d00001 diff --git a/seek/ofweek_com/__pycache__/content.cpython-312.pyc b/seek/ofweek_com/__pycache__/content.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fe3d097c792227f1cc9c1542b4eddd042c8f0fe1 GIT binary patch literal 2763 zcmb_d&2Jk;6rb50J5FN9b`l!bRAF0Eh}*_#LMZv*tEC_GgY+XoP!q8--f`-5ZI{`# z(ps)$q)4PfNa285sRD7WD()clKj31MRCX77LOpP^NI_LkytnJE-B1oljI=Xv=FPmF z_ulXQ=BGfQi$LrAIJ@}0hmhZJ;x?GuZF3(bNzOPOiJfX%NEPQPg7ExakP!! 
z<`8sW)9b{1qP!-R$d`bO5^eLD+@iBF$?_P5OI%n!Rn6owrmDtdiy2%`3&fb!sj4YG zX&9<%plwAdKOX*YdLfn9_`)S_aO~sA~r{s$z}OGS*9^IYms6op2SQf^^vBh#c(cf zDrp3Y!X4m1zB&wU7yuK9uL`RcGx=px;O%0P2*P0bn&(vmv#2N{jH}&?+=MvVF)T*6 z#I8+;CXD0Il}Wwtxm907k~{a-2cExP`l?hLh*bw-l|8So54=$uI8+@twEAwtOG1&= z;G>5g5*qr0P~5!d+Pn4Oz-qAJf$3udGqSRKFS*@4E&oWT2Q*QEIW{nzZ`&S^vmoNq z&s7QZE`@>>x4C*sO41w=7^#@-cw#&4O+)b*7hr7=R94W22*j$HNW895(9LEIIjOe8 zWp@!cW9)+tDq-7DW%q0A+a}AW>wz7$z;HD%ydKz7K3ZpeH5RS1=my(S?+%yWa{`1# z`cx1Q#dZ)9;C_1$mfE=jHerJrdPYUugbHg_!_t$waL!=Agz4MT+z{O)-^gybDKRn0 zcH5GKfV!se;Ls zq^T66ZCZ}h(9*?x0)uKX1h(ZdGbRv(PfMEI%w)MG=km)A^8H#~MSivaA$GzUAP<|D z11-i?lbNR&#P9M7UQFzTLodK*l%NBAv98Zges*$g;`;Pg(`)-GgVT2=Ycr>+GpA}Z zXR0%2)@RODrspc5`3?5Lul>8en^^B3F27f2;TjvPvcV0e*m9)GA{#9F4{MWmBDJX# z)u|J;soCn(?E2K1%H-Kf=-gd4XX6zKC7aCTRJHix{Dq96IGmz@Qvrh2 zcg`2zx?q}@jOo$Q<>lqL<8vHdnbBlsbbSB##K_qA=)rUIBjfwWUK<}9nH(G6Kk~}{ z1FueuO}rAnU}p8A=Wss9T0WCpe!^9>IjUEr3r)z~xf@32@ zn18kSv%I~|g5{$Ko=WJ|4R+w3Km5h9@{#(sX!*#>>3Zla49D;FY`?L4y+ list: + ofweek_com_ai = OfweekComAi(information_source) + news_list = ofweek_com_ai.get_news() + ofweek_com_ai.finish() + return news_list + + +def news_task(information_source: TInformationSource): + logger.info(f'{information_source.title} news_task start execute at {datetime.datetime.now()}', ) + ofweek_com_ai = OfweekComAi(information_source) + ofweek_com_ai.do_seek_task() + ofweek_com_ai.finish() + logger.info(f'{information_source.title} news_task end execute at {datetime.datetime.now()}') + + +if __name__ == '__main__': + logger.info('This module is not for direct call!') + information_source_ = TInformationSource() + information_source_.is_static = True + information_source_.url = 'https://www.ofweek.com/ai/' + information_source_.title = '人工智能_维科网' + news_task(information_source_) + # news_list_ = get_news(information_source_) + # for news in news_list_: + # print(news) + logger.info('Done.') diff --git a/seek/ofweek_com/content.py b/seek/ofweek_com/content.py new file mode 100644 index 
0000000..803cd22 --- /dev/null +++ b/seek/ofweek_com/content.py @@ -0,0 +1,46 @@ +import datetime + +from DrissionPage.errors import ElementNotFoundError + +from database.tinformationsource.model import TInformationSource +from database.tnews.model import TNews +from log.log_manager import logger +from seek.content_base import ContentBase + + +class ArticleContent(ContentBase): + def __init__(self, news: TNews): + super().__init__(news) + + def get_content(self): + content_ = '' + try: + content_ = self.session.s_ele('.artical-content').text + except ElementNotFoundError: + content_ = 'not found element' + return content_ + + +def get_content(information_source: TInformationSource) -> list: + article_content = ArticleContent(information_source) + result = article_content.get_content() + article_content.finish() + return result + + +def content_task(news: TNews): + logger.info(f'{news.title} news_task start execute at {datetime.datetime.now()}', ) + ofweek_com_ai = ArticleContent(news) + ofweek_com_ai.do_seek_task() + ofweek_com_ai.finish() + logger.info(f'{news.title} news_task end execute at {datetime.datetime.now()}') + + +if __name__ == '__main__': + logger.info('This module is not for direct call!') + news_ = TNews() + news_.is_static = True + news_.url = 'https://www.ofweek.com/ai/2024-12/ART-201721-8120-30654143.html' + content = get_content(news_) + logger.info(content) + logger.info('Done.') diff --git a/seek/seek_base.py b/seek/seek_base.py new file mode 100644 index 0000000..65a19ed --- /dev/null +++ b/seek/seek_base.py @@ -0,0 +1,57 @@ +from abc import ABC, abstractmethod + +from DrissionPage import Chromium, SessionPage, ChromiumOptions + +from database.database import get_session +from database.tinformationsource.model import TInformationSource +from database.tnews.crud import create_news_list_if_url_not_exists +from log.log_manager import log + + +class SeekBase(ABC): + def __init__(self, information_source: TInformationSource): + 
self.information_source = information_source + self.session = None # 初始化为 None + self.browser = None # 初始化为 None + self.tab = None + if information_source.is_static: + self.session = SessionPage() + self.session.get(information_source.url) + else: + co = ChromiumOptions() + self.browser = Chromium() + self.tab = self.browser.new_tab() + self.tab.get(information_source.url) + + @abstractmethod + def get_news(self): + """Abstract method to fetch news from a specific source.""" + pass + + def do_seek_task(self): + """Saves the list of news to the database if the URL does not already exist.""" + news_list = self.get_news() + for news in news_list: + if news.primary_category is None: + news.primary_category = self.information_source.primary_category + if news.secondary_category is None: + news.secondary_category = self.information_source.secondary_category + if news.tertiary_category is None: + news.tertiary_category = self.information_source.tertiary_category + if news.label is None: + news.label = self.information_source.label + if news.lang is None: + news.lang = self.information_source.lang + with get_session() as db: + inserted_news = create_news_list_if_url_not_exists(db, news_list) + log(f'Inserted {len(inserted_news)} {self.information_source.title} news items into the database.') + return inserted_news + + def finish(self): + """Closes the browser and session.""" + if self.tab: + self.tab.close() + # if self.browser: + # self.browser.quit() + if self.session: + self.session.close() \ No newline at end of file diff --git a/seek/the_paper_com/__init__.py b/seek/the_paper_com/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/seek/the_paper_com/__pycache__/__init__.cpython-312.pyc b/seek/the_paper_com/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..468b283454100d6d81fb8871b1b822061a0be6b5 GIT binary patch literal 141 zcmX@j%ge<81m`ZMr-SInAOanHW&w&!XQ*V*Wb|9fP{ah}eFmxdr4wNllboND8dH#3 
zl3El~oSK>)Q<9MyUyxXkS`?q0pBob&pP83g5+AQuQ2C3)CO1E&G$+-rh!v=r5r~UH OjE~HWjEqIhKo$U{;32U9 literal 0 HcmV?d00001 diff --git a/seek/the_paper_com/__pycache__/base.cpython-312.pyc b/seek/the_paper_com/__pycache__/base.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0aafe7357232a67acb76672aaa1e930f437d9e0b GIT binary patch literal 2366 zcmah~U2GFa5Z*ic&US2r3323*A0LF0I5ZZN-=_Qoii0ZJ^d~_rCe`V<+XOrJ_bSaQ%dQ61`W4EMK ztYO_6Hmo5duPZ4+351shW-(ZCr?Ty zRflrKkg;isRz|~e)m?S2`9|w~Ab$ZFVw7eSlx7tMd&b#0G|njOY$)wjJhLdx&7o-~ z&5xjj*Y;9uyE)T{Ml_vpWcCFI6rX4+x97KF>sdOsN$lMMxr<*oLLe}r@YS5xrtb2Xs{!n9|%fbG7EdR6LTlZi0-ON8{`cxC%C@+CgJzv8+>j{;yXeU}qsU~b%a=y69xsEg-t$5!j-YO2 zZF9|bBC=n)p+vRc%s`!fDjl6#gm%GTr9Jo=>gF)w@ZOrhW^H=|@(kaFgTrBX` z{Yr056!S*CQW}W!%+)p*a|%s+&Y>wL5gaaTO=ffjPm8id6u2=bl(coC(43TXt6w%$ zp+@@;7C1|q7?40^YLcpo7M_NSr+l<$@5O?js?40EKc-iq5Z$OZ2?cgQDD(*Z!jP0v zu_9Q8Fb=K-Ty11RMi+EwQbNYUTC&i7R>#vhEL&iSX>St@Ss5K#rB!duAsv~ zClCR^<{e@ZT2{7~#%6O?#sZC3&FGW1Cr?yDYu?^m!_j3dDiSmUr)BWf-emJqE{Ao6 z(9219YR@^|?LcZ;#yQ$l6F#z;DuOgxkzg~gT6UA@NNUCe#|{tv=@Lk^soD%S>E#fr zgU}+j{S(-7Y-|0-6=@4HWtP<@hM|(=WN0*|LVuR(_;NrHW*t@@cSdP74iH(+Hqx0NK z@BX`ImwMlt=YEg8wjA%S#1EF^2P^R-<@gZ*2`xw3E0G=L$c|eF9!3&tTTrX8)`24J zCI7ake+N-Z?_Y=s=a`j9>q6*Ss1ygZAJ2b({_cB=&5I`%_mzgUQm0;t80CoZd}V6~ zwH|C(58T^(ue;QFtP(l?TjcojH4oK3GwGx8)5*l!eeC_dKq}1L-?!^j7x$nm1pI@( zSjx}+?B{?__~=uiOBxnMI4V&yl)Os$kSKnZm(;3;cmdAo|D=Z=Pq_As$Qa69OfV!u z&xKa_pmTnN_7rELj2VUL=`31f8Lq#y`{)w{^dG*NYbVXF-EbmoZ#_jaPU(IiP2i+c zT&7cQ_I4;o$$5sX7hcV3VJ0DDtU3|eEe$(bVKtRg`J3T+Pr@f^k`C~sTEd&?bC;x( zNZ|=BL_3nVGOC%R#VI=(;%~l{X`eg%1OahC{sn#-D=7c~ literal 0 HcmV?d00001 diff --git a/seek/the_paper_com/__pycache__/content.cpython-312.pyc b/seek/the_paper_com/__pycache__/content.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4ce8f6f935928a5e46744cfa3666c3d204d2b565 GIT binary patch literal 3007 zcmb_eO>7&-6`onn@<-yYBv*`R$QxUB!pNqAR7ow#aIM;k8z*JeNJSDt$YRSKN|cwo 
z>g*Df2&!NK0Z~vCkPm@f1pPTS@VP(_x%k`*S;-)?wNV2_4!sePTNn;d^vy0;lv%ex ziw?+{H}huR%)Ix#Z}tz7$N+-!=SSM=`+kJ}!aH6OyTajljL>bQA{A>W$6GOuMu;s7hz8ds?{K|VjopRw@xtv>|6R9(atSp~6Y@~E zi_`@4hP@uljL?L)GB@(?tkV&1j${4#Cm8C@*}_{mJ^WX`O*W0~kp+#GsBUEq>rJDq zs|$=74916+-_i>PD=Ajd(3gxd%TpK?m$S5Dg5cMT)m6$T4qi2M3k-ZmF=^gAJa4<+ zLx-6C6p$1Y)bg1A5K8^3BHZ2Bjv|J4gDOTQNzbBjXuNKv|x*78O+*2SuW~DOP13g+Xh^p?XqB4}y`81I=La&dB}X_#up;d>0~IMjY7A?Et|X*mxU0%bOu?FPHve%?8dh6U+v)g&pYVJKKtQqK_%Px14zhi z;U|I~blVE9piSQgsB)y?XY1Fc#jK60!f##U6fmi20*WE}GK|{9qMH`ueoOmU3>rJw4Wevg+6}$L5?nL5N-YSp zvtc#WAKGI|ZpYk$s*c*jCw7A!ArP8skEU+dZq-_&>E>v<@yfUNN3XO-7n-9ByWj0Z zQ9QXDd-6Df;uBvXF*MZ?NM!0!>eR0<{o+z9b)lKM@LOX)b-9(g+Du)&6@2nw_@%E9 zj!d;vnNNaWcYGlJ<56sMH}-WW0?VJ87h(6Evyq^f_A_dgnf^?M@aQ=ivH}8R|55hTkcUi;^XFxUN59(w=)ETdta!!KV^kBC z5aVeJIB{;`T-sj%?l9M2>=dZ%#Bn-saAF6#fO!T=j>nwpBrG#8Km|OIe5vvB?0#~t zK~A)zr&`g;W^{5tdZB*3O-{7P$tF2@K*rmHL-nldB+gUMID#j!m6J+IDeAIZJ+r)4G$of;B#>ODW|CTDG;c|HMblnfu6}dPvNp`w zsi{h(l7Y-}-q}oEpW?2+MlGeN$?#cHQJ5;fHuJ6N*QcjntBThQoo3P}7<`5yd-xhF zI()yVtkMkS-!rDe_V$b5a^*+xoszMR(TvM788{i$7*7v-q&JUm+f{L@uw(GulYu|n zN(b}I4Qzr3E#t+3@4x~u+YKtwil*geKMxK|@#-8mCOUYsLq$^tJXyuO?JF~lO@Qt* zRq5uE@ literal 0 HcmV?d00001 diff --git a/seek/the_paper_com/__pycache__/international.cpython-312.pyc b/seek/the_paper_com/__pycache__/international.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..11d2bcbbe0c9bba1d032043a4df59e94a274a0d4 GIT binary patch literal 2306 zcmb_d&2JM&6rWj-9VfA|osW`)fYv}OtON&WsVHhIRG}m;5**+Vtz3p>cam&auhZGJ zLaZPKk%}Odic5KJ9~Y`MZU25)0#d1i)j zpvVWR&9pEaaW<=iHunQ_9qP=Gf_0=wI%bh+X%$UlU0O*fWEEY;O7twsMZ9Feau~OK zC#vc77%iXF_MK`{n-AeaJid$1@onU632#VC=q5CJ3Ad$$?&8xq$&1>RdB(jsrG{lV zl!9YoGjp29;d8H(Qq5Ftn^IfYmgVIaho_DgwX)8N^9;f(+KgQ)I>=q9{GxtIveQcH|duZz-B zS+AIaPEh)BMKuEn7s2^%yks+CjhQB7gSi+PTg=o6F6M=Igqujv?uYI93Tj9=de9wt z?;ix`-(urpoC0OP5;4x43b(Z8bb>G_w_oxiFi`L<*%As1eFZP!Iia&{g>!vH0(p1` z+jK%XMoZFJ#Kry1Mf`BQ_zZKX#TINYHVr+qXF=eH7&k22Q7w&mokC1a!*-f&lY`uM 
zf*CHhBs}&AY*6&nu)BL~Ep=#lvYsB;NM~#5>{@!?a-mKV*Uwx%vrhW!-F+_U^DFXF z{xLA%=dVEK?fK8O4<;yywS?wKK(+680B;;&$6s?>RQLv=IWZyIR#=@o=&MLrX^foi zP;FcNMYh{kXaVYr=eBU9rPPwQy8558lK!u(7NPN{tHZM2Qb)Bf$$_v3*CG;{CmzfG5TtIV9^5=>V z0&Rgp-C_w|k5ky`JF+cWBpPX02mQBZ0HWutJu zRye;_xZqBzZu;UnVeP|55*uW&Mh4f(uGZ+i(%|?{mp3M+Y76f3-CqI7nPV=Lj0LQ#3&Y(L&U`FvbN+ z1g#?CX8ZS;i~4f`pL|%j?rn3}{7F!+)37PbjALkCq{21kp$%l31<&8X2sU)bhALG3lf0pezUs~L782DgywE_+$RH#enf`+ GZ2kp_i3*|s literal 0 HcmV?d00001 diff --git a/seek/the_paper_com/__pycache__/tech.cpython-312.pyc b/seek/the_paper_com/__pycache__/tech.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..906d3c63e49b2b1b2e28a0ae40f03afc4100af5b GIT binary patch literal 2207 zcmb^yOK2NMbY?Y@Wm&S6*e15~VU?t*g&n1#Nohh`LMe7cz&XT+un14qyCX}=YUR$Z za4Z~fh#^j)r9HGcgq};trN@>+PrbE=RqY^U+w@ZElADSH4LOv)+0|N#$)(T-`{uoQ zZ{ED$y!|neh#`Qf--?UrFhaizCIH|)*!c&Rdq_t*HjpA#$-os!k+7gcMyNs*;?cwi zS0YLTBMHqQJ-m+e2#Y)n`DT z#B2t;`{#0Ip~@?&U8qwDw z%t8fe0~N3?t;ZFzfo@93(J&~XXcs3)sVXL;pxErOkWR`47#eEht3OowO-F`|nL$xeQEn!=h*IrCqJ)hUA zI?FFHoAJEG*!8?!WOPXdJ5;My@;11eU0UIBI8jds=d*$Je%P}10eZfMevKyX_BEoT z&e-Im=+q8uqD(|MVyYEJsT-rTQq^mQpyQN&SW^vO!i6iIbvM!RPh*AwzGE&bZ;95p zz(v`4Uw}k7)(D{I>!>Z^@JVO#-Maq$c@A4*6Xhc9E)m1R4cjH#U~W`bZDyA$%ng~< zWxuWBdXmdWZ?r} zO%;9*exVMCA_ynC^D`@qaWN(QQW5d8y?4w-{<#>R{E)bMsOQ4YFM+!KC5uAO*d@&k z)wsbAgXRO|4>-~KFQV`G33`uD0xPDNbstbWgfadFCI3Jdo}i;o(8RNdgimgqd4^zV z#|R#A(r>mAtly9wv9t%ncmX>{&+H&Uv{Q)0H|jr!##$t~HtjV=>g*PI>+$&1=6Kd2 x2U^D`0B@0W>&Uc69Bj#FJ>u|Vl5hsoTjXdvioz-X^rwA~$Y^^6k#W!Ye*u)a>HGiy literal 0 HcmV?d00001 diff --git a/seek/the_paper_com/base.py b/seek/the_paper_com/base.py new file mode 100644 index 0000000..61fee97 --- /dev/null +++ b/seek/the_paper_com/base.py @@ -0,0 +1,32 @@ +from DrissionPage.errors import ElementNotFoundError + +from database.tinformationsource.model import TInformationSource +from database.tnews.model import TNews +from log.log_manager import logger +from seek.seek_base import SeekBase +from utils.time_utils 
import process_time + + +class Base(SeekBase): + def __init__(self, information_source: TInformationSource): + super().__init__(information_source) + + def get_news(self): + news_result = [] + _news_list = self.session.s_ele('.index_cards__AdZtA').s_eles('.ant-col ant-col-6') + + for _news in _news_list: + tnews = TNews() + try: + tnews.title = _news.s_ele('tag:a').text + tnews.url = _news.s_ele('tag:a').link + _time = _news.s_ele('.small_text__dR01h').s_eles('tag:span')[1].text + tnews.occurrence_date = process_time(_time) + tnews.source = self.information_source.title + news_result.append(tnews) + except ElementNotFoundError as e: + logger.error(f"ElementNotFoundError {tnews.title}: {e} - Failed to find element in news item.") + except Exception as e: + logger.error(f'Unexpected error occurred: {e}') + + return news_result diff --git a/seek/the_paper_com/content.py b/seek/the_paper_com/content.py new file mode 100644 index 0000000..92f17ed --- /dev/null +++ b/seek/the_paper_com/content.py @@ -0,0 +1,50 @@ +import datetime + +from DrissionPage.errors import ElementNotFoundError + +from database.tinformationsource.model import TInformationSource +from database.tnews.model import TNews +from log.log_manager import logger +from seek.content_base import ContentBase + + +class ThePaperContent(ContentBase): + def __init__(self, news: TNews): + super().__init__(news) + + def get_content(self): + content_ = '' + try: + content_ = self.session.s_ele('.^index_cententWrap').text + except ElementNotFoundError as e: + try: + # 视频 + content_ = self.session.s_ele('.^header_desc').text + except ElementNotFoundError as e: + content_ = 'not found element' + return content_ + + +def get_content(information_source: TInformationSource) -> list: + the_paper_content = ThePaperContent(information_source) + content = the_paper_content.get_content() + the_paper_content.finish() + return content + + +def content_task(news: TNews): + logger.info(f'{news.title} news_task start execute at 
{datetime.datetime.now()}', ) + ofweek_com_ai = ThePaperContent(news) + ofweek_com_ai.do_seek_task() + ofweek_com_ai.finish() + logger.info(f'{news.title} news_task end execute at {datetime.datetime.now()}') + + +if __name__ == '__main__': + logger.info('This module is not for direct call!') + news_ = TNews() + news_.is_static = True + news_.url = 'https://www.thepaper.cn/newsDetail_forward_29745442' + content = get_content(news_) + logger.info(content) + logger.info('Done.') diff --git a/seek/the_paper_com/international.py b/seek/the_paper_com/international.py new file mode 100644 index 0000000..64d7127 --- /dev/null +++ b/seek/the_paper_com/international.py @@ -0,0 +1,38 @@ +import datetime + +from database.tinformationsource.model import TInformationSource +from log.log_manager import logger +from seek.the_paper_com.base import Base + + +class International(Base): + def __init__(self, information_source: TInformationSource): + super().__init__(information_source) + + +def get_news(information_source: TInformationSource) -> list: + instance = International(information_source) + news_list = instance.get_news() + instance.finish() + return news_list + + +def news_task(information_source: TInformationSource): + logger.info(f'{information_source.title} news_task start execute at {datetime.datetime.now()}', ) + instance = International(information_source) + instance.do_seek_task() + instance.finish() + logger.info(f'{information_source.title} news_task end execute at {datetime.datetime.now()}') + + +if __name__ == '__main__': + logger.info('This module is not for direct call!') + information_source_ = TInformationSource() + information_source_.is_static = True + information_source_.url = 'https://www.thepaper.cn/channel_122908' + information_source_.title = '国际_澎湃新闻' + # news_task(information_source_) + news_list_ = get_news(information_source_) + for news in news_list_: + print(news) + logger.info('Done.') diff --git a/seek/the_paper_com/tech.py 
b/seek/the_paper_com/tech.py new file mode 100644 index 0000000..4ab219d --- /dev/null +++ b/seek/the_paper_com/tech.py @@ -0,0 +1,38 @@ +import datetime + +from database.tinformationsource.model import TInformationSource +from log.log_manager import logger +from seek.the_paper_com.base import Base + + +class Tech(Base): + def __init__(self, information_source: TInformationSource): + super().__init__(information_source) + + +def get_news(information_source: TInformationSource) -> list: + instance = Tech(information_source) + news_list = instance.get_news() + instance.finish() + return news_list + + +def news_task(information_source: TInformationSource): + logger.info(f'{information_source.title} news_task start execute at {datetime.datetime.now()}', ) + instance = Tech(information_source) + instance.do_seek_task() + instance.finish() + logger.info(f'{information_source.title} news_task end execute at {datetime.datetime.now()}') + + +if __name__ == '__main__': + logger.info('This module is not for direct call!') + information_source_ = TInformationSource() + information_source_.is_static = True + information_source_.url = 'https://www.thepaper.cn/channel_119908' + information_source_.title = '科技_澎湃新闻' + news_task(information_source_) + # news_list_ = get_news(information_source_) + # for news in news_list_: + # print(news) + logger.info('Done.') diff --git a/seek/xinhuanet_com/__init__.py b/seek/xinhuanet_com/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/seek/xinhuanet_com/__pycache__/__init__.cpython-312.pyc b/seek/xinhuanet_com/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7cf1d200262060d581ce2dee2bfd8593243703f4 GIT binary patch literal 141 zcmX@j%ge<81SSVE(n0iN5P=Rpvj9b=GgLBYGWxA#C}INgK7-W!(uuH&NzPA6jVVYi zNiB*gPEE~@smRRBC{4^uEs0Og&y9(X&&=(bATo*q%0plmL0bWNao~R-1@h$AzL1p+GS~Pi$V1@yIN~#s0 zK^M5$*`3+B-I?Fc?B4=`HUe$&QEvXb7DB$mf!m}8^K^(3a*t?4qdG}q@6l=2lk`yJ 
z1zpIBNzvv-UCMfsUYnQnmaH%7vw5%Hnr%zAQQ{%jh}N=BG#_hwC^+_$?OH1nwKnEY zi(31-m<(us=1qIF0Pr0wsCC>qMF^ks5awHy-OV+7E%*@jhYjsB??Y!ml3^{x!b+qu zBw83oQFjEI0Ty;wCOeO^&N^Kk+xp9Q5NYbH(p4Id{u6&Q6Q@?^6`f^S&Ya1cSM!CO zc7^jihxy3d^;|m7v#OcN=VtQ-o?m(^@B2e!JS)s|6SrV75j*HyPq;%G> zZ;3hhb;`5v7x)MKp2mQDM(+`GPk7f;B7XpUl<2;blP~E^++#^bVTp0erzn|R##EHJ zV2K9P(;SgzwJJ(VR}Di^474rFGVl}v7-pyrsVr1Dvi z+CE4ut?~|7RO~*u_k{Z^KM0KwkmqIctuJu5rRwXs-}%^g;wj9LR0Af+6A?c9a7ADM z+;ok$k`h@&*`c zoUur;V^CwJn$a7vD28)+Q%)mZWai)}E?O-HGYmirI9P#MOElSv$#HLS4-Z0T`5Jyw z3=E?z^Dxe~&)Z(KJy`5LJy%F4%VEDqoc&wj1LmswI34f(i z!hr~2da)yru+z)`bJe6Fm48nPwlUxs=Y&%vlq5| zFK*nZM^DtEv1&B7_2G8(Y%MxkjgD?yu19`eiwssHgIia(BWG%nk!oaQKU|3n?Dz)}&T_@q-^6pQ-alCN_tk^_FpY?gy)dwv z{O%IHB#}o_`=w6dQIrCW`!L<*)GSjJt4&d|d99!$?^l$M3##r&I3nIoVEj08gFvi6 zBOj)iu0yV*9MhM9nd*Y=R^E!LNZL_=3!krpqo~w>|AT)K=zj2erJjF>+9a9d?rdEv zl4+I~#jxv(<>kyQ@Z5n*h!@blge71g!|$cY5^-U9)D?5<>>6Gs_pJa{+~&#w)^u?X z?yi)OeR9t%;g~DpqI2A)d%S5VBE~{o;P3_@R#0_{7E}d0pW&H|f^IfyiIedh@1hO# zurUDy%0XAGa&mmTYoh#aJ&+Z3j-5uhzwAO?<5?zP2OAZN9fE_U?#%_4aW2 zSB^)qa2@a(Lf`gU(oS7XuPyGcpP+;dipon$1SA96EDwu&g~C~bbPfw2dH!FgS|xE|rr0OryF@WnM4et z4*@w)O8stu#qIDh5Tm+*cLxwAwtaHrw>LJ&?oE9&)upfT^Wo_c?p4#Nw)ycPOlQY%HneEA& zm5KK%q4yt)vo=aG17+1rPEm@3a|;YS&qhXuQ;7vLs~4rq`5a5ckMY;RD^~O}&)Bb)o9aAEFns+Q_M63e1wx_T zhF2}rsb;QnQdI)>&UB8~#_DY9*_(7km~X5gypj{}!MSr_o;v{#V=xnLr0@?MvyT&O zQO_8rlOQmBJceW#jY&4ntk#U70M5V%mQdij)zUEZC2?{AKB->#e&IOxm;53OF=rZo z2C^$qO8-HE-;=TL$%QB6)hA@&35h@Rc2RlrH_r%kyCV{fRAT3M3G`o$J;iP}B+)s# zS-`;@-3<}Zzft_VaJ()C%U2OnmC(5z@%%Ub@b9mcuhhGG%U9Olt%v5JyZ%kb@%tyY bJLK}UJ?+hw9kFNM=&{(d8zf?{z5o9J#T6KA literal 0 HcmV?d00001 diff --git a/seek/xinhuanet_com/__pycache__/information.cpython-312.pyc b/seek/xinhuanet_com/__pycache__/information.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9506b8b3e37a8a51ff29b72cc9febf29c92488d3 GIT binary patch literal 3484 zcmb_eU2Gf25#BrAk(5YEryr{N@y}7RW3!U!HgWtT)QX$flGPMS0l9!fs6L;0EBUD7 
zQQbR25=#XND!>Ym1UAstD1xBPL(v2VS_4G^zqBagzO^Whl?*Z$`>A!Cz7dgI7|u&) z_jDvB2nY&vf!m$knVp;2nfZ4A6pOVYXm9-9oclbC(BJ69Z{jww)x#ll8)-=643wrL zZ*Vz2&2y9v7=fIS78orUVlJ2tGFmi3xo|qn=%CS(YfZOuh({Ah3#}n7j9c#nJpJj2 z)`A7C6-Q?UEwU!0V{|^tYf+%vaJv?pKZ+2UMW#72QsI}bz`)H&3v2e$mpmc}U zjysgj?J3eEn03`>9Wd+mXRxmgckIx!_rE$DGRxdDm+bl%RWX(1-0lko&SBG@%G=}l zf~j2~Bu`+y^U|a_nezGc+mnPXyoVSFcl0>V?6(|YT=A;6&<$^ z32l89CO04q7^OK4rFo6RfmwbX&2k#Q7EKFUU=5|kb+p8#gVQJ}xRIv+x}AE%2*qMn zWca54(BIH+>kxE5=58aq&U}k6p?4q`N?eog=p$|_$-AOeSir;$D~fLFwxT2hu3%wf zme8znTNEW@sFtND7S%1wtIv*K9-YbLH9WI`ZA@k?aCv4)H?J2|6WdBApPOkiG_|ly z+F)a`yUCT5|F0-;tPbd&uA+zG*ju4;xYy|$eh?nnf<=_sP6|q+5PZzNR&e?@nnNpG z31t~;XP2c?yP8&aJnNT4lZ3|m$Fjng`19y{%LuImT98fu$(o;#D?%x-r~ZJiAqyqA zyOji`{?{+vPtRmmgZ*_&z-MU}8>U$X0Xx@WSBt+=;<9m{Zj2?tx0*w5|3HBBm_|d$ z?(roKAu_Zg+I{{!+wW72Q4>nydq@-Cub%*iUI$QF5#JEs{)IO#a`rxd2k0F9Qk_An z^Zj~Mnl%2!*SLt50cRpxH9|9&1nD-B z{l{&&u#~|I6n2t9(oPu>YL`N=RWNL~Rbev&9Ct-VyCGtE1cxoENoI}EB6GuYP@r_j z&T8~msebG4(5<3sPw#r98t=P3b8Ds&KT?h#agLteh@Yv%&z0ln{?NM-|JHizr}2a9 z7ixSYc8+`4fAGB{H$zqV@Gm1DL_VC@kWW#>{-;k` zQ1{W#5f{70J(PNGM{h-)gD>7y?@l|X6=(m|ilmk$^^1pH`=|_<turW6to|eP3JYdrAl7zJv!o6qM^iJwM}=iRWpO#mU?1t-y5>$ zHr>TeXb=c%7&<`y_I_vJ*hYJ5^~T57GqoEa(L{-LR(G*w%B{4Vd{wmQQX}%PaIP!TvyoK`hC$tRf>oTQr`s zmqgpiMJ*vOP&vx~A#{*RLhFrJ-+a~SI$Y@*E_V$(og>c3#Xt8}Ce!7~bY=2NdGgA} zPXi%1>Br^LbM|Em4&`dVW5$Cdnz3N<5Fefa(Uu%Wn!j0F|#po#Ti!|>FNVP zWhqJlBd6-7q7;WNUDqwy!weZHGjGfAl$14{zP!uw<-9*ArsUSFP3dIQYjWL zs-Dp2i5Et=;w+n0IExKAOgYDynN22azh4!%MYj~lCR@)C`nVu8Jw8Gn>d`yvS@;{? 
z;f;|om{Mr7K7bBToZ~)4?T^uk$LQQ6^xPvf_y`R>33hV>H;+C+Fw{~47k3gb)DVpK zge^MM+C=Ue=M0|MLX@aUNQkZ%{~GA43hk@oEOn$8H-(d*^p9-xrySuc)k8<2-xLzn o17{d9P?eu&#Qsl&nA4Wn6b5TyBud`yo1q6nZ!M05e&)h|0n^9}^Z)<= literal 0 HcmV?d00001 diff --git a/seek/xinhuanet_com/content.py b/seek/xinhuanet_com/content.py new file mode 100644 index 0000000..db7fe87 --- /dev/null +++ b/seek/xinhuanet_com/content.py @@ -0,0 +1,58 @@ +import datetime + +from DrissionPage.errors import ElementNotFoundError + +from database.tinformationsource.model import TInformationSource +from database.tnews.model import TNews +from log.log_manager import logger +from seek.content_base import ContentBase + + +class ArticleContent(ContentBase): + def __init__(self, news: TNews): + super().__init__(news) + + def get_content(self): + content_ = '' + try: + content_ = self.session.s_ele('#detailContent').text + except ElementNotFoundError: + content_ = 'not found element' + return content_ + + def get_occurrence_date(self): + try: + header_time = self.session.s_ele('.header-time left') + year = header_time.s_ele('.year').text # 2023 + day = header_time.s_ele('.day').text # 12/27 + time = header_time.s_ele('.time').text # 08:05:11 + occurrence_date_ = f'{year}/{day} {time}' + print(occurrence_date_) + except ElementNotFoundError: + occurrence_date_ = None + return occurrence_date_ + +def get_content(information_source: TInformationSource) -> list: + article_content = ArticleContent(information_source) + result = article_content.get_content() + article_content.get_occurrence_date() + article_content.finish() + return result + + +def content_task(news: TNews): + logger.info(f'{news.title} news_task start execute at {datetime.datetime.now()}', ) + article_content = ArticleContent(news) + article_content.do_seek_task() + article_content.finish() + logger.info(f'{news.title} news_task end execute at {datetime.datetime.now()}') + + +if __name__ == '__main__': + logger.info('This module is not for direct call!') + news_ = TNews() 
+ news_.is_static = True + news_.url = 'https://www.news.cn/politics/leaders/20241227/90e76f85ad4a43ba94802b07c5736e00/c.html' + content = get_content(news_) + logger.info(content) + logger.info('Done.') diff --git a/seek/xinhuanet_com/information.py b/seek/xinhuanet_com/information.py new file mode 100644 index 0000000..f7345b7 --- /dev/null +++ b/seek/xinhuanet_com/information.py @@ -0,0 +1,59 @@ +import datetime + +from DrissionPage.errors import ElementNotFoundError + +from database.tinformationsource.model import TInformationSource +from database.tnews.model import TNews +from log.log_manager import logger +from seek.seek_base import SeekBase + + +class Information(SeekBase): + def __init__(self, information_source: TInformationSource): + super().__init__(information_source) + + def get_news(self): + news_result = [] + _news_list = self.session.s_ele('#focusListNews').s_eles('tag:li') + for _news in _news_list: + try: + rs_news = TNews() + rs_news.title = _news.s_ele('tag:a').text + rs_news.url = _news.s_ele('tag:a').link + # rs_news.summary = tmp_.s_eles('tag:a')[1].text + # rs_news.occurrence_date = self.process_time(tmp_.s_ele('.info__time').text) + rs_news.source = self.information_source.title + news_result.append(rs_news) + except ElementNotFoundError as e: + logger.error(f"ElementNotFoundError: {e} - Failed to find element in news item.") + except Exception as e: + logger.error(f'Unexpected error occurred: {e}') + return news_result + + +def get_news(information_source: TInformationSource) -> list: + instance = Information(information_source) + news_list = instance.get_news() + instance.finish() + return news_list + + +def news_task(information_source: TInformationSource): + logger.info(f'{information_source.title} news_task start execute at {datetime.datetime.now()}', ) + instance = Information(information_source) + instance.do_seek_task() + instance.finish() + logger.info(f'{information_source.title} news_task end execute at 
{datetime.datetime.now()}') + + +if __name__ == '__main__': + logger.info('This module is not for direct call!') + information_source_ = TInformationSource() + information_source_.is_static = True + information_source_.url = 'http://www.xinhuanet.com/' + information_source_.title = '资讯_新华网' + news_task(information_source_) + # news_list_ = get_news(information_source_) + # for news in news_list_: + # print(news) + logger.info('Done.') diff --git a/seek/zhihu_com/__init__.py b/seek/zhihu_com/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/seek/zhihu_com/__pycache__/__init__.cpython-312.pyc b/seek/zhihu_com/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d9d010f56dde1e19c6b8c4386318cfd2b6e2583c GIT binary patch literal 137 zcmX@j%ge<81lJ?ir-SInAOanHW&w&!XQ*V*Wb|9fP{ah}eFmxdr4eBjlboND8dH#3 zl3El~oSK>)QuhM7=4=vrdNz@esk9SwO1qfApVkTFWKa0h`(`vF z+sGl=_D7$O-@JF0hhBtE{lPjw{ z8n1Rh>&+U-;_7NowpTZx|AQGTy0;DvIwP~JJrxcojEK!y$v=_P$kzn|&!2Yf^aMfoX!XC5XQJ5ACw z;6b(0Ns$hQv^)Ji2C!*6;~!pYcMaMDl*jJ#Gj{Tri)IkP%zfQ{rj19lN!Ze4N5FSE z=?8nTy1+1jC=US<4hd+e5siJQ2UHXu|KYG#Dg56hAVbq0k_54K><6IM zkTfWDC)LN{CNlyTR20OK`e}UJv<#+1zzj*ot7TYFvKO;S`&pHXA-%(tf0$J!)c~WD zIT(hr$ym9UWE?b=1>C4?$XFQ%Km#>H`ljOXmV-_|K^`0?8In3kljMH$>FLcK; zYZLi}XI!UU7u2!*>XTjba!j8;ufR09$NTR>SImhrAqg72TMm#j7>Nzw1U4Wca9E~N zLISyrkdks(zA{+8icz^u!O92Fpq&d9jL2}4(aw=!#5Q5whHwV8b4h>&F}uJXrVnZ& z#whkW-fxppNOcr~vdTl0e}pC}R?0XI!J3lMq-T(oIQ=vtVK!5_2uo$RyL>LjZXe6y z##k@ZjnGOLLUtUR)fUCHC1J^|vGiwSVcEDQk!uXgo>x(M(7Z?}6#zJ)W?>v>u)|mg zSHm#f0A8Bh;?Pqq>BNNINTwypty}_tYHF?Gh-bQ9XnaeOJ97yDs)V?fm!+gl(@QGs z05@98OHzs^>^ceTy0_7;Q)7%?Koq18$p~3cW>*Vv*l+2$$whAIS`}g|+67AzP#2~C zUs5+MAq`caE{=#$hU84Kfa5L^N~v#99+bJ!I$M&2%OIo(DyYn$f>0Fz#zNXYeEgLl z?zSxH$YBZ6x)2s)oaG7Oq>q?7)h^_hMxjV-;}9;Zh1InF^uqt`thC zFQH~^LM^BiM)xB2@`9Ci03$L&%BPhR_;@y78o`-rq1SiflKH*3Y!kSQV8^5fun~NT zR5cHkO7q?Mjej6;V`>-3^l%>9@4@4qz#}D5`WdEHq=&n|b>>GffvznpkONZO{^@V# 
zPQCE?`B!hg`_o%LpStz@n`3!H3^Po(tXefPGE&doRrPRJjn&k*IVi#m*E7xdsnH&| z6c<(MT=X6i5W%V-#`N_9US0B53n+ja&u2`*I1aZAHGCsSU&mO*6AL&VJXakJ zkTkMbRvjd2D0Jw5h0dk9XMXhgI~P8`{M)&|ee1t3|4si_XbXS-qhHSb_O-d=mv8^W z-`{%v7tnP3omZjtE3{*w@z!fE&As{V+__1<_pO)y?$*hFoC{AOlof!IR0M~}D`M{Q zU(HQVL%YpHq3z8oedLH8t|AKIvr@RSS=org#jsk^L)r-!J?wFek_0P-Dl3DGrqCh9 zDrkRzasr4Uk1?#AvvydS$K^Z1O2Lj|B^1dj2VFkG;qkBv$M7&{kdi}BN`tz~No1R` zQktRIj7J@w0Ld9^th(cvljICKR_^rpY0{>mz)r?kEysbjQzRYmFsvGxSjdKFMY`B+5~4JT=Ixq#8aCH`{FKz0dFZcN8tuPF#(M*k;V`^T{xroymxUQ({M zB#bN01WyO!#+scd{>1uzE(b222yJo&)VTxg)Mw8P%+e zRPCS9JPHj3#phM$ROdD4G?9u8F;jEgv^i?p9BK8$Ox|$MA5E4C*}O!j@5E<|DqifJ z=tz{+OmzHp-wkW!rK*cnlN(~zb#d#4sCC1%F=pKww{}LYoq*7rFqdCaT~tk)V&;ap zxhZOHni`3jTjS>TsJR^wdJ-j_6J4{#rRN9E4NNMh@_tkBRza+I<3#5ljKzua>d7@1 zd(O&VR^Kq069!A%P!%;)P3B!QtomXFR$!f9jp+=hj{W3VLT5OmI;}dBbvkQCXHH}n zMN~z9x>taiYQMyA{Ufu6!ZUrR`!2M;vv%s}+f7qd5zEH7VbfK^ru(x-3qk?Gc$`St z#>;K*(9gkK zpV*ewRe*hvyRD?7Uh=y&I8=Vupoj8@QVEnll;b@T?8AC>SHAqi^_I>O`9~%Nz&|R{ z0OX@;OP5~$aVCO%td|4i<9sP97pS{ir5{@@-74u7sT?R=QE32j#bD`fkzQ%!AT4qP zX_Z3jzesF7HBxR3I~N*YqT%)em>rzq2BrxdkudJ1>HpqvgG)S(VJ2LXIL3&^O3K6o z1L`#}pk&J&<7n$IN#Aai{dtDovb}Umm_yZ#?Cm9Kfl-m(U=}fiagMns>@Y z@Jtb|S_IdKaP=a%7&8cEEP`i=_?e5~*&<(ZFAZdcN2{jW$Dird6Qj-XfGzU&0vF3wjOZ1aqiIm^Ps`nB(pc z%BgQ^p9*|Kxr8aGe-fjz7Ob6IqF@ZH6#>VWv9L{mf^=m9WuKBzEvzpiZK2RN-P#Em z9wP0Epka};s~3#wA=0{ggpsDcL=jOe&=bZ5GKelyQYNv!j7%j$-*hqoJ<;(3r*>zG zyHE;>`w52?JnsB69K**1TB)zF?qRn{ImfVGXakqc9MUptgnHVjrCyK;$G`#132C3! 
z!r1Qcr4d|kIKVYqNVoQ(bW54t0^guk;PHUN`2as=kI;*AI3G58^Y`E)883Q>R5cHk z%JSX$jsKUxjoJ6TWU_vD9xziDflF$p(%+-Yx}4v-rWUxIWdY==@t)T0U%hiZy~T!$Q$JIKDfgTjh;2-Kvl-<|XacR3D|V=Cl5a$!>B8QwwB zCN|UIqen=p4MN%stKq#-h)ScH>IgYH;-?6j%|wCcHZHJDuo_g^;U`>!E|M6_<^0{2 zLBGf27wC)?*6(pShxUQoY_g!b^b$5fq)}`Vj@fY(VNrH&;9c3+)I<3-()99 z+DW-MKdpbvAc|1ub+~+Oeow&b18+GQIPU?e>ll}pq|88ftf_umfMNVT^P_%-e4toY z7Z^5ob`0*Ph`}8_y~DfPx3?cX;$}$Bv9AJ``CxtPL897rkgmQt{cLLhsS^q=-AbT+ zUZA~@wZnjw7+ z_O0H%%e-w})01nq9%<@wH`09#+xP8Vx9fm;Pav?nY2VJF2Iro!?Nrlap3&XIV+N4m zsWr80LshLUwNP9evQ^tyyq3kra<~ae4!q9g^MQwqf@l$>di)NEG#sYbtpsT8P$X`WG>nAxJUyZNsthS*z6<`G3bEXS*;Mgu@A#` zrI1<4YWV$`R1G?*IsH)Re53ZGY%j_lfh?tFbQaK53u-{NEt`=P1O-IoUIzmbD|`X( zA$Tds6oh~_8aWKfXvX3XAKT!bx{~7_)2D(PNl+*Bre|Q!Ky;(9D4abn>Bb)#-Z-0g zzTjNJjHc{nUg6n>7YUVeK{D%K2<8zjWS!6=t``ty>#s2v5Br3LwSM^3`J@- z$E>Y!Yg^RX_I_8)+84L(j#_t5bU|duQgO+2(G;mZ6tg(vmZ7L+Xkzi~tl@|h+p1Szd)S6eGj#-+{DiZyBBabDQ3&m43nH6TKqCN2Z^O6+vJN zG2F0my5paE-|fAEzt6z$BA{Jud7mESkwdrGlSO4K`*(oa_A?UYDA*`xsYrxH0pK2;)|PcziJ8swkm zT6VU`KV8SQwrBtnQCN1>$s;QfBvL0w5RPV~QN5=~8rf{wT`rB5%7K2gTmz8kYRev@ z^r~I~kgG;HK&}=^QMuH%S0(MYDbX{h&*3HQb~eLqhpQt1A!j?QvD=RZ93H-fLZNPI zGs;lBiE=^qI>xd3O(BYj|&KY5> zf{!~deC*|gLufC+KaB#=@VKkdKi74Gs9(xo2LTJM{_&zVqy}cV7H4yp`}aJ9=_~rv{|T)U;7Z zI{v*v=JE~txy45zrPvgd9kmUk>?mYxw#x^v`V2`VUlw>XLM50GG@{I3*uq8aV1`Hv zo?hr9)RWNoBK*@n$c|$-)rRrnv({N%-p`a_c|xBb*IS}`OH5xDR?HSQPg|!4V}%{z z?9cMb!yPD8-g~k)QV3VwWLLzrZbs9XFc!s)RZ%0v@~0#*<65}=G!0V?Q^&4pwtS() z@+-bpV;RO%#V3oup;1isxvk|}o2CEUEZ-*KHYGCS?RHL2{OcC4ty{l-?)I;K9zZ?# z;@q3#x55*)Lf%r-v{NqbH#W$V;ePGF{qBLyT?nt|9)4)f4rKLUGdd1AXtG`?a5KOb z2-SK9-g%NbtanlYg3~@*n6@1b5YodNlg32|+@9mRb2EyTDr-uE!c#!!44VzV)TxJm zyVv1E;&97=vzNK`fu6d+9OKR+Y^23O%kszy-t`@h}K5<`MkB#IDW5*aqx0O_R_hs=~cCl7&R>y$Z2L zWZA6~%M%jAbW`lDGp5gBlQdx`X-g7s+fD2=O`IJGkrBBiJKm|m=2vl?&UWgVo&C<0 zt|SOqhn^XZzVn^$eBU|WIrn?fcFGYn8AXak>pqWt=_oQ~C=s6vRp zUxhf0lTidFep%fsM07+TpwYe(o9Z)g24`-9PJvjxLf>Zu`c(K!XGluv_qavaD(p^e7e(HxmZ(8av0pp)=OdaWdnb;e^# 
zDdlnAGc0htRVAqf&UH*u^?L+NUsA#wF@e@*bGRL%&E`v2_Ee{!(t$w8gl-fq8Vkb4 zqJU~Kr}+E6fcklwPs6JL>DKZi#g4;~L+ieabh2E<7f?q^8U-R1uzgYhHh2+KNEk|L=mbgaa=1@O8vD=? z=Vp0q4S4*)OB9FQn?wpCFX_K+cX>J4-$}ZTr}{bBCrVVm%Oh~s6drqdDa|H&h8+Di zFYmHpO-O2a6FShA4Lc?fg?}Ux2qc0NQeY7{9^9g~G6TDnwLEAv0&Pb)q$rSsBW#3};#c?bm42*{(BP!L0gux&eHRG5zep zGY2DdS%@y1Iy6sj16*$5MeTX*RB_nc5HW8LnYYjQ!sh*fgMTtx#(}Rky^{c{{NjsM z=c}f6g!5`5dG(>Z`th~}rYyo#hnVW=rcan{w+tw^^i~NnF$*SUQa_>pXZl)NQBYg- z*Dvysx%P8Jq~E*9HQ zh<7??cLpu(5vC)=blh1bF~yxPG{{sA)Vg+Jk-mAB?XH%EcG3IhM z1(?hEB!-K0ooe!OrKQ72UeQv(;fj$4%#~bA$6oSEi;USz0p@)*3A68~WOWvhYl186 z4rCWt4{u0zfe;Rikbjp8#KG7GF3=wj#t0wko7Uh> zaeVq3e0m(8!DiSq;YCp!o;y=H)kumz1pZ@hejP z?o$TajLKL3X_7~(xjmG=Ibx5{P^6`v(G@x84G$>^ssxM zzD|y4alp2MUF{0C^D8W#GS=Pa>B4k0;$%6Y zpLfV{t&fSHaO&b*IEI!lyt@3-kD|)_ z^QEyUu=3(>A>N1R+z|i=>a7FXSmi{p@tojwiBXE1zk258Z%;SZ*Vn_+o4Ga!6 zzj^t_3$M$!wc!$A%Htf9$M%88$pMd(EvsIlM283ge)^Y=#DqJ-~kZdfjfEA*vRtxao2{sz0lzV#L<63Qj2i; ziPVL!z;mRFb4%0^?{JGeHl>h#xQ}^wY=QYo+*RQYKMZTCoWMlYlBTF7b~}6xP)LML zGCmP|^|m2+_Phb3&0b*XVq-~5(i7sbdyzmp(HdDa{V|Ci(eASK57>EoKh)R+ek)K# zi*W{p6m~7(f`*(I#k**NWJs#J`It4CQEd4xSSgglqg;~`soRPFCnp-ef2`+vR^GYB zA3Qvc6P)y{N!^4lVyX<8DyQjrQ!`Fu0CUa)i41^-2}8tG5i(WGn<|s0FbGqVsS~N^ zgilOmtLZ4acr_apS}t~-@0zlO3z`Ex*BDEL*%D&5O!>l0<4nWMeL-eNpyN7YovxV4 z4l_Fg9gF$(Gg-67aDHc?d$FKlred}zT+kKhSnmvr8)uq>El0wI zM+3drN=k!t;i^hwJWMRq?2Od3hH6@a_dXh~c`RtE`ZTjlu>%Avc83dEBL(|H1^a$i8!k8khbb-z(glAsSK#b0 z1M;zjqSA{4=Ldq7yTe7Tk)nN}qJ85Bu9sF!9sfz~_~At+_gu|9vt=Qtbn07kIo9d# z%r?z!-Lq&ex=5d=gOvwAsQI8USn=Sz`Jn}K$wE%$bopFPU3~h1dGmvxnhO`0@@Yee zX$mssiL`UaJkxX?Qp&>|=j+%326NzUuwz85Wx ziy$U5d%;|}SWvZCSiM+O8wGA<8BF?BWYFpV`o#ePF;$Q~f)W=^*^~4H9W3sOm=4XE z4&js$(09J5yGvR-_jS_RxlN_Db6+Q|ox4j~`wo09&$d@}XQ9j6_8i=Uu57IY`2DPX znTM)Xzpo--@Ox_>z#ot*fIpyiV7L{-f7qitR7rlAl69z*{IEa+m=8-SjHx6sT&?TQ zpgwG}9L}RYV({!oc{E@?+G6Q8Qdd(j=Bkkb%+(A`e>F?jb1!+d$kIcRSNCdq5&D>- z0rPRXrDrGkaifgcNny;rB+ULN%IeJ`d#x$BzKngX%_iw>*l2s9%wv;io9zj&-4&hT zadi@xd7zSmV?zGh(ETftE6lQ8;c@;acP0P?>`hLg&s8d_bX8kQ6;HL_M(}gHi+F&b 
z^wDK_1(uE7qs`{UmCm5ORtSB2gZA?CW&t69fA-GG~|N2TzZ!>lks(Ool20+zmx|5im8J?*q9qzX}OdY z^(pcp9|S2pC=f|HIk>PvaZ1()F12H4%=ci5ZrptbcR1mWRq|V->fOUX4mcbR34Z9{ z8z6|!kmWN}^%=Tn36*}Xp@_`?F+EKX`dbM1tNYbRO$U5)WbQR}+GywL!~b|VXeysq zSNylh{C)lC!3AS>#8?_KmIlj?gpEf>J3m$H&Xfg?Jy$cY&IUNJ 1000 or content_len < 100: + logger.error(f"skip本条内容,内容长度:{content_len}") + continue + if total_characters > 5000: + logger.error(f"contents_result长度超过5000,跳出循环") + break + total_characters += content_len + contents_result.append(content) + # 打印contents_result的长度 + logger.info(f"contents_result长度:{len(contents_result)}") + except ElementNotFoundError as e: + logger.error(f"元素缺失:{str(e)}") + except ValueError as e: + logger.error(f"热度值转换失败:{str(e)}") + + except ElementNotFoundError as e: + logger.error(f"热榜容器元素未找到:{str(e)}") + except Exception as e: + logger.error(f"获取热榜数据异常:{str(e)}") + finally: + if self.tab: + self.tab.close() + # 返回json格式的数据 + return json.dumps({ + 'title': title, + 'answer_count': answer_count, + 'comment_count': comment_count, + 'topic_description': topic_description, + 'keywords': keywords, + 'date_created': date_created, + 'date_modified': date_modified, + 'follower_count': follower_count, + 'contents': contents_result + }, ensure_ascii=False) + + +if __name__ == '__main__': + # 测试用例 + logger.info('知乎采集测试') + # 执行采集任务 + zhihu = Zhihu() + result = zhihu.get_content('https://www.zhihu.com/question/588507809') + print(len(result)) + print(result) + logger.info('测试完成') + diff --git a/seek/zhihu_com/hot.py b/seek/zhihu_com/hot.py new file mode 100644 index 0000000..b677349 --- /dev/null +++ b/seek/zhihu_com/hot.py @@ -0,0 +1,85 @@ +import datetime +import re # 添加正则表达式库的导入 + +from DrissionPage.errors import ElementNotFoundError + +from database.database import get_session +from database.tinformationsource.model import TInformationSource +from database.tnews.model import TNews +from log.log_manager import logger +from seek.seek_base import SeekBase + + +class ZhihuHot(SeekBase): + def 
get_news(self): + """获取知乎热榜数据""" + news_result = [] + try: + # 访问热榜页面 + self.tab.get('https://www.zhihu.com/hot') + + # 等待热榜内容加载 + self.tab.wait.ele_displayed('.HotItem') + + # 获取所有热榜条目 + hot_items = self.tab.ele('.HotList-list').eles('.HotItem') + + for item in hot_items: + try: + news = TNews() + # 提取标题和链接 + news.title = item.ele('tag:a').attr('title').title() + news.url = item.ele('tag:a').link + + # 提取热度值(去除"热度"文字) + heat_value = item('.HotItem-metrics HotItem-metrics--bottom').text + logger.info(f"热度值:{heat_value}") + # 使用正则表达式提取数值部分 + match = re.search(r'(\d+\s*万)', heat_value) + + if match: + news.heat = match.group(1).replace(' ', '') # 去除空格 + else: + logger.error(f"无法提取热度值:{heat_value}") + logger.info(f"提取到的热度值:{news.heat}") + + news.source = self.information_source.title + news.occurrence_date = datetime.datetime.now() + news_result.append(news) + except ElementNotFoundError as e: + logger.error(f"元素缺失:{str(e)}") + except ValueError as e: + logger.error(f"热度值转换失败:{str(e)}") + + except ElementNotFoundError as e: + logger.error(f"热榜容器元素未找到:{str(e)}") + except Exception as e: + logger.error(f"获取热榜数据异常:{str(e)}") + + return news_result + +def get_news(information_source: TInformationSource) -> list: + """对外暴露的获取新闻接口""" + zhihu = ZhihuHot(information_source) + return zhihu.get_news() + +def news_task(information_source: TInformationSource): + """任务执行入口""" + with get_session() as db: + news_list = get_news(information_source) + # create_news_list_if_url_not_exists(db, news_list) + for news in news_list: + logger.info(f"采集到新闻:{news}") + +if __name__ == '__main__': + # 测试用例 + logger.info('知乎热榜采集测试') + information_source_ = TInformationSource() + information_source_.is_static = False # 知乎需要浏览器渲染 + information_source_.url = 'https://www.zhihu.com/hot' + information_source_.title = '热榜_知乎' + + # 执行采集任务 + news_task(information_source_) + logger.info('测试完成') + diff --git a/seek/zhihu_com/zhihu.py b/seek/zhihu_com/zhihu.py new file mode 100644 index 0000000..e75989b 
--- /dev/null +++ b/seek/zhihu_com/zhihu.py @@ -0,0 +1,173 @@ +import re + +from DrissionPage import Chromium +from DrissionPage import ChromiumOptions +from DrissionPage.errors import ElementNotFoundError + +from database.database import get_session +from database.thotcontent.crud import create_contents_top3_if_url_not_exists +from database.thotcontent.model import THotContent +from database.thottopic.crud import create_topics_if_url_not_exists, update_hot_topic +from database.thottopic.model import THotTopic +from log.log_manager import logger + +def get_content_from_meta(metas, itemprop): + content = None + for meta in metas: + if meta.attr('itemprop') == itemprop: + content = meta.attr('content') + return content + + +class Zhihu: + def __init__(self): + co = ChromiumOptions() + self.browser = Chromium() + self.tab = None + + def get_topics(self): + """获取知乎数据""" + topics_result = [] + try: + self.tab = self.browser.new_tab() + # 访问知乎主页面 + self.tab.get('https://www.zhihu.com') + + # 等待热榜内容加载 + self.tab.wait.ele_displayed('.Card TopstoryItem TopstoryItem-isRecommend') + + # 获取所有热榜条目 + hot_items = self.tab.ele('.Topstory-content').eles('.Card TopstoryItem TopstoryItem-isRecommend') + + for item in hot_items: + try: + topic = THotTopic() + topic.source = '知乎' + # 提取标题和链接 + topic.topic = item.ele('tag:h2').ele('tag:a').text + topic.url = item.ele('tag:h2').ele('tag:a').link + pattern = r'^https://www\.zhihu\.com/question/\d+' + result = re.findall(pattern, topic.url) + if result: + topic.url = result[0] + else: + continue + topics_result.append(topic) + except ElementNotFoundError as e: + logger.error(f"元素缺失:{str(e)}") + except ValueError as e: + logger.error(f"热度值转换失败:{str(e)}") + + except ElementNotFoundError as e: + logger.error(f"热榜容器元素未找到:{str(e)}") + except Exception as e: + logger.error(f"获取热榜数据异常:{str(e)}") + finally: + if self.tab: + self.tab.close() + return topics_result + + def get_content(self, topic: THotTopic, db): + """获取话题内容数据""" + contents_result = 
[] + try: + self.tab = self.browser.new_tab() + # 访问话题/问题页面 + self.tab.get(topic.url) + + for _ in range(10): + # 等待内容加载 + self.tab.wait.ele_displayed('.List-item') + self.tab.wait(3) + # 向下滚动页面,直到所有内容加载完成 + self.tab.scroll.to_bottom() + self.tab.wait(1) + self.tab.scroll.up(100) + + # 获取话题/问题相关信息:话题内容、keywards、话题创建日期dateCreated、话题修改日期dateModified、回答数量answerCount、评论数量commentCount + question_page = self.tab.ele('.QuestionPage') + # 获取话题属性,为QuestionPage的前9个meta标签 + metas = question_page.eles('tag:meta')[0:9] + # print(metas) + answer_count = get_content_from_meta(metas, 'answerCount') + comment_count = get_content_from_meta(metas, 'commentCount') + keywords = get_content_from_meta(metas, 'keywords') + date_created = get_content_from_meta(metas, 'dateCreated') + date_modified = get_content_from_meta(metas, 'dateModified') + follower_count = get_content_from_meta(metas, 'zhihu:followerCount') + # print(date_created, date_modified, answer_count, comment_count, keywords) + topic.content_count = int(answer_count) + topic.comment_count = int(comment_count) + topic.follower_count = int(follower_count) + topic.keywords = keywords + topic.date_created = date_created + topic.date_modified = date_modified + try: + topic.topic_description = question_page.ele('.RichText ztext css-ob6uua').text + except ElementNotFoundError as e: + logger.error(f"元素缺失:不存在topic_description") + update_hot_topic(db, topic) + + # 获取所有内容条目 + content_items = self.tab.ele('.Question-mainColumn').eles('.List-item') + + for item in content_items: + try: + content = THotContent() + content.topic_id = topic.id + content.url = item.ele('.ContentItem-time').ele('tag:a').link + upvote_str = item.ele('.Button VoteButton VoteButton--up FEfUrdfMIKpQDJDqkjte').text + match = re.search(r'(\d+\.?\d*)\s*万?', upvote_str) + if match: + number = float(match.group(1)) + content.content_upvote_count = int(number * 10000) if '万' in upvote_str else int(number) + else: + content.content_upvote_count = 0 + comment_str = 
item.ele('.Button ContentItem-action FEfUrdfMIKpQDJDqkjte Button--plain Button--withIcon Button--withLabel fEPKGkUK5jyc4fUuT0QP B46v1Ak6Gj5sL2JTS4PY RuuQ6TOh2cRzJr6WlyQp').text + match = re.search(r'(\d{1,3}(?:,\d{3})*)', comment_str) + if match: + content.content_comment_count = int(match.group(1).replace(',', '')) + else: + content.content_comment_count = 0 + content.content = item.ele('.RichContent-inner').text + contents_result.append(content) + except ElementNotFoundError as e: + logger.error(f"元素缺失:{str(e)}") + except ValueError as e: + logger.error(f"热度值转换失败:{str(e)}") + + except ElementNotFoundError as e: + logger.error(f"热榜容器元素未找到:{str(e)}") + except Exception as e: + logger.error(f"获取热榜数据异常:{str(e)}") + finally: + if self.tab: + self.tab.close() + return contents_result + + + +def get_topics() -> list: + zhihu = Zhihu() + topics = zhihu.get_topics() + return topics + +def gather_task(): + """任务执行入口""" + with get_session() as db: + zhihu = Zhihu() + topics = zhihu.get_topics() + inserted_topics = create_topics_if_url_not_exists(db, topics) + for topic in inserted_topics: + logger.info(f"采集到话题:{topic}") + contents = zhihu.get_content(topic, db) + create_contents_top3_if_url_not_exists(db, contents) + + +if __name__ == '__main__': + # 测试用例 + logger.info('知乎采集测试') + # 执行采集任务 + gather_task() + logger.info('测试完成') + diff --git a/seek/zhihu_com/zhihu_hot.py b/seek/zhihu_com/zhihu_hot.py new file mode 100644 index 0000000..984d90a --- /dev/null +++ b/seek/zhihu_com/zhihu_hot.py @@ -0,0 +1,156 @@ +from DrissionPage import Chromium +from DrissionPage import ChromiumOptions +from DrissionPage.errors import ElementNotFoundError + +from log.log_manager import logger + + +def get_content_from_meta(metas, itemprop): + content = None + for meta in metas: + if meta.attr('itemprop') == itemprop: + content = meta.attr('content') + return content + + +class ZhihuHot: + def __init__(self): + co = ChromiumOptions() + self.browser = Chromium() + + def get_topic_url_list(self) 
-> list: + """获取知乎热榜数据""" + _topic_url_list = [] + _tab = None + try: + _tab = self.browser.new_tab() + # 访问热榜页面 + _tab.get('https://www.zhihu.com/hot') + + # 等待热榜内容加载 + _tab.wait.ele_displayed('.HotItem') + + # 获取所有热榜条目 + hot_items = _tab.ele('.HotList-list').eles('.HotItem') + + for item in hot_items: + try: + # 提取标题和链接 + # title = item.ele('tag:a').attr('title').title() + url = item.ele('tag:a').link + _topic_url_list.append(url) + except ElementNotFoundError as e: + logger.error(f"元素缺失:{str(e)}") + except ValueError as e: + logger.error(f"热度值转换失败:{str(e)}") + + except ElementNotFoundError as e: + logger.error(f"热榜容器元素未找到:{str(e)}") + except Exception as e: + logger.error(f"获取热榜数据异常:{str(e)}") + finally: + if _tab: + _tab.close() + + return _topic_url_list + + def get_content(self, url): + """获取话题内容数据""" + contents_result = [] + _tab = None + global title, keywords, date_created, date_modified, follower_count, comment_count, answer_count, topic_description + try: + _tab = self.browser.new_tab() + # 访问话题/问题页面 + _tab.get(url) + + for _ in range(10): + # for _ in range(1): + # 等待内容加载 + _tab.wait.ele_displayed('.List-item') + _tab.wait(3) + # 向下滚动页面,直到所有内容加载完成 + _tab.scroll.to_bottom() + _tab.wait(1) + _tab.scroll.up(100) + + # 获取话题/问题相关信息:话题内容、keywards、话题创建日期dateCreated、话题修改日期dateModified、回答数量answerCount、评论数量commentCount + question_page = _tab.ele('.QuestionPage') + # 获取话题属性,为QuestionPage的前9个meta标签 + metas = question_page.eles('tag:meta')[0:9] + # print(metas) + title = get_content_from_meta(metas, 'name') + answer_count = get_content_from_meta(metas, 'answerCount') + comment_count = get_content_from_meta(metas, 'commentCount') + keywords = get_content_from_meta(metas, 'keywords') + date_created = get_content_from_meta(metas, 'dateCreated') + date_modified = get_content_from_meta(metas, 'dateModified') + follower_count = get_content_from_meta(metas, 'zhihu:followerCount') + # print(date_created, date_modified, answer_count, comment_count, keywords) + 
topic_description = "" + try: + unfold_topic_description = question_page.ele('.^Button QuestionRichText-more') + if unfold_topic_description: + unfold_topic_description.click() + topic_description = question_page.ele('.^QuestionRichText').text + except ElementNotFoundError as e: + logger.error(f"元素缺失:不存在topic_description") + + # 获取所有内容条目 + content_items = _tab.ele('.Question-mainColumn').eles('.List-item') + + total_characters = 0 + for item in content_items: + try: + content = item.ele('.RichContent-inner').text + # 计算content的字数 + content_len = len(content) + print(content_len) + if content_len > 1000 or content_len < 100: + logger.error(f"skip本条内容,内容长度:{content_len}") + continue + if total_characters > 5000: + logger.error(f"contents_result长度超过5000,跳出循环") + break + total_characters += content_len + contents_result.append(content) + # 打印contents_result的长度 + logger.info(f"contents_result长度:{len(contents_result)}") + except ElementNotFoundError as e: + logger.error(f"元素缺失:{str(e)}") + except ValueError as e: + logger.error(f"热度值转换失败:{str(e)}") + + except ElementNotFoundError as e: + logger.error(f"热榜容器元素未找到:{str(e)}") + except Exception as e: + logger.error(f"获取热榜数据异常:{str(e)}") + finally: + if _tab: + _tab.close() + return { + 'title': title, + 'topic_description': topic_description, + 'keywords': keywords, + 'url': url, + 'contents': contents_result, + 'date_created': date_created, + 'date_modified': date_modified, + 'follower_count': follower_count, + 'answer_count': answer_count, + 'comment_count': comment_count + } + + +if __name__ == '__main__': + # 测试用例 + logger.info('知乎采集测试') + # 执行采集任务 + zhihu_hot = ZhihuHot() + result = zhihu_hot.get_content('https://www.zhihu.com/question/14351228309') + print(len(result)) + print(result) + # topic_url_list = zhihu_hot.get_topic_url_list() + # print(topic_url_list) + logger.info('测试完成') + diff --git a/task/__init__.py b/task/__init__.py new file mode 100644 index 0000000..e69de29 diff --git 
a/task/__pycache__/__init__.cpython-312.pyc b/task/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1fdafe34ab56c4052725c2fecd627bad82555177 GIT binary patch literal 127 zcmX@j%ge<81Xc^f(n0iN5P=Rpvj9b=GgLBYGWxA#C}INgK7-W!l8dm4NzPA6jVVYi zNiB*gNi5EeiI30B%PfhH*DI*}#bJ}1pHiBWYFESxRL%&*#URElcL+8hZ1o2PQ(V8jCJnsW$kYEXx3=yp=C0j}8 z>q5G)zEw{mnLb2?sa6W}RLBrEwi+>S2${m>Rx{>}Axk)~HIF1L1nXo?oRu|mHrB$~ zI*e@IB}1#7wQ`OQ9c#NpwdS*SP7k~TcxRT+2fiRq`U~ceY0Z98DcBQ=wx0_|I*+xV z<=9w=<3VU_k48FzoxoW;IWZt`f)I>GpzB6mkrB=bfwp*nyTG-_#3)*!4@El_3mOST zm@ud02_nP8)FAL?F2Z82U7vLbB4+rZ{&O9PoFl?hZCyHzTcj+RGfpkaHBKT&v+GB* zs}uU175|S$-76X?N!ME-YC$C=;wA)5(%&R*>9x5e5!XpNiR^)-NEnvQr6dYg7$gI0 zyh*X9TgKc~esk;xu$yNOem?Wbotclnn7uysuls$^emgev#r5fX_hvpB{`QxbX72aS z+`cmNi@tB~U7Nl2>FnTdX1*NkDe#@z;0tf?v9#}Cqwh$g?^ttBA&mn-t0CfOMx=Y3 zGz4Oa#9Bwk1dc{gqI>l9-|u*7rUedy&kBA@q15OJBHw{I3uFotLM)G*;5YIphP(wm z&>>J9i$M>-D#fM+KHCd1-2$hkf?$LNG;S|fpKLtc9%Z@H-JHnrr%@7}4l@y^ljCu& z)py60LdY-N$tE47Bkuwg#qbj(c=Qs}?y|u%183ywE%&J}9iKbKdnVrgV8VT@w|T0l zWN^p84tdq4&)7djejgb>IkBaAqUinJ_cEq@*;F!RwkG4rb8=bjNV8nKO}1@+WZv<# zq$*XDE~!h}7KZ&Jbt9Ey7st2DKRhPyX^~q_%eFI*%pX06_Y>eQFuZ*sjX`99A5QW6 zz${5|=r3=hSwVsn7mkTUbgi81*@Pwu9i*YQlFU6s%hf|)o0ri2?>9)g(X17a(xhJ( zD+dL$mv28C?o+dI=JP+Q$*Cq`vtQ5KVG0h4Vh-Rc5D0)2MQAZc1f&(h&KCiN1i=-A z!;$67K(q>e!Zvt7JPX`|Wdmi`%6kt@QN{h@U}7MV>iU{$_&a63bYLpK=xY3O{L01N zgE^yP>mO0ucsJ5*5wwD~BqmN6!!h|Ycs4gCns+u6OCmsu3lZo-IlHValp8qq`-pzs z84nzMC<)lO&~}6Sk@9|5qTB2E_67SHHp#+ zPQoIVXoHePGA!&O4>AsgVObMV^l_8SnhuR7tdg~#6b?v6mOQr}V!Z=&c(wV9RLWyX z$+VItRwtS6>PNHQg)FYpR>!E0aj?OJZ5W1is~6{MT$fMd z7O58ZgiTzhNhO=6l{KGx3-)9c{n{97(Z+s8N;WnRxr?>pw;EU*l57XbYkgpX8FsV= zs3S}Bv(%ZT1zC!9!@NS&U)ghai$<+52<;91=33b^aEUEs2TAmkeI4LSH{Mm#Yxm-o z^lb5ev8`kjRlqm5trTplQCH+XU$O18oD3Mgy`@;?JPpwC8~)ALLkorLOKQUMUo5pQ^a}F zU(u|XgAuqbf69bf;HoxdZ*4^vWECgRb%&UC4z9|eAOf^g3d2F6Bgk`sP9Dk872acL zf$NIyanlUTax8s5D4wM?l{AmGtx)ZJG%|q&qi^wAiAyB-=1)6a%S zo;`RlbN}ib3bsP-RLB}0-A{AqDxAX{D!6l~j?SS&?zbpL)n3II4n|@kr%-33F<#NL 
zOk6R9qY?3}Vg)`B?FgLbxN`~>=NO)^gta^}q^BU)Xl#W-u?*-b;A4>h;sQ(-bNcm) zSrfia16n}$lw*f9MZl#&mKOP7CjbyyjCXU5*i9jxW7s(0Cvd7zk6VwIW|oNSXn2>3 zr-B0QFXfR>c(iFAZ66Slb}6Ya*&2=_%pK@vV9WN7SfpJ9WTTFmvL+}_Ei3@UF*d{d zP%Oc4ca#@H!8XONvH?|_Vgi)ThycJ46X=SzDS8CQOb}_*!{`lTHzglA5Avj&S0TP) z5V}J_5#T%*QSweOp%}NH=cByhP;Y8v9Coxl!d(yp$gvn2X5l^>@;i9A0|`aPwkZY# zi3Ak_Y(Y+dT1xP5qsLY>$ArcMkg~?C<#sI4{GjNO?Z%EE7h(l9jOc9wx)T9IM5)v1OXO_|CysiQZ7nKd;d4Z}?tb!>45^Cj4P8<_7|ZvL)u*W~Ww>D|ZW zT_@zy4ylzDJ-Nic>CpYd- z!(aLSe)H6_1qZ(>{VGp(9eeC-`7S{cZqL6>L|(~N?`7{x0oBVZ2e&mj^?!95n#xQ| zFefU17;~b7k&8J2yJx=_P;I|M!Us8z5*G-XDxg6PD(8L?@gKlI6wN?z60)0eB;${$ zGK_rrqagDa@{K?+CReK9yGyG(^$VO(-~{ftKZck#_)i9e8t_)}R_+k0<@lq( zyj?8@@9?cK3AsQx3y*m{Ns`|X&Tk0o46$Q|D0@QqpAd~th^l$YLQ>xqo5=u~s-7nh x9owwZ@%_(nf4kZC1trgG*@0ICLnw2;=Tf_IVt)55fnQlD^h zG$lB#yth@^@%fpva;RKyA8bsxZ)y;vh*t_#<=U+L9|uiMby_NtFt z=M!wb@11D((TZL>gNyieF9rM?2{3mI<2J0U@=--Jc%9s_c3u-3{rMGb22AlPR+SG^ zig6j7L0Jv5P29)F#AZ*gmX)BNQB`K0{!B$SWq-G7NQD4YLKV@E#N2 zCzM)rP*1I=oS#(Uf^*UdvOFsgNA`Nek-gJUyJu0(MJMR4cGSFW*bG}L=g8D97xD~U z#j-_NA%D^dd(dcZ5W-H&GDaQsqq8|u=6#q_oq&1LiLB9@j_NjLA#XEB;?UXf-zcZm z+uJB-*=f3tXBOs{GfqvGu<5OmBM_UDPxr^mzGH*ObCyYtmkA^EIFCl#ZF*lDb~$en z>Q*aVo^~RB`BwR|Y@!5j*K~3p=98F#BS%|(ht3HZ$ z&5k_W{$8!U=YC{cBeJU=*>#y+J9p*W-N@km=vLR+TaWbCBJb7)4m1Xi)CZ2#2F4bn zTP_^@?qDssr;!}0Cx>c@;o9)gTKx0hqsREN_&bgGa6LXeFMYpmS%&VuhPJn^?QLlL z>e{{=_RS9$w2yCoQP)1XbD*vroo`)?^)zC;>#^O}I~QWZ^YWs;2ykCz${4~CJUMY)zevj!LoFVC^%cTSjVH)oD=gF0_MPe9N;5j;hcJmCC zA&-^&=uR|wL)!f)uqcA?5ZeEM)DnEU1pQ0UvjjVz$(w}3!tx%F+Gc0&im?Y$d#xjR OPujLDg0zLEaOJ7%Q6n?Wm{z+}TA#s|L+THvd;>xkwDy`b83aBAQB25kjBw`s>8}B4IU3<;! 
znEYU)B5{aBE}Vi?RpN?r;97C2k|PpYlU8oG$q6`c3k_0HxG=Np6(!`vti11gpWmB# zGw<8^As+7nHu~J}78MELXYK?;?0~}$8Q25^3}ixqTfsyXp&%fRMN_Ou1J{yV}9-_GHW zY}rkqoy7sRI#HLQ2{hO+G$1Iii5lol_~283rqB?U*byBcpT`&XO#|9Y`x8Ms)(6q6zkEgE=1ubZ@3 z(aY>BsFe*&JiX^?tFCgKBz!H2$jR&`P({DJDfoDUPoml!w zVz8YUZ6!vX#06(O-yWZ7jn6pa@9imlo3XEAj(VxBUTvvYopaZmYqy=$JCBripD6vm zB-8C=ww25}W3RWz@~yG_Zt|`5{NCB&Z}LudqMe;;Wv6z}=GSKq1TlUKJylXp`od%7 z;_uxseC34!(Tgvob2OgH|Bbttb@ zgtyph4KJapSFomi3x|vZlH_$nZulyjAm^CNFDv^Jd$Fw}^bC|=A+rzD`!KW*=bwk- g=oUIqL5i)f{Un}wDk)C);A3g%Km_SEl4fat17k;PT>t<8 literal 0 HcmV?d00001 diff --git a/task/default/main_spider_task.py b/task/default/main_spider_task.py new file mode 100644 index 0000000..d2b8b9d --- /dev/null +++ b/task/default/main_spider_task.py @@ -0,0 +1,26 @@ +import importlib + +from database.database import get_session +from database.tinformationsource.curd import get_active_information_sources +from log.log_manager import logger +from task.manager_task import execute_task + + +def main_spider_task(): + with get_session() as db: + information_sources = get_active_information_sources(db) + for information_source in information_sources: + if information_source.module is None or information_source.method is None: + logger.error(f"{information_source.title} module or method is None") + continue + # 动态导入模块和函数 + module = importlib.import_module(information_source.module) + task_function = getattr(module, information_source.method) + try: + task_function(information_source) + except Exception as e: + logger.error(f"{information_source.title} task error: {e}") + + +if __name__ == '__main__': + execute_task(main_spider_task) diff --git a/task/hot_topic/__init__.py b/task/hot_topic/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/task/hot_topic/__pycache__/__init__.cpython-312.pyc b/task/hot_topic/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c97b7e0761418ebec2c921a5442d970bc03779bc GIT binary patch literal 
137 zcmX@j%ge<81TPnCOb5}AK?FMZ%mNgd&QQsq$>_I|p@<2{`wUX^OC!Q6COJPPHKri7 zB(*4}B(XRTZlX-=wL5i3w5BM=vZ7$2D# K85xV1fh+(Wtsp-D literal 0 HcmV?d00001 diff --git a/task/hot_topic/__pycache__/zhihu.cpython-312.pyc b/task/hot_topic/__pycache__/zhihu.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0520d7ab15e0a7a6eeb51c72033902b37c9dd53d GIT binary patch literal 1382 zcmZ8h%WoS+7@vLFYp-3B1(Db;UR_WHaquA$kw9uB#DSYpddQ!^ogTbRtC~?Nkl@l=N^;`F%zD=~ox{xc{LS}$^V|I` zpU(oiO5cUGA0z<&VoI`vSU9>$g`2`?bqwAwTVn?b`GUpC$|^kDrQeN1b(32M z<W6;pV0p88DnOTmQbPe-c0x=ndc7Fs+u4P3-IIzSVZwcy!CRq27WRBAwAypCIc| z36)ldC_#&U04Sm}DpFw9U~rpmH^9Nn{P)=#@BLX=xL-IM7S8S$R)^xV$0<-eGdy!~ zzpzfVL#izgE)K2^3oldkNTzB(e<0`k?p^uhNG;u07ejS%kl$C&_0!B=+d45))a|uU zd48z8bnC@IZEqv2ytTjd_OIrhrAr67+3k0BR(4((o?08Kuiwp`ALX9;dv0Mcw>KAR zuZ@;p-MbtvzcJF!4OcIR`un5Puk5Xbr!R~aD}(mWUms`0`PF{zI14f-cRsi$YmZ8_ zq9fW5<&9yursEmL6)j5^Y!jQGnE|ZeXt-lSRHg?Vz3JnpBl7`-jmr4-EZ>bKI@B>M zX4#A@uIZVzWT5OTji$=(_DmNV#xDSsn3XY~$md<(Za3itv_Mt%#|T#Fb|`Qh_Xntl np#A`?JOE4oq>9|q;TjNg{mnZ<;dgOnyL3-{?oa??F@pOK@;YeM literal 0 HcmV?d00001 diff --git a/task/hot_topic/zhihu.py b/task/hot_topic/zhihu.py new file mode 100644 index 0000000..6cdb86d --- /dev/null +++ b/task/hot_topic/zhihu.py @@ -0,0 +1,36 @@ +from database.tvideoscript.video_script import video_script_not_exists, VideoScript, create_video_script +from seek.zhihu_com.zhihu_hot import ZhihuHot +from task.manager_task import execute_task + + +def spider_task(): + zhihu_hot = ZhihuHot() + # 1. 获取热榜主题 + hot_topic_url_list = zhihu_hot.get_topic_url_list() + + # 2. 过滤掉已经在数据库存在的主题 + hot_topic_url_list = video_script_not_exists(hot_topic_url_list) + + # 3. 选择前10个主题 + hot_topic_url_list = hot_topic_url_list[:10] + # hot_topic_url_list = hot_topic_url_list[:3] + + # 4. 
循环获取每个主题的内容 + for hot_topic_url in hot_topic_url_list: + print(hot_topic_url) + content = zhihu_hot.get_content(hot_topic_url) + print(content) + if content['contents'] is None or len(content['contents']) == 0: + print(f'skip {hot_topic_url}, no fitch content') + continue + # 5. 将内容保存到数据库中 + video_script = VideoScript(title=content['title'], + keywords=content['keywords'], + description=content['topic_description'], + content=content['contents'], + url=content['url']) + create_video_script(video_script) + + +if __name__ == '__main__': + execute_task(spider_task) \ No newline at end of file diff --git a/task/manager_task.py b/task/manager_task.py new file mode 100644 index 0000000..e9fb398 --- /dev/null +++ b/task/manager_task.py @@ -0,0 +1,112 @@ +import importlib +import time + +from apscheduler.schedulers.blocking import BlockingScheduler + +from config import config +from database.database import get_session +from database.tscheduler.crud import get_tasks_by_executor +from log.log_manager import log + +""" +这是一个特殊的任务,负责管理任务,命名为管理者任务。 + +工作流程: +1 检索数据库任务数据表 +2 检查是否已经在任务队列中,如果不在则添加 + +任务执行时间间隔为600秒。 + +""" + +def log_task_execution(task_name: str, start_time: float, end_time: float = None): + """辅助函数,记录任务的开始和结束日志""" + start_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(start_time)) + end_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(end_time)) + if end_time is None: + log(f"{task_name} start execute at {start_time_str}") + else: + elapsed_time = end_time - start_time + log(f"{task_name} end execute at {end_time_str}, use time {elapsed_time:.2f} seconds") + + +def execute_task(task: callable): + """执行任务并记录日志""" + start_time = time.time() + log_task_execution(task.__name__, start_time) # 先记录开始时间 + task() + end_time = time.time() + log_task_execution(task.__name__, start_time, end_time) # 记录结束时间 + +# 从数据库加载任务 +def load_tasks(scheduler: BlockingScheduler): + with get_session() as db: + tasks = get_tasks_by_executor(db, 
config.scheduler_name) + + for task in tasks: + module_path = task.module_path + function_name = task.function_name + trigger = task.trigger + interval_seconds = task.interval_seconds + task_id = task.id + + # 动态导入模块和函数 + module = importlib.import_module(module_path) + task_function = getattr(module, function_name) + + job = scheduler.get_job(str(task_id)) + # 检查任务是否已存在 + if not job: + if trigger == "interval": + scheduler.add_job( + task_function, + "interval", + seconds=interval_seconds, + id=str(task_id), + replace_existing=True, + misfire_grace_time=interval_seconds + ) + log(f"Task {task.task_name} added with interval {interval_seconds} seconds") + elif trigger == "cron": + # 解析 cron 表达式的字段 + fields = task.cron_expression.split() + # 确保字段长度符合七字段格式 + if len(fields) != 7: + raise ValueError("无效的 Quartz cron 表达式") + # 替换 Quartz 风格的 `?` 为 APScheduler 可接受的 `*` + if fields[5] == '?': + fields[5] = '*' # 替换 `day_of_week` 字段中的 `?` + # 使用 cron 表达式的字段添加任务 + scheduler.add_job( + task_function, # 要执行的任务 + 'cron', # 使用 cron 触发器 + second=fields[0], # 秒 + minute=fields[1], # 分钟 + hour=fields[2], # 小时 + day=fields[3], # 日期 + month=fields[4], # 月份 + day_of_week=fields[5], # 星期 + year=fields[6], # 年份 + id=str(task_id), + replace_existing=True + ) + log(f"Task {task.task_name} added with cron {task.cron_expression}") + elif trigger == "date": + scheduler.add_job( + task_function, + "date", + run_date=task["run_date_and_time"], + id=str(task_id), + replace_existing=True + ) + log(f"Task {task.task_name} added with date {task.execution_date}") + else: + log(f"Task Invalid trigger type: {trigger}") + else: + log(f"Task {task.task_name} already exists......") + run_time = job.next_run_time - job.trigger.start_date + log(f"Task {task.task_name} already exists, run time is {run_time}") + +# 管理者任务 +def manager_task(scheduler: BlockingScheduler): + load_tasks(scheduler) diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git 
a/utils/__pycache__/__init__.cpython-312.pyc b/utils/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e83ec45b4e00dde7fec7d710a1b093360fae6509 GIT binary patch literal 128 zcmX@j%ge<81m8l7(n0iN5P=Rpvj9b=GgLBYGWxA#C}INgK7-W!l8>;8NzPA6jVVYi zNiB*gEy>I&j){-Y%*!l^kJl@x{Ka9Do1apelWJGQ3RKSs#Kj=SM`lJw#v*1Q3jmC; B95Mg^ literal 0 HcmV?d00001 diff --git a/utils/__pycache__/time_utils.cpython-312.pyc b/utils/__pycache__/time_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9b2d48a45355f9183e44063cd8f74a7e07e3637f GIT binary patch literal 3113 zcmd5;T}&KR6uz_nvt4A_U4A-iw+rne!*8jSkcDDuOC^?4Q)~-}#bsv#Z1&HY*;eRo z+6Oh;2h&9#mPCy_*n|cicrd2EVPay^zAzb*B~u=nC?vcQ@xkz-=g#cFf;6Zx(VNVi zbI>SP=RNPqd-SGh5ZbWD$cHB8&!AN(>S(*M)Tv{h*#e zjQ&G3XqX^q!v;3$r3u*-j*X78T#(3n!2}{8Jv>`ajM$@qWy%O?OaTpH(;}p53MGt`QE## z17oTgfAoUBn07o_abH5kTHSZ)T%JuC56gT1szW8aUNo*?ZN>w4%8{Ln_Ux3uVP`Jf zVT^9xHkaS^`Mp!5bm2u6DcXtuZ&c)^y*>G#R-{;|NFQTY=oO|!p_dp($|N|o&V^hW zw@Z_-yXNwDTFRU>hst+#8poKmilz>`op9s!jU-~6I}y~g?^3d>D!8}y+2CpDU|*$s zeUbkxqq4nTP9TBOQaFh?4}pYZ+C1dZxLwMWC(-BVOPy910wsx*egNT{?Kb)VxZ4C> zG1>7J7YnjH&+^_tl<@{*(OWDhz{D$zM_4Z}aO2TY?|4**frtqRtRjw$gxH|an7}vk zTKeOSPrq8b{|SeC1RoG{WIUP>SYFjGJXp_u3tCVZ$6^VNPvFabeR_87>qi^-RtANE z2~4PR5CV4NcNp7*pVrF;Y$Kagm$HdxAwx!S0XXae2TkC!xAxsP3SIvqt?#S#~WN7zwjpC{#Sa@vKrTzn8Yl7OiZuJ{r3%TW|#@V^LPt zCxoD^huX?UJ{}$yWD=_wHY@}{0+o-;W{!=A13^~SaV%bafD4Yv#!)Voh|3m$#Ic2J zy&ecB*e;HXaU3p%Y-BM{Thw5~0F-UP1jn&aVHjI*@EJ#PaV)3`gRCmWol%n*hBXMX z4P${71eS}+#z;U2j`8?kz?&c2mHqyqV2ojh;w)f>62f?xA5yFowlO{-+v4gDQ$yfi z5;HtJZrnZev(ue(o)n!YbIuykSudqk=yUA);tAe(#1Ky6+kkhOH$B9h)! zyED_B?VakKX_s703xw!8E!kg98w(xymg#Ph>IMfFH3wbbz>(<}$zz4Ul4bh5NSz1Z z>l*OfLBJoJi~T>|{QlXF5+6$ulLXNsDQkPdtT$zFF@YL{3pHxcE zR*I$dQc1(Avn(@_y*qVxhL_6gX3vS`jgqrz#p&8KBX=*cg$!;hq|lmAq2a$vp*o+! 
# (suffix, timedelta keyword) pairs for "N <unit> ago" strings.
_RELATIVE_UNITS = (("分钟前", "minutes"), ("小时前", "hours"), ("天前", "days"))

# (word, days back) pairs for "yesterday" / "the day before yesterday".
_DAY_WORDS = (("昨天", 1), ("前天", 2))

# YYYY年M月D日, optionally followed by HH:mm.
_CJK_DATE = re.compile(r"(\d{4})年(\d{1,2})月(\d{1,2})日(?:\s*(\d{1,2}):(\d{2}))?")

# HH:mm clock time following a day word.
_CLOCK = re.compile(r"(\d{1,2}):(\d{2})")

_ABSOLUTE_FORMATS = (
    "%Y/%m/%d %H:%M:%S",
    "%Y/%m/%d %H:%M",
    "%Y/%m/%d",
    "%Y-%m-%d %H:%M:%S",
    "%Y-%m-%d %H:%M",
    "%Y-%m-%d",
)


def _parse_time(text, now):
    """Return the datetime described by ``text``, or ``None`` if unrecognised."""
    # "5分钟前", "2小时前", "3天前": offset from now.  The number is located
    # with a regex so that leading text does not break the conversion.
    for suffix, unit in _RELATIVE_UNITS:
        if suffix not in text:
            continue
        amount = re.search(r"(\d+)\s*" + suffix, text)
        if amount is not None:
            return now - datetime.timedelta(**{unit: int(amount.group(1))})

    # "昨天" / "前天", optionally followed by "HH:mm".
    for word, days_back in _DAY_WORDS:
        if word not in text:
            continue
        moment = now - datetime.timedelta(days=days_back)
        clock = _CLOCK.search(text.split(word)[-1])
        if clock is None:
            return moment
        return moment.replace(
            hour=int(clock.group(1)),
            minute=int(clock.group(2)),
            second=0,
            microsecond=0,
        )

    # "2024年1月2日 10:30"; the time part is optional and defaults to 00:00.
    found = _CJK_DATE.search(text)
    if found is not None:
        year, month, day, hour, minute = (int(part) for part in found.groups(default="0"))
        return datetime.datetime(year, month, day, hour, minute)

    for fmt in _ABSOLUTE_FORMATS:
        try:
            return datetime.datetime.strptime(text, fmt)
        except ValueError:
            continue
    return None


def process_time(time_str):
    """Convert a scraped, human-readable time string into a ``datetime``.

    Recognised inputs:
        * ``N分钟前``, ``N小时前``, ``N天前`` - offsets from the current time;
        * ``昨天`` / ``前天``, optionally followed by ``HH:mm``;
        * ``YYYY年M月D日``, optionally followed by ``HH:mm``;
        * ``YYYY/MM/DD`` and ``YYYY-MM-DD``, optionally followed by
          ``HH:MM`` or ``HH:MM:SS``.

    Args:
        time_str: The text to interpret.

    Returns:
        The parsed datetime.  Relative inputs are computed from the current
        UTC time and are timezone-aware; absolute inputs are returned naive,
        exactly as written.  If nothing matches, an error is logged and the
        current UTC time is returned.

    NOTE(review): ``HH:mm`` after 昨天/前天 is applied to the UTC clock; if the
    source site reports local time the result is shifted - confirm the
    intended timezone.
    """
    current_time = datetime.datetime.now(datetime.timezone.utc)
    text = time_str.strip() if isinstance(time_str, str) else ""

    try:
        occurrence_time = _parse_time(text, current_time)
    except (ValueError, OverflowError):
        # Out-of-range components (e.g. hour 25) are treated as unparseable.
        occurrence_time = None

    if occurrence_time is None:
        logger.error(f"Unable to parse date: {time_str}")
        occurrence_time = current_time

    return occurrence_time


def get_md5(url) -> str:
    """Return the hexadecimal MD5 digest of ``url`` encoded as UTF-8."""
    import hashlib

    return hashlib.md5(url.encode('utf-8')).hexdigest()