import peter

This commit is contained in:
konjacpotato
2025-11-12 20:42:16 +08:00
commit 8c1a740f0b
147 changed files with 2763 additions and 0 deletions

0
seek/163_com/__init__.py Normal file
View File

Binary file not shown.

Binary file not shown.

Binary file not shown.

45
seek/163_com/content.py Normal file
View File

@ -0,0 +1,45 @@
import datetime
from DrissionPage.errors import ElementNotFoundError
from database.tinformationsource.model import TInformationSource
from database.tnews.model import TNews
from log.log_manager import logger
from seek.content_base import ContentBase
class ArticleContent(ContentBase):
    """Content extractor that reads the article body from ``.post_body``."""

    def __init__(self, news: TNews):
        super().__init__(news)

    def get_content(self):
        """Return the text of ``.post_body`` or a placeholder when it is absent."""
        try:
            return self.session.s_ele('.post_body').text
        except ElementNotFoundError:
            return 'not found element'
def get_content(information_source: TNews) -> str:
    """Fetch the article text for one news record without saving it.

    Args:
        information_source: The news record to load. The legacy parameter
            name is kept for callers; the value is handed to
            ``ArticleContent``, which expects a ``TNews``.

    Returns:
        Whatever ``ArticleContent.get_content`` returns.
    """
    article_content = ArticleContent(information_source)
    try:
        return article_content.get_content()
    finally:
        # Release the page even when extraction raises.
        article_content.finish()
def content_task(news: TNews):
    """Fetch the content of ``news`` and store it, logging start and end.

    The scraper is released in a ``finally`` so a failing fetch or database
    write does not leave the page/session open.
    """
    logger.info(f'{news.title} news_task start execute at {datetime.datetime.now()}')
    article_content = ArticleContent(news)
    try:
        article_content.do_seek_task()
    finally:
        article_content.finish()
    logger.info(f'{news.title} news_task end execute at {datetime.datetime.now()}')
if __name__ == '__main__':
    # Manual smoke run: load one known article and log its extracted text.
    logger.info('This module is not for direct call!')
    sample_news = TNews()
    sample_news.is_static = True
    sample_news.url = 'https://www.163.com/dy/article/JKC1V4E70519DDQ2.html'
    logger.info(get_content(sample_news))
    logger.info('Done.')

59
seek/163_com/house.py Normal file
View File

@ -0,0 +1,59 @@
import datetime
from DrissionPage.errors import ElementNotFoundError
from database.tinformationsource.model import TInformationSource
from database.tnews.model import TNews
from log.log_manager import logger
from seek.seek_base import SeekBase
class House(SeekBase):
    """Listing scraper that collects title and link from the ``.news-first`` block."""

    def __init__(self, information_source: TInformationSource):
        super().__init__(information_source)

    def get_news(self):
        """Return a list of ``TNews`` built from the rows of the listing.

        Rows that fail to parse are logged and left out. Summary and
        occurrence date are not extracted here.
        """
        collected = []
        container = self.session.s_ele('.news-first')
        for row in container.s_eles('.data_row news_article clearfix2 '):
            try:
                item = TNews()
                item.title = row.s_ele('.news_title').s_ele('tag:a').text
                item.url = row.s_ele('tag:a').link
                item.source = self.information_source.title
            except ElementNotFoundError as e:
                logger.error(f"ElementNotFoundError: {e} - Failed to find element in news item.")
            except Exception as e:
                logger.error(f'Unexpected error occurred: {e}')
            else:
                collected.append(item)
        return collected
def get_news(information_source: TInformationSource) -> list:
    """Scrape the listing for ``information_source`` without saving it.

    Returns:
        The list produced by ``House.get_news``. The scraper is released
        even if scraping raises.
    """
    instance = House(information_source)
    try:
        return instance.get_news()
    finally:
        instance.finish()
def news_task(information_source: TInformationSource):
    """Run one scrape-and-store cycle for ``information_source``, logging start and end."""
    logger.info(f'{information_source.title} news_task start execute at {datetime.datetime.now()}')
    instance = House(information_source)
    try:
        instance.do_seek_task()
    finally:
        # Release the page/session even if scraping or the DB write fails.
        instance.finish()
    logger.info(f'{information_source.title} news_task end execute at {datetime.datetime.now()}')
if __name__ == '__main__':
    # Manual run: scrape the listing once and store the result.
    logger.info('This module is not for direct call!')
    source = TInformationSource()
    source.is_static = True
    source.url = 'https://sz.house.163.com/'
    source.title = '房产_网易'
    news_task(source)
    # To inspect items without storing them, iterate over get_news(source) instead.
    logger.info('Done.')

0
seek/__init__.py Normal file
View File

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,46 @@
import datetime
from DrissionPage.errors import ElementNotFoundError
from database.tinformationsource.model import TInformationSource
from database.tnews.model import TNews
from log.log_manager import logger
from seek.content_base import ContentBase
class ArticleContent(ContentBase):
    """Content extractor that reads the article body via the ``.^info-content`` locator."""

    def __init__(self, news: TNews):
        super().__init__(news)

    def get_content(self):
        """Return the located element's text or a placeholder when it is absent."""
        try:
            return self.session.s_ele('.^info-content').text
        except ElementNotFoundError:
            return 'not found element'
def get_content(information_source: TNews) -> str:
    """Fetch the article text for one news record without saving it.

    Args:
        information_source: The news record to load. The legacy parameter
            name is kept for callers; the value is handed to
            ``ArticleContent``, which expects a ``TNews``.

    Returns:
        Whatever ``ArticleContent.get_content`` returns.
    """
    article_content = ArticleContent(information_source)
    try:
        return article_content.get_content()
    finally:
        # Release the page even when extraction raises.
        article_content.finish()
def content_task(news: TNews):
    """Fetch the content of ``news`` and store it, logging start and end.

    The scraper is released in a ``finally`` so a failing fetch or database
    write does not leave the page/session open.
    """
    logger.info(f'{news.title} news_task start execute at {datetime.datetime.now()}')
    article_content = ArticleContent(news)
    try:
        article_content.do_seek_task()
    finally:
        article_content.finish()
    logger.info(f'{news.title} news_task end execute at {datetime.datetime.now()}')
if __name__ == '__main__':
    # Manual smoke run: load one known article and log its extracted text.
    logger.info('This module is not for direct call!')
    sample_news = TNews()
    sample_news.is_static = True
    sample_news.url = 'https://sz.news.anjuke.com/louping-965203-pan528488.html'
    logger.info(get_content(sample_news))
    logger.info('Done.')

62
seek/anjuke_com/house.py Normal file
View File

@ -0,0 +1,62 @@
import datetime
from DrissionPage.errors import ElementNotFoundError
from database.tinformationsource.model import TInformationSource
from database.tnews.model import TNews
from log.log_manager import logger
from seek.seek_base import SeekBase
from utils.time_utils import process_time
class House(SeekBase):
    """Listing scraper that reads items from the ``.main-list`` container."""

    def __init__(self, information_source: TInformationSource):
        super().__init__(information_source)

    def get_news(self):
        """Return a list of ``TNews`` built from the listing items.

        Each item contributes title, link, summary and occurrence date.
        Items that fail to parse are logged and left out. The former debug
        ``print`` of the whole page HTML has been removed so a scheduled run
        no longer floods stdout.
        """
        news_result = []
        _news_list = self.session.s_ele('.main-list').s_eles('.m-list-item clearfix')
        for _news in _news_list:
            try:
                rs_news = TNews()
                tmp_ = _news.s_ele('.item-col-right')
                rs_news.title = tmp_.s_ele('tag:h3').text
                rs_news.url = tmp_.s_ele('tag:a').link
                # The second anchor in the right column carries the summary text.
                rs_news.summary = tmp_.s_eles('tag:a')[1].text
                rs_news.occurrence_date = process_time(tmp_.s_ele('.info__time').text)
                rs_news.source = self.information_source.title
                news_result.append(rs_news)
            except ElementNotFoundError as e:
                logger.error(f"ElementNotFoundError: {e} - Failed to find element in news item.")
            except Exception as e:
                logger.error(f'Unexpected error occurred: {e}')
        return news_result
def get_news(information_source: TInformationSource) -> list:
    """Scrape the listing for ``information_source`` without saving it.

    Returns:
        The list produced by ``House.get_news``. The scraper is released
        even if scraping raises.
    """
    instance = House(information_source)
    try:
        return instance.get_news()
    finally:
        instance.finish()
def news_task(information_source: TInformationSource):
    """Run one scrape-and-store cycle for ``information_source``, logging start and end."""
    logger.info(f'{information_source.title} news_task start execute at {datetime.datetime.now()}')
    instance = House(information_source)
    try:
        instance.do_seek_task()
    finally:
        # Release the page/session even if scraping or the DB write fails.
        instance.finish()
    logger.info(f'{information_source.title} news_task end execute at {datetime.datetime.now()}')
if __name__ == '__main__':
    # Manual run: scrape the listing once and store the result.
    logger.info('This module is not for direct call!')
    source = TInformationSource()
    source.is_static = True
    source.url = 'https://sz.news.anjuke.com/hot/'
    source.title = '房产_安居客'
    news_task(source)
    # To inspect items without storing them, iterate over get_news(source) instead.
    logger.info('Done.')

0
seek/cnn_com/__init__.py Normal file
View File

58
seek/cnn_com/content.py Normal file
View File

@ -0,0 +1,58 @@
import datetime
from DrissionPage.errors import ElementNotFoundError
from database.tinformationsource.model import TInformationSource
from database.tnews.model import TNews
from log.log_manager import logger
from seek.content_base import ContentBase
class ArticleContent(ContentBase):
    """Content extractor reading the body from ``#detailContent`` and the
    publication time from the ``.header-time left`` block."""

    def __init__(self, news: TNews):
        super().__init__(news)

    def get_content(self):
        """Return the text of ``#detailContent`` or a placeholder when it is absent."""
        try:
            return self.session.s_ele('#detailContent').text
        except ElementNotFoundError:
            return 'not found element'

    def get_occurrence_date(self):
        """Return the publication time as ``'<year>/<day> <time>'``.

        Returns ``None`` when the header block or one of its parts cannot be
        found. The leftover debug ``print`` was removed; callers get the
        value through the return.
        """
        try:
            header_time = self.session.s_ele('.header-time left')
            year = header_time.s_ele('.year').text  # e.g. 2023
            day = header_time.s_ele('.day').text  # e.g. 12/27
            clock = header_time.s_ele('.time').text  # e.g. 08:05:11
        except ElementNotFoundError:
            return None
        return f'{year}/{day} {clock}'
def get_content(information_source: TNews) -> str:
    """Fetch the article text for one news record without saving it.

    Args:
        information_source: The news record to load. The legacy parameter
            name is kept for callers; the value is handed to
            ``ArticleContent``, which expects a ``TNews``.

    Returns:
        Whatever ``ArticleContent.get_content`` returns.
    """
    article_content = ArticleContent(information_source)
    try:
        result = article_content.get_content()
        # The date is not returned; the call is kept so a manual run still
        # exercises the date-extraction path.
        article_content.get_occurrence_date()
        return result
    finally:
        # Release the page even when extraction raises.
        article_content.finish()
def content_task(news: TNews):
    """Fetch the content of ``news`` and store it, logging start and end.

    The scraper is released in a ``finally`` so a failing fetch or database
    write does not leave the page/session open.
    """
    logger.info(f'{news.title} news_task start execute at {datetime.datetime.now()}')
    article_content = ArticleContent(news)
    try:
        article_content.do_seek_task()
    finally:
        article_content.finish()
    logger.info(f'{news.title} news_task end execute at {datetime.datetime.now()}')
if __name__ == '__main__':
    # Manual smoke run: load one known article and log its extracted text.
    logger.info('This module is not for direct call!')
    sample_news = TNews()
    sample_news.is_static = True
    sample_news.url = 'https://www.news.cn/politics/leaders/20241227/90e76f85ad4a43ba94802b07c5736e00/c.html'
    logger.info(get_content(sample_news))
    logger.info('Done.')

62
seek/cnn_com/edition.py Normal file
View File

@ -0,0 +1,62 @@
import datetime
from DrissionPage.errors import ElementNotFoundError
from database.tinformationsource.model import TInformationSource
from database.tnews.model import TNews
from log.log_manager import logger
from seek.seek_base import SeekBase
class Edition(SeekBase):
    """Listing scraper that collects every anchor inside one page zone.

    Reads from ``self.tab``, so the information source must be configured as
    non-static.
    """

    def __init__(self, information_source: TInformationSource):
        super().__init__(information_source)

    def get_news(self):
        """Return a list of ``TNews`` with title and link for each anchor.

        Anchors that fail to parse are logged and left out. The per-anchor
        debug ``print`` of raw HTML has been removed.
        """
        news_result = []
        _news_list = self.tab.s_ele('.zone zone--t-light zone-2-observer').s_eles('tag:a')
        for _news in _news_list:
            try:
                rs_news = TNews()
                rs_news.title = _news.text
                rs_news.url = _news.link
                rs_news.source = self.information_source.title
                news_result.append(rs_news)
            except ElementNotFoundError as e:
                logger.error(f"ElementNotFoundError: {e} - Failed to find element in news item.")
            except Exception as e:
                logger.error(f'Unexpected error occurred: {e}')
        return news_result
def get_news(information_source: TInformationSource) -> list:
    """Scrape the listing for ``information_source`` without saving it.

    Returns:
        The list produced by ``Edition.get_news``. The scraper is released
        even if scraping raises.
    """
    instance = Edition(information_source)
    try:
        return instance.get_news()
    finally:
        instance.finish()
def news_task(information_source: TInformationSource):
    """Run one scrape-and-store cycle for ``information_source``, logging start and end."""
    logger.info(f'{information_source.title} news_task start execute at {datetime.datetime.now()}')
    instance = Edition(information_source)
    try:
        instance.do_seek_task()
    finally:
        # Release the tab even if scraping or the DB write fails.
        instance.finish()
    logger.info(f'{information_source.title} news_task end execute at {datetime.datetime.now()}')
if __name__ == '__main__':
    # Manual run: scrape the front page in a browser tab and print each item
    # without storing anything (news_task(source) would store them).
    logger.info('This module is not for direct call!')
    source = TInformationSource()
    source.is_static = False
    source.url = 'https://edition.cnn.com/'
    source.title = 'edition_CNN'
    for entry in get_news(source):
        print(entry)
    logger.info('Done.')

50
seek/content_base.py Normal file
View File

@ -0,0 +1,50 @@
from abc import ABC, abstractmethod
from DrissionPage import Chromium, SessionPage, ChromiumOptions
from database.database import get_session
from database.tnews.crud import update_news_by_id
from database.tnews.model import TNews
from log.log_manager import log
class ContentBase(ABC):
    """Base class for per-site scrapers that fetch the body of one news item.

    The constructor opens ``news.url`` either in a ``SessionPage`` (when
    ``news.is_static`` is truthy) or in a new browser tab. Subclasses
    implement :meth:`get_content` and may override
    :meth:`get_occurrence_date`.
    """

    def __init__(self, news: TNews):
        self.news = news
        # Every handle starts as None so finish() can test each one safely
        # whichever branch below runs. Previously ``tab`` was only set in the
        # browser branch and finish() raised AttributeError for static pages.
        self.session = None
        self.browser = None
        self.tab = None
        if news.is_static:
            self.session = SessionPage()
            self.session.get(news.url)
        else:
            co = ChromiumOptions()
            self.browser = Chromium(addr_or_opts=co)
            self.tab = self.browser.new_tab()
            self.tab.get(news.url)

    @abstractmethod
    def get_content(self):
        """Return the article body of ``self.news``; implemented per site."""
        pass

    def get_occurrence_date(self):
        """Return the publication time found on the page; ``None`` by default."""
        return None

    def do_seek_task(self):
        """Fetch the content of ``self.news`` and update the record in the database.

        The occurrence date is only filled in when the record has none yet.
        """
        self.news.content = self.get_content()
        if self.news.occurrence_date is None:
            self.news.occurrence_date = self.get_occurrence_date()
        with get_session() as db:
            update_news_by_id(db, self.news)
            log(f'successful fetch {self.news.title} news content into the database.')

    def finish(self):
        """Close the tab and/or session opened by the constructor.

        The browser itself is intentionally left running; only the tab is closed.
        """
        if self.tab:
            self.tab.close()
        if self.session:
            self.session.close()

View File

Binary file not shown.

Binary file not shown.

Binary file not shown.

46
seek/fang_com/content.py Normal file
View File

@ -0,0 +1,46 @@
import datetime
from DrissionPage.errors import ElementNotFoundError
from database.tinformationsource.model import TInformationSource
from database.tnews.model import TNews
from log.log_manager import logger
from seek.content_base import ContentBase
class ArticleContent(ContentBase):
    """Content extractor that reads the article body via the ``.^news-text`` locator."""

    def __init__(self, news: TNews):
        super().__init__(news)

    def get_content(self):
        """Return the located element's text or a placeholder when it is absent."""
        try:
            return self.session.s_ele('.^news-text').text
        except ElementNotFoundError:
            return 'not found element'
def get_content(information_source: TNews) -> str:
    """Fetch the article text for one news record without saving it.

    Args:
        information_source: The news record to load. The legacy parameter
            name is kept for callers; the value is handed to
            ``ArticleContent``, which expects a ``TNews``.

    Returns:
        Whatever ``ArticleContent.get_content`` returns.
    """
    article_content = ArticleContent(information_source)
    try:
        return article_content.get_content()
    finally:
        # Release the page even when extraction raises.
        article_content.finish()
def content_task(news: TNews):
    """Fetch the content of ``news`` and store it, logging start and end.

    The scraper is released in a ``finally`` so a failing fetch or database
    write does not leave the page/session open.
    """
    logger.info(f'{news.title} news_task start execute at {datetime.datetime.now()}')
    article_content = ArticleContent(news)
    try:
        article_content.do_seek_task()
    finally:
        article_content.finish()
    logger.info(f'{news.title} news_task end execute at {datetime.datetime.now()}')
if __name__ == '__main__':
    # Manual smoke run: load one known article and log its extracted text.
    logger.info('This module is not for direct call!')
    sample_news = TNews()
    sample_news.is_static = True
    sample_news.url = 'https://sz.news.fang.com/open/51863596.html'
    logger.info(get_content(sample_news))
    logger.info('Done.')

64
seek/fang_com/house.py Normal file
View File

@ -0,0 +1,64 @@
import datetime
from DrissionPage.errors import ElementNotFoundError
from database.tinformationsource.model import TInformationSource
from database.tnews.model import TNews
from log.log_manager import logger
from seek.seek_base import SeekBase
from utils.time_utils import process_time
class House(SeekBase):
    """Listing scraper that reads ``li`` entries from the ``.news-list`` container."""

    def __init__(self, information_source: TInformationSource):
        super().__init__(information_source)

    def get_news(self):
        """Return a list of ``TNews`` built from the listing entries.

        Entries that fail to parse are left out. A missing element is logged
        unless the entry contains an ``.item`` element, which marks video
        content that is skipped silently.
        """
        collected = []
        entries = self.session.s_ele('.news-list').s_eles('tag:li')
        for _news in entries:
            try:
                item = TNews()
                text_box = _news.s_ele('.txt')
                item.title = text_box.s_ele('tag:a').text
                item.url = text_box.s_ele('tag:a').link
                item.summary = text_box.s_ele('tag:p').text
                item.occurrence_date = process_time(text_box.s_eles('tag:span')[1].text)
                item.source = self.information_source.title
            except ElementNotFoundError as e:
                # Video entries are skipped without logging.
                if not _news.s_ele('.item'):
                    logger.error(f"ElementNotFoundError: {e} - Failed to find element in news item.")
            except Exception as e:
                logger.error(f'Unexpected error occurred: {e}')
            else:
                collected.append(item)
        return collected
def get_news(information_source: TInformationSource) -> list:
    """Scrape the listing for ``information_source`` without saving it.

    Returns:
        The list produced by ``House.get_news``. The scraper is released
        even if scraping raises.
    """
    instance = House(information_source)
    try:
        return instance.get_news()
    finally:
        instance.finish()
def news_task(information_source: TInformationSource):
    """Run one scrape-and-store cycle for ``information_source``, logging start and end."""
    logger.info(f'{information_source.title} news_task start execute at {datetime.datetime.now()}')
    instance = House(information_source)
    try:
        instance.do_seek_task()
    finally:
        # Release the page/session even if scraping or the DB write fails.
        instance.finish()
    logger.info(f'{information_source.title} news_task end execute at {datetime.datetime.now()}')
if __name__ == '__main__':
    # Manual run: scrape the listing once and store the result.
    logger.info('This module is not for direct call!')
    source = TInformationSource()
    source.is_static = True
    source.url = 'https://sz.news.fang.com/'
    source.title = '房产_房天下'
    news_task(source)
    # To inspect items without storing them, iterate over get_news(source) instead.
    logger.info('Done.')

View File

Binary file not shown.

Binary file not shown.

Binary file not shown.

46
seek/focus_cn/content.py Normal file
View File

@ -0,0 +1,46 @@
import datetime
from DrissionPage.errors import ElementNotFoundError
from database.tinformationsource.model import TInformationSource
from database.tnews.model import TNews
from log.log_manager import logger
from seek.content_base import ContentBase
class ArticleContent(ContentBase):
    """Content extractor that reads the article body from ``.article``.

    The constructor forces ``news.is_static`` to ``True`` before delegating,
    so the page is always loaded through the session rather than a tab.
    """

    def __init__(self, news: TNews):
        news.is_static = True
        super().__init__(news)

    def get_content(self):
        """Return the text of ``.article`` or a placeholder when it is absent."""
        try:
            return self.session.s_ele('.article').text
        except ElementNotFoundError:
            return 'not found element'
def get_content(information_source: TNews) -> str:
    """Fetch the article text for one news record without saving it.

    Args:
        information_source: The news record to load. The legacy parameter
            name is kept for callers; the value is handed to
            ``ArticleContent``, which expects a ``TNews``.

    Returns:
        Whatever ``ArticleContent.get_content`` returns.
    """
    article_content = ArticleContent(information_source)
    try:
        return article_content.get_content()
    finally:
        # Release the page even when extraction raises.
        article_content.finish()
def content_task(news: TNews):
    """Fetch the content of ``news`` and store it, logging start and end.

    The scraper is released in a ``finally`` so a failing fetch or database
    write does not leave the page/session open.
    """
    logger.info(f'{news.title} news_task start execute at {datetime.datetime.now()}')
    article_content = ArticleContent(news)
    try:
        article_content.do_seek_task()
    finally:
        article_content.finish()
    logger.info(f'{news.title} news_task end execute at {datetime.datetime.now()}')
if __name__ == '__main__':
    # Manual smoke run: load one known article and log its extracted text.
    logger.info('This module is not for direct call!')
    sample_news = TNews()
    sample_news.is_static = True
    sample_news.url = 'https://www.focus.cn/a/842171870_124752'
    logger.info(get_content(sample_news))
    logger.info('Done.')

62
seek/focus_cn/house.py Normal file
View File

@ -0,0 +1,62 @@
import datetime
from DrissionPage.errors import ElementNotFoundError
from database.tinformationsource.model import TInformationSource
from database.tnews.model import TNews
from log.log_manager import logger
from seek.seek_base import SeekBase
from utils.time_utils import process_time
class House(SeekBase):
    """Listing scraper that reads ``.FeedList`` entries from a browser tab.

    Reads from ``self.tab``, so the information source must be configured as
    non-static.
    """

    def __init__(self, information_source: TInformationSource):
        super().__init__(information_source)

    def get_news(self):
        """Return a list of ``TNews`` built from the feed entries.

        Waits for ``.FeedList`` to be displayed before reading. Entries that
        fail to parse are logged and left out.
        """
        collected = []
        self.tab.wait.ele_displayed('.FeedList')
        feed = self.tab.s_ele('.cbd-recommend')
        for entry in feed.s_eles('.FeedList'):
            try:
                item = TNews()
                item.title = entry.s_ele('.item-text-content-title').text
                # Drop the query string so the stored URL has no parameters.
                item.url = entry.s_ele('tag:a').link.split('?')[0]
                item.summary = entry.s_ele('.item-text-content-description').text
                item.occurrence_date = process_time(entry.s_ele('.extra-info-item').text)
                item.source = self.information_source.title
            except ElementNotFoundError as e:
                logger.error(f"ElementNotFoundError: {e} - Failed to find element in news item.")
            except Exception as e:
                logger.error(f'Unexpected error occurred: {e}')
            else:
                collected.append(item)
        return collected
def get_news(information_source: TInformationSource) -> list:
    """Scrape the listing for ``information_source`` without saving it.

    Returns:
        The list produced by ``House.get_news``. The scraper is released
        even if scraping raises.
    """
    instance = House(information_source)
    try:
        return instance.get_news()
    finally:
        instance.finish()
def news_task(information_source: TInformationSource):
    """Run one scrape-and-store cycle for ``information_source``, logging start and end."""
    logger.info(f'{information_source.title} news_task start execute at {datetime.datetime.now()}')
    instance = House(information_source)
    try:
        instance.do_seek_task()
    finally:
        # Release the tab even if scraping or the DB write fails.
        instance.finish()
    logger.info(f'{information_source.title} news_task end execute at {datetime.datetime.now()}')
if __name__ == '__main__':
    # Manual run: scrape the listing in a browser tab and store the result.
    logger.info('This module is not for direct call!')
    source = TInformationSource()
    source.is_static = False
    source.url = 'https://sz.focus.cn/zixun/'
    source.title = '房产_搜狐焦点'
    news_task(source)
    # To inspect items without storing them, iterate over get_news(source) instead.
    logger.info('Done.')

View File

Binary file not shown.

Binary file not shown.

Binary file not shown.

45
seek/leju_com/content.py Normal file
View File

@ -0,0 +1,45 @@
import datetime
from DrissionPage.errors import ElementNotFoundError
from database.tinformationsource.model import TInformationSource
from database.tnews.model import TNews
from log.log_manager import logger
from seek.content_base import ContentBase
class ArticleContent(ContentBase):
    """Content extractor that reads the article body via the ``.^sf_news_contend`` locator."""

    def __init__(self, news: TNews):
        super().__init__(news)

    def get_content(self):
        """Return the located element's text or a placeholder when it is absent."""
        try:
            return self.session.s_ele('.^sf_news_contend').text
        except ElementNotFoundError:
            return 'not found element'
def get_content(information_source: TNews) -> str:
    """Fetch the article text for one news record without saving it.

    Args:
        information_source: The news record to load. The legacy parameter
            name is kept for callers; the value is handed to
            ``ArticleContent``, which expects a ``TNews``.

    Returns:
        Whatever ``ArticleContent.get_content`` returns.
    """
    article_content = ArticleContent(information_source)
    try:
        return article_content.get_content()
    finally:
        # Release the page even when extraction raises.
        article_content.finish()
def content_task(news: TNews):
    """Fetch the content of ``news`` and store it, logging start and end.

    The scraper is released in a ``finally`` so a failing fetch or database
    write does not leave the page/session open.
    """
    logger.info(f'{news.title} news_task start execute at {datetime.datetime.now()}')
    article_content = ArticleContent(news)
    try:
        article_content.do_seek_task()
    finally:
        article_content.finish()
    logger.info(f'{news.title} news_task end execute at {datetime.datetime.now()}')
if __name__ == '__main__':
    # Manual smoke run: load one known article and log its extracted text.
    logger.info('This module is not for direct call!')
    sample_news = TNews()
    sample_news.is_static = True
    sample_news.url = 'https://sz.leju.com/news/2024-12-18/18427272536617796292963.shtml'
    logger.info(get_content(sample_news))
    logger.info('Done.')

60
seek/leju_com/house.py Normal file
View File

@ -0,0 +1,60 @@
import datetime
from DrissionPage.errors import ElementNotFoundError
from database.tinformationsource.model import TInformationSource
from database.tnews.model import TNews
from log.log_manager import logger
from seek.seek_base import SeekBase
from utils.time_utils import process_time
class House(SeekBase):
    """Listing scraper that reads ``li`` entries from the ``.sf_listPage`` container."""

    def __init__(self, information_source: TInformationSource):
        super().__init__(information_source)

    def get_news(self):
        """Return a list of ``TNews`` built from the listing entries.

        Each entry contributes title, link, summary and occurrence date.
        Entries that fail to parse are logged and left out.
        """
        collected = []
        listing = self.session.s_ele('.sf_listPage')
        for entry in listing.s_eles('tag:li'):
            try:
                item = TNews()
                item.title = entry.s_ele('tag:a').text
                item.url = entry.s_ele('tag:a').link
                item.summary = entry.s_ele('tag:p').text
                item.occurrence_date = process_time(entry.s_ele('.tag').text)
                item.source = self.information_source.title
            except ElementNotFoundError as e:
                logger.error(f"ElementNotFoundError: {e} - Failed to find element in news item.")
            except Exception as e:
                logger.error(f'Unexpected error occurred: {e}')
            else:
                collected.append(item)
        return collected
def get_news(information_source: TInformationSource) -> list:
    """Scrape the listing for ``information_source`` without saving it.

    Returns:
        The list produced by ``House.get_news``. The scraper is released
        even if scraping raises.
    """
    instance = House(information_source)
    try:
        return instance.get_news()
    finally:
        instance.finish()
def news_task(information_source: TInformationSource):
    """Run one scrape-and-store cycle for ``information_source``, logging start and end."""
    logger.info(f'{information_source.title} news_task start execute at {datetime.datetime.now()}')
    instance = House(information_source)
    try:
        instance.do_seek_task()
    finally:
        # Release the page/session even if scraping or the DB write fails.
        instance.finish()
    logger.info(f'{information_source.title} news_task end execute at {datetime.datetime.now()}')
if __name__ == '__main__':
    # Manual run: scrape the listing and print each item without storing
    # anything (news_task(source) would store them).
    logger.info('This module is not for direct call!')
    source = TInformationSource()
    source.is_static = True
    source.url = 'https://sz.leju.com/news/'
    source.title = '房产_新浪乐居'
    for entry in get_news(source):
        print(entry)
    logger.info('Done.')

View File

Binary file not shown.

Binary file not shown.

Binary file not shown.

45
seek/mittr_com/content.py Normal file
View File

@ -0,0 +1,45 @@
import datetime
from DrissionPage.errors import ElementNotFoundError
from database.tinformationsource.model import TInformationSource
from database.tnews.model import TNews
from log.log_manager import logger
from seek.content_base import ContentBase
class ArticleContent(ContentBase):
    """Content extractor that reads the article body from ``.content`` in a browser tab.

    Reads from ``self.tab``, so the news record must be non-static.
    """

    def __init__(self, news: TNews):
        super().__init__(news)

    def get_content(self):
        """Return the text of ``.content`` or a placeholder when it is absent."""
        try:
            return self.tab.s_ele('.content').text
        except ElementNotFoundError:
            return 'not found element'
def get_content(information_source: TNews) -> str:
    """Fetch the article text for one news record without saving it.

    Args:
        information_source: The news record to load. The legacy parameter
            name is kept for callers; the value is handed to
            ``ArticleContent``, which expects a ``TNews``.

    Returns:
        Whatever ``ArticleContent.get_content`` returns.
    """
    article_content = ArticleContent(information_source)
    try:
        return article_content.get_content()
    finally:
        # Release the tab even when extraction raises.
        article_content.finish()
def content_task(news: TNews):
    """Fetch the content of ``news`` and store it, logging start and end.

    The scraper is released in a ``finally`` so a failing fetch or database
    write does not leave the tab open.
    """
    logger.info(f'{news.title} news_task start execute at {datetime.datetime.now()}')
    article_content = ArticleContent(news)
    try:
        article_content.do_seek_task()
    finally:
        article_content.finish()
    logger.info(f'{news.title} news_task end execute at {datetime.datetime.now()}')
if __name__ == '__main__':
    # Manual smoke run: load one known article in a browser tab and log its text.
    logger.info('This module is not for direct call!')
    sample_news = TNews()
    sample_news.is_static = False
    sample_news.url = 'https://www.mittrchina.com/news/detail/14218'
    logger.info(get_content(sample_news))
    logger.info('Done.')

63
seek/mittr_com/mit_t_r.py Normal file
View File

@ -0,0 +1,63 @@
import datetime
from DrissionPage.errors import ElementNotFoundError
from database.tinformationsource.model import TInformationSource
from database.tnews.model import TNews
from log.log_manager import logger
from seek.seek_base import SeekBase
from utils.time_utils import process_time
class MittrChinaCom(SeekBase):
    """Listing scraper that reads ``.last-item`` entries from a browser tab.

    Reads from ``self.tab``, so the information source must be configured as
    non-static.
    """

    def __init__(self, information_source: TInformationSource):
        super().__init__(information_source)

    def get_news(self):
        """Return a list of ``TNews`` built from the latest-news entries.

        Waits for ``.last-item`` to be displayed before reading. Entries that
        fail to parse are logged and left out.
        """
        collected = []
        self.tab.wait.ele_displayed('.last-item')
        latest = self.tab.s_ele('.lastest-list')
        for entry in latest.s_eles('.last-item'):
            try:
                item = TNews()
                item.title = entry.s_ele('tag:a').text
                item.url = entry.s_ele('tag:a').link
                # The timestamp is looked up from the entry's parent element.
                item.occurrence_date = process_time(entry.parent().s_ele('.time').text)
                item.source = self.information_source.title
            except ElementNotFoundError as e:
                logger.error(f"ElementNotFoundError: {e} - Failed to find element in news item.")
            except Exception as e:
                logger.error(f'Unexpected error occurred: {e}')
            else:
                collected.append(item)
        return collected
def get_news(information_source: TInformationSource) -> list:
    """Scrape the listing for ``information_source`` without saving it.

    Returns:
        The list produced by ``MittrChinaCom.get_news``. The scraper is
        released even if scraping raises.
    """
    mittr = MittrChinaCom(information_source)
    try:
        return mittr.get_news()
    finally:
        mittr.finish()
def news_task(information_source: TInformationSource):
    """Run one scrape-and-store cycle for ``information_source``, logging start and end."""
    logger.info(f'{information_source.title} news_task start execute at {datetime.datetime.now()}')
    mittr = MittrChinaCom(information_source)
    try:
        mittr.do_seek_task()
    finally:
        # Release the tab even if scraping or the DB write fails.
        mittr.finish()
    logger.info(f'{information_source.title} news_task end execute at {datetime.datetime.now()}')
if __name__ == '__main__':
    # Manual run: scrape the listing in a browser tab and store the result.
    logger.info('This module is not for direct call!')
    source = TInformationSource()
    source.is_static = False
    source.url = 'https://www.mittrchina.com/'
    source.title = '科技_麻省理工科技评论'
    news_task(source)
    # To inspect items without storing them, iterate over get_news(source) instead.
    logger.info('Done.')

View File

Binary file not shown.

Binary file not shown.

Binary file not shown.

62
seek/ofweek_com/ai.py Normal file
View File

@ -0,0 +1,62 @@
import datetime
from DrissionPage.errors import ElementNotFoundError
from database.tinformationsource.model import TInformationSource
from database.tnews.model import TNews
from log.log_manager import logger
from seek.seek_base import SeekBase
from utils.time_utils import process_time
class OfweekComAi(SeekBase):
    """Listing scraper that reads title elements from the ``.main-cont-left w640`` column."""

    def __init__(self, information_source: TInformationSource):
        super().__init__(information_source)

    def get_news(self):
        """Return a list of ``TNews`` built from the title elements.

        Entries that fail to parse are logged and left out.
        """
        collected = []
        column = self.session.s_ele('.main-cont-left w640')
        for heading in column.s_eles('.^top-title'):
            try:
                item = TNews()
                item.title = heading.s_ele('tag:a').text
                item.url = heading.s_ele('tag:a').link
                # The timestamp is the fifth span under the heading's parent.
                item.occurrence_date = process_time(heading.parent().s_eles('tag:span')[4].text)
                item.source = self.information_source.title
            except ElementNotFoundError as e:
                logger.error(f"ElementNotFoundError: {e} - Failed to find element in news item.")
            except Exception as e:
                logger.error(f'Unexpected error occurred: {e}')
            else:
                collected.append(item)
        return collected
def get_news(information_source: TInformationSource) -> list:
    """Scrape the listing for ``information_source`` without saving it.

    Returns:
        The list produced by ``OfweekComAi.get_news``. The scraper is
        released even if scraping raises.
    """
    ofweek_com_ai = OfweekComAi(information_source)
    try:
        return ofweek_com_ai.get_news()
    finally:
        ofweek_com_ai.finish()
def news_task(information_source: TInformationSource):
    """Run one scrape-and-store cycle for ``information_source``, logging start and end."""
    logger.info(f'{information_source.title} news_task start execute at {datetime.datetime.now()}')
    ofweek_com_ai = OfweekComAi(information_source)
    try:
        ofweek_com_ai.do_seek_task()
    finally:
        # Release the page/session even if scraping or the DB write fails.
        ofweek_com_ai.finish()
    logger.info(f'{information_source.title} news_task end execute at {datetime.datetime.now()}')
if __name__ == '__main__':
    # Manual run: scrape the listing once and store the result.
    logger.info('This module is not for direct call!')
    source = TInformationSource()
    source.is_static = True
    source.url = 'https://www.ofweek.com/ai/'
    source.title = '人工智能_维科网'
    news_task(source)
    # To inspect items without storing them, iterate over get_news(source) instead.
    logger.info('Done.')

View File

@ -0,0 +1,46 @@
import datetime
from DrissionPage.errors import ElementNotFoundError
from database.tinformationsource.model import TInformationSource
from database.tnews.model import TNews
from log.log_manager import logger
from seek.content_base import ContentBase
class ArticleContent(ContentBase):
    """Content extractor that reads the article body from ``.artical-content``."""

    def __init__(self, news: TNews):
        super().__init__(news)

    def get_content(self):
        """Return the text of ``.artical-content`` or a placeholder when it is absent."""
        try:
            return self.session.s_ele('.artical-content').text
        except ElementNotFoundError:
            return 'not found element'
def get_content(information_source: TNews) -> str:
    """Fetch the article text for one news record without saving it.

    Args:
        information_source: The news record to load. The legacy parameter
            name is kept for callers; the value is handed to
            ``ArticleContent``, which expects a ``TNews``.

    Returns:
        Whatever ``ArticleContent.get_content`` returns.
    """
    article_content = ArticleContent(information_source)
    try:
        return article_content.get_content()
    finally:
        # Release the page even when extraction raises.
        article_content.finish()
def content_task(news: TNews):
    """Fetch the content of ``news`` and store it, logging start and end.

    The scraper is released in a ``finally`` so a failing fetch or database
    write does not leave the page/session open.
    """
    logger.info(f'{news.title} news_task start execute at {datetime.datetime.now()}')
    article_content = ArticleContent(news)
    try:
        article_content.do_seek_task()
    finally:
        article_content.finish()
    logger.info(f'{news.title} news_task end execute at {datetime.datetime.now()}')
if __name__ == '__main__':
    # Manual smoke run: load one known article and log its extracted text.
    logger.info('This module is not for direct call!')
    sample_news = TNews()
    sample_news.is_static = True
    sample_news.url = 'https://www.ofweek.com/ai/2024-12/ART-201721-8120-30654143.html'
    logger.info(get_content(sample_news))
    logger.info('Done.')

57
seek/seek_base.py Normal file
View File

@ -0,0 +1,57 @@
from abc import ABC, abstractmethod
from DrissionPage import Chromium, SessionPage, ChromiumOptions
from database.database import get_session
from database.tinformationsource.model import TInformationSource
from database.tnews.crud import create_news_list_if_url_not_exists
from log.log_manager import log
class SeekBase(ABC):
    """Base class for per-site scrapers that collect a list of news items.

    The constructor opens ``information_source.url`` either in a
    ``SessionPage`` (when ``information_source.is_static`` is truthy) or in a
    new browser tab. Subclasses implement :meth:`get_news`.
    """

    # Fields a news item inherits from its information source when unset.
    _INHERITED_FIELDS = (
        'primary_category',
        'secondary_category',
        'tertiary_category',
        'label',
        'lang',
    )

    def __init__(self, information_source: TInformationSource):
        self.information_source = information_source
        # Every handle starts as None so finish() can test each one safely.
        self.session = None
        self.browser = None
        self.tab = None
        if information_source.is_static:
            self.session = SessionPage()
            self.session.get(information_source.url)
        else:
            # The browser is created with its defaults; the ChromiumOptions
            # object that used to be built here was never passed on.
            self.browser = Chromium()
            self.tab = self.browser.new_tab()
            self.tab.get(information_source.url)

    @abstractmethod
    def get_news(self):
        """Return the list of news items scraped from the source; implemented per site."""
        pass

    def do_seek_task(self):
        """Scrape the source and insert the items whose URL is not stored yet.

        Categories, label and language left as ``None`` by the scraper are
        copied from the information source first.

        Returns:
            The value returned by ``create_news_list_if_url_not_exists``.
        """
        news_list = self.get_news()
        for news in news_list:
            for field in self._INHERITED_FIELDS:
                if getattr(news, field) is None:
                    setattr(news, field, getattr(self.information_source, field))
        with get_session() as db:
            inserted_news = create_news_list_if_url_not_exists(db, news_list)
            log(f'Inserted {len(inserted_news)} {self.information_source.title} news items into the database.')
            return inserted_news

    def finish(self):
        """Close the tab and/or session opened by the constructor.

        The browser itself is intentionally left running; only the tab is closed.
        """
        if self.tab:
            self.tab.close()
        if self.session:
            self.session.close()

View File

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,32 @@
from DrissionPage.errors import ElementNotFoundError
from database.tinformationsource.model import TInformationSource
from database.tnews.model import TNews
from log.log_manager import logger
from seek.seek_base import SeekBase
from utils.time_utils import process_time
class Base(SeekBase):
    """Listing scraper that reads cards from the ``.index_cards__AdZtA`` container."""

    def __init__(self, information_source: TInformationSource):
        super().__init__(information_source)

    def get_news(self):
        """Return a list of ``TNews`` built from the cards.

        Cards that fail to parse are logged and left out. The record is
        created before the ``try`` so the missing-element log can include
        whatever title was read so far.
        """
        collected = []
        cards = self.session.s_ele('.index_cards__AdZtA')
        for card in cards.s_eles('.ant-col ant-col-6'):
            tnews = TNews()
            try:
                tnews.title = card.s_ele('tag:a').text
                tnews.url = card.s_ele('tag:a').link
                # The timestamp is the second span inside the small-text block.
                stamp = card.s_ele('.small_text__dR01h').s_eles('tag:span')[1].text
                tnews.occurrence_date = process_time(stamp)
                tnews.source = self.information_source.title
            except ElementNotFoundError as e:
                logger.error(f"ElementNotFoundError {tnews.title}: {e} - Failed to find element in news item.")
            except Exception as e:
                logger.error(f'Unexpected error occurred: {e}')
            else:
                collected.append(tnews)
        return collected

View File

@ -0,0 +1,50 @@
import datetime
from DrissionPage.errors import ElementNotFoundError
from database.tinformationsource.model import TInformationSource
from database.tnews.model import TNews
from log.log_manager import logger
from seek.content_base import ContentBase
class ThePaperContent(ContentBase):
    """Article-body extractor for thepaper.cn detail pages."""

    def __init__(self, news: TNews):
        super().__init__(news)

    def get_content(self):
        """Return the article text, or ``'not found element'`` as a fallback.

        The regular article wrapper is tried first; the second selector is
        the description used on video pages.
        """
        for selector in ('.^index_cententWrap', '.^header_desc'):
            try:
                return self.session.s_ele(selector).text
            except ElementNotFoundError:
                continue
        return 'not found element'
def get_content(information_source: TInformationSource) -> str:
    """Extract the article text for one thepaper.cn page.

    Args:
        information_source: object passed straight to ``ThePaperContent``
            (the ``__main__`` block of this module passes a ``TNews``).

    Returns:
        The text returned by ``ThePaperContent.get_content``.
    """
    extractor = ThePaperContent(information_source)
    try:
        return extractor.get_content()
    finally:
        # Release the page even when extraction raises.
        extractor.finish()
def content_task(news: TNews):
    """Run the content-seek task for one news item, logging start and end.

    The extractor is always finished, even if ``do_seek_task`` raises; in
    that case the exception propagates and the end message is not logged.
    """
    logger.info(f'{news.title} news_task start execute at {datetime.datetime.now()}')
    extractor = ThePaperContent(news)
    try:
        extractor.do_seek_task()
    finally:
        extractor.finish()
    logger.info(f'{news.title} news_task end execute at {datetime.datetime.now()}')
if __name__ == '__main__':
    # Manual smoke test: fetch one known article and log its text.
    logger.info('This module is not for direct call!')
    news_ = TNews()
    news_.url = 'https://www.thepaper.cn/newsDetail_forward_29745442'
    news_.is_static = True
    content = get_content(news_)
    logger.info(content)
    logger.info('Done.')

View File

@ -0,0 +1,38 @@
import datetime
from database.tinformationsource.model import TInformationSource
from log.log_manager import logger
from seek.the_paper_com.base import Base
class International(Base):
    """Scraper for the thepaper.cn international channel; all logic lives in ``Base``."""

    def __init__(self, information_source: TInformationSource):
        super().__init__(information_source)
def get_news(information_source: TInformationSource) -> list:
    """Scrape the channel page and return the parsed news list.

    The scraper is always finished, even if parsing raises.
    """
    instance = International(information_source)
    try:
        return instance.get_news()
    finally:
        instance.finish()
def news_task(information_source: TInformationSource):
    """Run the scrape-and-store task for this channel, logging start and end.

    The scraper is always finished, even if ``do_seek_task`` raises; in that
    case the exception propagates and the end message is not logged.
    """
    logger.info(f'{information_source.title} news_task start execute at {datetime.datetime.now()}')
    instance = International(information_source)
    try:
        instance.do_seek_task()
    finally:
        instance.finish()
    logger.info(f'{information_source.title} news_task end execute at {datetime.datetime.now()}')
if __name__ == '__main__':
    # Manual smoke test: scrape the international channel and print each item
    # without storing anything (news_task is the storing variant).
    logger.info('This module is not for direct call!')
    information_source_ = TInformationSource()
    information_source_.title = '国际_澎湃新闻'
    information_source_.url = 'https://www.thepaper.cn/channel_122908'
    information_source_.is_static = True
    news_list_ = get_news(information_source_)
    for news in news_list_:
        print(news)
    logger.info('Done.')

View File

@ -0,0 +1,38 @@
import datetime
from database.tinformationsource.model import TInformationSource
from log.log_manager import logger
from seek.the_paper_com.base import Base
class Tech(Base):
    """Scraper for the thepaper.cn technology channel; all logic lives in ``Base``."""

    def __init__(self, information_source: TInformationSource):
        super().__init__(information_source)
def get_news(information_source: TInformationSource) -> list:
    """Scrape the channel page and return the parsed news list.

    The scraper is always finished, even if parsing raises.
    """
    instance = Tech(information_source)
    try:
        return instance.get_news()
    finally:
        instance.finish()
def news_task(information_source: TInformationSource):
    """Run the scrape-and-store task for this channel, logging start and end.

    The scraper is always finished, even if ``do_seek_task`` raises; in that
    case the exception propagates and the end message is not logged.
    """
    logger.info(f'{information_source.title} news_task start execute at {datetime.datetime.now()}')
    instance = Tech(information_source)
    try:
        instance.do_seek_task()
    finally:
        instance.finish()
    logger.info(f'{information_source.title} news_task end execute at {datetime.datetime.now()}')
if __name__ == '__main__':
    # Manual smoke test: run the full scrape-and-store task for the tech
    # channel (get_news is the variant that only returns the items).
    logger.info('This module is not for direct call!')
    information_source_ = TInformationSource()
    information_source_.title = '科技_澎湃新闻'
    information_source_.url = 'https://www.thepaper.cn/channel_119908'
    information_source_.is_static = True
    news_task(information_source_)
    logger.info('Done.')

View File

Binary file not shown.

View File

@ -0,0 +1,58 @@
import datetime
from DrissionPage.errors import ElementNotFoundError
from database.tinformationsource.model import TInformationSource
from database.tnews.model import TNews
from log.log_manager import logger
from seek.content_base import ContentBase
class ArticleContent(ContentBase):
    """Article extractor for news.cn / xinhuanet detail pages."""

    def __init__(self, news: TNews):
        super().__init__(news)

    def get_content(self):
        """Return the text of ``#detailContent``, or ``'not found element'``."""
        try:
            return self.session.s_ele('#detailContent').text
        except ElementNotFoundError:
            return 'not found element'

    def get_occurrence_date(self):
        """Assemble the publication timestamp from the page header.

        Returns:
            A string of the form ``'<year>/<day> <time>'`` (also printed), or
            ``None`` when a header element is missing.
        """
        try:
            header = self.session.s_ele('.header-time left')
            year_text = header.s_ele('.year').text  # e.g. 2023
            day_text = header.s_ele('.day').text  # e.g. 12/27
            clock_text = header.s_ele('.time').text  # e.g. 08:05:11
        except ElementNotFoundError:
            return None
        stamp = f'{year_text}/{day_text} {clock_text}'
        print(stamp)
        return stamp
def get_content(information_source: TInformationSource) -> str:
    """Extract the article text for one page.

    ``get_occurrence_date`` is also invoked (it prints the timestamp); its
    result is not returned. The extractor is always finished, even if
    extraction raises.

    Args:
        information_source: object passed straight to ``ArticleContent``
            (the ``__main__`` block of this module passes a ``TNews``).

    Returns:
        The text returned by ``ArticleContent.get_content``.
    """
    article_content = ArticleContent(information_source)
    try:
        result = article_content.get_content()
        article_content.get_occurrence_date()
        return result
    finally:
        article_content.finish()
def content_task(news: TNews):
    """Run the content-seek task for one news item, logging start and end.

    The extractor is always finished, even if ``do_seek_task`` raises; in
    that case the exception propagates and the end message is not logged.
    """
    logger.info(f'{news.title} news_task start execute at {datetime.datetime.now()}')
    article_content = ArticleContent(news)
    try:
        article_content.do_seek_task()
    finally:
        article_content.finish()
    logger.info(f'{news.title} news_task end execute at {datetime.datetime.now()}')
if __name__ == '__main__':
    # Manual smoke test: fetch one known article and log its text.
    logger.info('This module is not for direct call!')
    news_ = TNews()
    news_.url = 'https://www.news.cn/politics/leaders/20241227/90e76f85ad4a43ba94802b07c5736e00/c.html'
    news_.is_static = True
    content = get_content(news_)
    logger.info(content)
    logger.info('Done.')

View File

@ -0,0 +1,59 @@
import datetime
from DrissionPage.errors import ElementNotFoundError
from database.tinformationsource.model import TInformationSource
from database.tnews.model import TNews
from log.log_manager import logger
from seek.seek_base import SeekBase
class Information(SeekBase):
    """List-page scraper for the xinhuanet.com focus news list."""

    def __init__(self, information_source: TInformationSource):
        super().__init__(information_source)

    def get_news(self):
        """Build one ``TNews`` per ``<li>`` under ``#focusListNews``.

        Only title, url and source are filled in; summary and occurrence
        date are not extracted here. Items that fail to parse are logged
        and skipped.
        """
        collected = []
        focus_list = self.session.s_ele('#focusListNews')
        for entry in focus_list.s_eles('tag:li'):
            try:
                rs_news = TNews()
                rs_news.title = entry.s_ele('tag:a').text
                rs_news.url = entry.s_ele('tag:a').link
                rs_news.source = self.information_source.title
                collected.append(rs_news)
            except ElementNotFoundError as e:
                logger.error(f"ElementNotFoundError: {e} - Failed to find element in news item.")
            except Exception as e:
                logger.error(f'Unexpected error occurred: {e}')
        return collected
def get_news(information_source: TInformationSource) -> list:
    """Scrape the focus list and return the parsed news list.

    The scraper is always finished, even if parsing raises.
    """
    instance = Information(information_source)
    try:
        return instance.get_news()
    finally:
        instance.finish()
def news_task(information_source: TInformationSource):
    """Run the scrape-and-store task for this source, logging start and end.

    The scraper is always finished, even if ``do_seek_task`` raises; in that
    case the exception propagates and the end message is not logged.
    """
    logger.info(f'{information_source.title} news_task start execute at {datetime.datetime.now()}')
    instance = Information(information_source)
    try:
        instance.do_seek_task()
    finally:
        instance.finish()
    logger.info(f'{information_source.title} news_task end execute at {datetime.datetime.now()}')
if __name__ == '__main__':
    # Manual smoke test: run the full scrape-and-store task for the xinhuanet
    # home page (get_news is the variant that only returns the items).
    logger.info('This module is not for direct call!')
    information_source_ = TInformationSource()
    information_source_.title = '资讯_新华网'
    information_source_.url = 'http://www.xinhuanet.com/'
    information_source_.is_static = True
    news_task(information_source_)
    logger.info('Done.')

View File

Binary file not shown.

Binary file not shown.

Binary file not shown.

25
seek/zhihu_com/demo.py Normal file
View File

@ -0,0 +1,25 @@
from database.database import get_session
from database.thotcontent.crud import get_hot_content_by_topic_id
from database.thottopic.crud import get_latest_hot_topic
if __name__ == '__main__':
    # Demo script: print the latest hot topic and the contents stored for it.
    with get_session() as db:
        # 1. Fetch the latest hot topic.
        latest_hot_topic = get_latest_hot_topic(db)
        print(latest_hot_topic)
        # NOTE(review): assumes get_latest_hot_topic may return None when no
        # topic is stored - confirm against the CRUD helper.
        if latest_hot_topic is not None:
            # 2. Fetch the contents that belong to the topic.
            hot_contents = get_hot_content_by_topic_id(db, latest_hot_topic.id)
            for hot_content in hot_contents:
                print(hot_content)
                # Character count of this content.
                print(len(hot_content.content))
            topic_content = [hot_content.content for hot_content in hot_contents]
            print(topic_content)
            print(len(topic_content))
            # Show up to the first three contents; slicing avoids the
            # IndexError the fixed [0]/[1]/[2] lookups raised on short lists.
            for text in topic_content[:3]:
                print('---------------------------------------------------------------')
                print(text)

116
seek/zhihu_com/demo2.py Normal file
View File

@ -0,0 +1,116 @@
import json
from DrissionPage import Chromium
from DrissionPage import ChromiumOptions
from DrissionPage.errors import ElementNotFoundError
from log.log_manager import logger
def get_content_from_meta(metas, itemprop):
    """Return the ``content`` attribute of the last meta whose ``itemprop`` matches.

    Args:
        metas: iterable of elements exposing ``attr(name)``.
        itemprop: the ``itemprop`` value to look for.

    Returns:
        The ``content`` attribute of the last matching element, or ``None``
        when nothing matches.
    """
    matches = [tag.attr('content') for tag in metas if tag.attr('itemprop') == itemprop]
    return matches[-1] if matches else None
class Zhihu:
    """Demo scraper that collects a Zhihu question page into a JSON string."""

    def __init__(self):
        # The previously created ``ChromiumOptions()`` object was never used
        # and has been dropped.
        self.browser = Chromium()
        self.tab = None

    def get_content(self, url):
        """Scrape the question page at ``url`` and return it as JSON text.

        The page is scrolled ten times to load more answers. Answers shorter
        than 100 or longer than 1000 characters are skipped, and collection
        stops once more than 5000 characters have been gathered.

        Errors are logged rather than raised; fields that could not be
        scraped are ``None`` (``topic_description`` is ``""``).

        Returns:
            JSON string with the question metadata and a ``contents`` list.
        """
        # The result fields stay module-level globals for backward
        # compatibility, but are reset on every call so a failure neither
        # raises NameError nor returns values from a previous call.
        global title, keywords, date_created, date_modified, follower_count, comment_count, answer_count, topic_description
        title = keywords = date_created = date_modified = None
        follower_count = comment_count = answer_count = None
        topic_description = ""
        contents_result = []
        try:
            self.tab = self.browser.new_tab()
            # Open the topic/question page.
            self.tab.get(url)
            for _ in range(10):
                # Wait for answers to be displayed.
                self.tab.wait.ele_displayed('.List-item')
                self.tab.wait(3)
                # Scroll to the bottom and back up a little to load more.
                self.tab.scroll.to_bottom()
                self.tab.wait(1)
                self.tab.scroll.up(100)
            # Question metadata lives in the first nine <meta> tags of
            # the .QuestionPage element.
            question_page = self.tab.ele('.QuestionPage')
            metas = question_page.eles('tag:meta')[0:9]
            title = get_content_from_meta(metas, 'name')
            answer_count = get_content_from_meta(metas, 'answerCount')
            comment_count = get_content_from_meta(metas, 'commentCount')
            keywords = get_content_from_meta(metas, 'keywords')
            date_created = get_content_from_meta(metas, 'dateCreated')
            date_modified = get_content_from_meta(metas, 'dateModified')
            follower_count = get_content_from_meta(metas, 'zhihu:followerCount')
            try:
                topic_description = question_page.ele('.RichText ztext css-ob6uua').text
            except ElementNotFoundError as e:
                logger.error(f"元素缺失不存在topic_description")
            # Collect the answer items.
            content_items = self.tab.ele('.Question-mainColumn').eles('.List-item')
            total_characters = 0
            for item in content_items:
                try:
                    content = item.ele('.RichContent-inner').text
                    content_len = len(content)
                    print(content_len)
                    if content_len > 1000 or content_len < 100:
                        logger.error(f"skip本条内容内容长度{content_len}")
                        continue
                    if total_characters > 5000:
                        logger.error(f"contents_result长度超过5000跳出循环")
                        break
                    total_characters += content_len
                    contents_result.append(content)
                    logger.info(f"contents_result长度{len(contents_result)}")
                except ElementNotFoundError as e:
                    logger.error(f"元素缺失:{str(e)}")
                except ValueError as e:
                    logger.error(f"热度值转换失败:{str(e)}")
        except ElementNotFoundError as e:
            logger.error(f"热榜容器元素未找到:{str(e)}")
        except Exception as e:
            logger.error(f"获取热榜数据异常:{str(e)}")
        finally:
            if self.tab:
                self.tab.close()
        # Return the collected data as JSON.
        return json.dumps({
            'title': title,
            'answer_count': answer_count,
            'comment_count': comment_count,
            'topic_description': topic_description,
            'keywords': keywords,
            'date_created': date_created,
            'date_modified': date_modified,
            'follower_count': follower_count,
            'contents': contents_result
        }, ensure_ascii=False)
if __name__ == '__main__':
    # Manual test: scrape one question and print the JSON and its length.
    logger.info('知乎采集测试')
    zhihu = Zhihu()
    result = zhihu.get_content('https://www.zhihu.com/question/588507809')
    print(len(result))
    print(result)
    logger.info('测试完成')

85
seek/zhihu_com/hot.py Normal file
View File

@ -0,0 +1,85 @@
import datetime
import re # 添加正则表达式库的导入
from DrissionPage.errors import ElementNotFoundError
from database.database import get_session
from database.tinformationsource.model import TInformationSource
from database.tnews.model import TNews
from log.log_manager import logger
from seek.seek_base import SeekBase
class ZhihuHot(SeekBase):
    """Scraper for the Zhihu hot list, driven through the browser tab."""

    def get_news(self):
        """Collect the entries of the Zhihu hot list.

        Errors are logged rather than raised; entries that fail to parse are
        skipped.

        Returns:
            list of ``TNews`` with title, url, heat (when it could be
            parsed), source and occurrence_date set.
        """
        news_result = []
        try:
            # Open the hot-list page and wait for the entries to show up.
            self.tab.get('https://www.zhihu.com/hot')
            self.tab.wait.ele_displayed('.HotItem')
            hot_items = self.tab.ele('.HotList-list').eles('.HotItem')
            for item in hot_items:
                try:
                    news = TNews()
                    link = item.ele('tag:a')
                    # Use the title attribute verbatim. The former
                    # ``.title()`` call was str.title(), which re-cased the
                    # headline and raised when the attribute was missing.
                    news.title = link.attr('title')
                    news.url = link.link
                    heat_value = item('.HotItem-metrics HotItem-metrics--bottom').text
                    logger.info(f"热度值:{heat_value}")
                    # Keep only the "<digits>万" part of the metrics text.
                    match = re.search(r'(\d+\s*万)', heat_value)
                    if match:
                        news.heat = match.group(1).replace(' ', '')
                    else:
                        logger.error(f"无法提取热度值:{heat_value}")
                    logger.info(f"提取到的热度值:{news.heat}")
                    news.source = self.information_source.title
                    news.occurrence_date = datetime.datetime.now()
                    news_result.append(news)
                except ElementNotFoundError as e:
                    logger.error(f"元素缺失:{str(e)}")
                except ValueError as e:
                    logger.error(f"热度值转换失败:{str(e)}")
        except ElementNotFoundError as e:
            logger.error(f"热榜容器元素未找到:{str(e)}")
        except Exception as e:
            logger.error(f"获取热榜数据异常:{str(e)}")
        return news_result
def get_news(information_source: TInformationSource) -> list:
    """Public entry point: scrape the hot list and return the news items.

    The scraper is finished afterwards so its tab does not stay open,
    matching the other ``get_news`` wrappers in this package.
    """
    zhihu = ZhihuHot(information_source)
    try:
        return zhihu.get_news()
    finally:
        zhihu.finish()
def news_task(information_source: TInformationSource):
    """Task entry point: scrape the hot list and log every item.

    A database session is opened around the scrape, but nothing is written
    to it here (the persisting call is disabled in this module).
    """
    with get_session() as db:
        for news in get_news(information_source):
            logger.info(f"采集到新闻:{news}")
if __name__ == '__main__':
    # Manual test: run the hot-list task against the live site.
    logger.info('知乎热榜采集测试')
    information_source_ = TInformationSource()
    information_source_.title = '热榜_知乎'
    information_source_.url = 'https://www.zhihu.com/hot'
    information_source_.is_static = False  # Zhihu needs browser rendering
    news_task(information_source_)
    logger.info('测试完成')

173
seek/zhihu_com/zhihu.py Normal file
View File

@ -0,0 +1,173 @@
import re
from DrissionPage import Chromium
from DrissionPage import ChromiumOptions
from DrissionPage.errors import ElementNotFoundError
from database.database import get_session
from database.thotcontent.crud import create_contents_top3_if_url_not_exists
from database.thotcontent.model import THotContent
from database.thottopic.crud import create_topics_if_url_not_exists, update_hot_topic
from database.thottopic.model import THotTopic
from log.log_manager import logger
def get_content_from_meta(metas, itemprop):
    """Return the ``content`` attribute of the last meta whose ``itemprop`` matches.

    Args:
        metas: iterable of elements exposing ``attr(name)``.
        itemprop: the ``itemprop`` value to look for.

    Returns:
        The ``content`` attribute of the last matching element, or ``None``
        when nothing matches.
    """
    matches = [tag.attr('content') for tag in metas if tag.attr('itemprop') == itemprop]
    return matches[-1] if matches else None
class Zhihu:
    """Scraper for Zhihu recommended topics and the answers under them."""

    def __init__(self):
        # The previously created ``ChromiumOptions()`` object was never used
        # and has been dropped.
        self.browser = Chromium()
        self.tab = None

    def get_topics(self):
        """Collect question topics from the Zhihu home page.

        Only links that start with ``https://www.zhihu.com/question/<id>``
        are kept, and the URL is cut down to that prefix. Errors are logged
        rather than raised.

        Returns:
            list of ``THotTopic`` with source, topic and url set.
        """
        topics_result = []
        try:
            self.tab = self.browser.new_tab()
            # Open the home page and wait for the recommended cards.
            self.tab.get('https://www.zhihu.com')
            self.tab.wait.ele_displayed('.Card TopstoryItem TopstoryItem-isRecommend')
            hot_items = self.tab.ele('.Topstory-content').eles('.Card TopstoryItem TopstoryItem-isRecommend')
            for item in hot_items:
                try:
                    topic = THotTopic()
                    topic.source = '知乎'
                    # Title and link come from the anchor inside the <h2>.
                    anchor = item.ele('tag:h2').ele('tag:a')
                    topic.topic = anchor.text
                    topic.url = anchor.link
                    pattern = r'^https://www\.zhihu\.com/question/\d+'
                    result = re.findall(pattern, topic.url)
                    if not result:
                        # Not a question link - skip it.
                        continue
                    topic.url = result[0]
                    topics_result.append(topic)
                except ElementNotFoundError as e:
                    logger.error(f"元素缺失:{str(e)}")
                except ValueError as e:
                    logger.error(f"热度值转换失败:{str(e)}")
        except ElementNotFoundError as e:
            logger.error(f"热榜容器元素未找到:{str(e)}")
        except Exception as e:
            logger.error(f"获取热榜数据异常:{str(e)}")
        finally:
            if self.tab:
                self.tab.close()
        return topics_result

    def get_content(self, topic: THotTopic, db):
        """Scrape the question page of ``topic`` and collect its answers.

        The topic is updated in place with the counts, keywords and dates
        read from the page's meta tags and saved through
        ``update_hot_topic``. Errors are logged rather than raised.

        Args:
            topic: topic whose ``url`` is opened and whose fields are filled.
            db: database session handed to ``update_hot_topic``.

        Returns:
            list of ``THotContent``, one per answer that parsed successfully.
        """
        contents_result = []
        try:
            self.tab = self.browser.new_tab()
            # Open the topic/question page.
            self.tab.get(topic.url)
            for _ in range(10):
                # Wait for answers to be displayed.
                self.tab.wait.ele_displayed('.List-item')
                self.tab.wait(3)
                # Scroll to the bottom and back up a little to load more.
                self.tab.scroll.to_bottom()
                self.tab.wait(1)
                self.tab.scroll.up(100)
            # Question metadata lives in the first nine <meta> tags of
            # the .QuestionPage element.
            question_page = self.tab.ele('.QuestionPage')
            metas = question_page.eles('tag:meta')[0:9]
            answer_count = get_content_from_meta(metas, 'answerCount')
            comment_count = get_content_from_meta(metas, 'commentCount')
            keywords = get_content_from_meta(metas, 'keywords')
            date_created = get_content_from_meta(metas, 'dateCreated')
            date_modified = get_content_from_meta(metas, 'dateModified')
            follower_count = get_content_from_meta(metas, 'zhihu:followerCount')
            topic.content_count = int(answer_count)
            topic.comment_count = int(comment_count)
            topic.follower_count = int(follower_count)
            topic.keywords = keywords
            topic.date_created = date_created
            topic.date_modified = date_modified
            try:
                topic.topic_description = question_page.ele('.RichText ztext css-ob6uua').text
            except ElementNotFoundError as e:
                logger.error(f"元素缺失不存在topic_description")
            update_hot_topic(db, topic)
            # Collect the answer items.
            content_items = self.tab.ele('.Question-mainColumn').eles('.List-item')
            for item in content_items:
                try:
                    content = THotContent()
                    content.topic_id = topic.id
                    content.url = item.ele('.ContentItem-time').ele('tag:a').link
                    upvote_str = item.ele('.Button VoteButton VoteButton--up FEfUrdfMIKpQDJDqkjte').text
                    match = re.search(r'(\d+\.?\d*)\s*万?', upvote_str)
                    if match:
                        number = float(match.group(1))
                        # Scale only when the text carries the 万 (x10000)
                        # unit. The previous test was ``'' in upvote_str``,
                        # which is always true and scaled every count.
                        content.content_upvote_count = int(number * 10000) if '万' in upvote_str else int(number)
                    else:
                        content.content_upvote_count = 0
                    comment_str = item.ele('.Button ContentItem-action FEfUrdfMIKpQDJDqkjte Button--plain Button--withIcon Button--withLabel fEPKGkUK5jyc4fUuT0QP B46v1Ak6Gj5sL2JTS4PY RuuQ6TOh2cRzJr6WlyQp').text
                    # Comment counts may use thousands separators, e.g. 1,234.
                    match = re.search(r'(\d{1,3}(?:,\d{3})*)', comment_str)
                    if match:
                        content.content_comment_count = int(match.group(1).replace(',', ''))
                    else:
                        content.content_comment_count = 0
                    content.content = item.ele('.RichContent-inner').text
                    contents_result.append(content)
                except ElementNotFoundError as e:
                    logger.error(f"元素缺失:{str(e)}")
                except ValueError as e:
                    logger.error(f"热度值转换失败:{str(e)}")
        except ElementNotFoundError as e:
            logger.error(f"热榜容器元素未找到:{str(e)}")
        except Exception as e:
            logger.error(f"获取热榜数据异常:{str(e)}")
        finally:
            if self.tab:
                self.tab.close()
        return contents_result
def get_topics() -> list:
    """Create a ``Zhihu`` scraper and return the topics it collects."""
    return Zhihu().get_topics()
def gather_task():
    """Task entry point: collect topics, store them, then collect their answers.

    Answers are scraped only for the topics returned by
    ``create_topics_if_url_not_exists``.
    """
    with get_session() as db:
        zhihu = Zhihu()
        inserted_topics = create_topics_if_url_not_exists(db, zhihu.get_topics())
        for topic in inserted_topics:
            logger.info(f"采集到话题:{topic}")
            contents = zhihu.get_content(topic, db)
            create_contents_top3_if_url_not_exists(db, contents)
if __name__ == '__main__':
    # Manual test: run the full gather task against the live site.
    logger.info('知乎采集测试')
    gather_task()
    logger.info('测试完成')

156
seek/zhihu_com/zhihu_hot.py Normal file
View File

@ -0,0 +1,156 @@
from DrissionPage import Chromium
from DrissionPage import ChromiumOptions
from DrissionPage.errors import ElementNotFoundError
from log.log_manager import logger
def get_content_from_meta(metas, itemprop):
    """Return the ``content`` attribute of the last meta whose ``itemprop`` matches.

    Args:
        metas: iterable of elements exposing ``attr(name)``.
        itemprop: the ``itemprop`` value to look for.

    Returns:
        The ``content`` attribute of the last matching element, or ``None``
        when nothing matches.
    """
    matches = [tag.attr('content') for tag in metas if tag.attr('itemprop') == itemprop]
    return matches[-1] if matches else None
class ZhihuHot:
    """Standalone scraper for the Zhihu hot list and its question pages."""

    def __init__(self):
        # The previously created ``ChromiumOptions()`` object was never used
        # and has been dropped.
        self.browser = Chromium()

    def get_topic_url_list(self) -> list:
        """Return the link of every entry on the Zhihu hot list.

        Errors are logged rather than raised; the tab opened for the scrape
        is always closed.
        """
        _topic_url_list = []
        _tab = None
        try:
            _tab = self.browser.new_tab()
            # Open the hot-list page and wait for the entries to show up.
            _tab.get('https://www.zhihu.com/hot')
            _tab.wait.ele_displayed('.HotItem')
            hot_items = _tab.ele('.HotList-list').eles('.HotItem')
            for item in hot_items:
                try:
                    url = item.ele('tag:a').link
                    _topic_url_list.append(url)
                except ElementNotFoundError as e:
                    logger.error(f"元素缺失:{str(e)}")
                except ValueError as e:
                    logger.error(f"热度值转换失败:{str(e)}")
        except ElementNotFoundError as e:
            logger.error(f"热榜容器元素未找到:{str(e)}")
        except Exception as e:
            logger.error(f"获取热榜数据异常:{str(e)}")
        finally:
            if _tab:
                _tab.close()
        return _topic_url_list

    def get_content(self, url):
        """Scrape the question page at ``url`` into a dict.

        The page is scrolled ten times to load more answers. Answers shorter
        than 100 or longer than 1000 characters are skipped, and collection
        stops once more than 5000 characters have been gathered.

        Errors are logged rather than raised; fields that could not be
        scraped are ``None`` (``topic_description`` is ``""``).

        Returns:
            dict with the question metadata, ``url`` and a ``contents`` list.
        """
        contents_result = []
        _tab = None
        # The result fields stay module-level globals for backward
        # compatibility, but are reset on every call so a failure neither
        # raises NameError nor returns values from a previous call.
        global title, keywords, date_created, date_modified, follower_count, comment_count, answer_count, topic_description
        title = keywords = date_created = date_modified = None
        follower_count = comment_count = answer_count = None
        topic_description = ""
        try:
            _tab = self.browser.new_tab()
            # Open the topic/question page.
            _tab.get(url)
            for _ in range(10):
                # Wait for answers to be displayed.
                _tab.wait.ele_displayed('.List-item')
                _tab.wait(3)
                # Scroll to the bottom and back up a little to load more.
                _tab.scroll.to_bottom()
                _tab.wait(1)
                _tab.scroll.up(100)
            # Question metadata lives in the first nine <meta> tags of
            # the .QuestionPage element.
            question_page = _tab.ele('.QuestionPage')
            metas = question_page.eles('tag:meta')[0:9]
            title = get_content_from_meta(metas, 'name')
            answer_count = get_content_from_meta(metas, 'answerCount')
            comment_count = get_content_from_meta(metas, 'commentCount')
            keywords = get_content_from_meta(metas, 'keywords')
            date_created = get_content_from_meta(metas, 'dateCreated')
            date_modified = get_content_from_meta(metas, 'dateModified')
            follower_count = get_content_from_meta(metas, 'zhihu:followerCount')
            try:
                # Expand the description first when a "more" button exists.
                unfold_topic_description = question_page.ele('.^Button QuestionRichText-more')
                if unfold_topic_description:
                    unfold_topic_description.click()
                topic_description = question_page.ele('.^QuestionRichText').text
            except ElementNotFoundError as e:
                logger.error(f"元素缺失不存在topic_description")
            # Collect the answer items.
            content_items = _tab.ele('.Question-mainColumn').eles('.List-item')
            total_characters = 0
            for item in content_items:
                try:
                    content = item.ele('.RichContent-inner').text
                    content_len = len(content)
                    print(content_len)
                    if content_len > 1000 or content_len < 100:
                        logger.error(f"skip本条内容内容长度{content_len}")
                        continue
                    if total_characters > 5000:
                        logger.error(f"contents_result长度超过5000跳出循环")
                        break
                    total_characters += content_len
                    contents_result.append(content)
                    logger.info(f"contents_result长度{len(contents_result)}")
                except ElementNotFoundError as e:
                    logger.error(f"元素缺失:{str(e)}")
                except ValueError as e:
                    logger.error(f"热度值转换失败:{str(e)}")
        except ElementNotFoundError as e:
            logger.error(f"热榜容器元素未找到:{str(e)}")
        except Exception as e:
            logger.error(f"获取热榜数据异常:{str(e)}")
        finally:
            if _tab:
                _tab.close()
        return {
            'title': title,
            'topic_description': topic_description,
            'keywords': keywords,
            'url': url,
            'contents': contents_result,
            'date_created': date_created,
            'date_modified': date_modified,
            'follower_count': follower_count,
            'answer_count': answer_count,
            'comment_count': comment_count
        }
if __name__ == '__main__':
    # Manual test: scrape one question and print the result dict and its
    # number of keys (get_topic_url_list is the hot-list variant).
    logger.info('知乎采集测试')
    zhihu_hot = ZhihuHot()
    result = zhihu_hot.get_content('https://www.zhihu.com/question/14351228309')
    print(len(result))
    print(result)
    logger.info('测试完成')