174 lines
7.1 KiB
Python
174 lines
7.1 KiB
Python
import re
|
||
|
||
from DrissionPage import Chromium
|
||
from DrissionPage import ChromiumOptions
|
||
from DrissionPage.errors import ElementNotFoundError
|
||
|
||
from database.database import get_session
|
||
from database.thotcontent.crud import create_contents_top3_if_url_not_exists
|
||
from database.thotcontent.model import THotContent
|
||
from database.thottopic.crud import create_topics_if_url_not_exists, update_hot_topic
|
||
from database.thottopic.model import THotTopic
|
||
from log.log_manager import logger
|
||
|
||
def get_content_from_meta(metas, itemprop):
    """Return the ``content`` attribute of the meta element tagged *itemprop*.

    Args:
        metas: Iterable of elements exposing ``attr(name)``.
        itemprop: The ``itemprop`` attribute value to look for.

    Returns:
        The ``content`` attribute of the last element whose ``itemprop``
        equals *itemprop*, or ``None`` when no element matches.
    """
    matched = [
        element.attr('content')
        for element in metas
        if element.attr('itemprop') == itemprop
    ]
    # When several elements share the itemprop, the last one wins.
    return matched[-1] if matched else None
|
||
|
||
|
||
class Zhihu:
    """Collect Zhihu feed topics and question answers through a DrissionPage browser.

    Attributes:
        browser: The ``Chromium`` object every scraping tab is opened from.
        tab: The tab currently being scraped, or ``None`` when none is open.
    """

    # Locators are copied verbatim from the page markup.
    # NOTE(review): the hashed class names (e.g. FEfUrdfMIKpQDJDqkjte) look
    # build-generated and may change when the site is redeployed -- confirm.
    _FEED_CARD = '.Card TopstoryItem TopstoryItem-isRecommend'
    _UPVOTE_BUTTON = '.Button VoteButton VoteButton--up FEfUrdfMIKpQDJDqkjte'
    _COMMENT_BUTTON = (
        '.Button ContentItem-action FEfUrdfMIKpQDJDqkjte Button--plain '
        'Button--withIcon Button--withLabel fEPKGkUK5jyc4fUuT0QP '
        'B46v1Ak6Gj5sL2JTS4PY RuuQ6TOh2cRzJr6WlyQp'
    )

    # Canonical question URL: everything after the numeric id is dropped.
    _QUESTION_URL_RE = re.compile(r'^https://www\.zhihu\.com/question/\d+')
    # A number optionally followed by the unit 万 (10,000).
    _UPVOTE_RE = re.compile(r'(\d+\.?\d*)\s*万?')
    # An integer with optional thousands separators ("1,234" or "1234").
    _COMMENT_RE = re.compile(r'(\d+(?:,\d{3})*)')

    def __init__(self):
        """Create the browser object; no tab is opened until a scrape starts."""
        self.browser = Chromium()
        self.tab = None

    def _close_tab(self):
        """Close the current tab, if any, and forget it.

        Clearing ``self.tab`` keeps a later ``finally`` from closing the same
        tab twice when opening the next tab fails.
        """
        tab, self.tab = self.tab, None
        if tab:
            tab.close()

    @staticmethod
    def _parse_upvote_count(text):
        """Parse an upvote button label such as ``'赞同 1.2 万'`` into an int.

        Args:
            text: The button label.

        Returns:
            The upvote count, scaled by 10,000 when the label contains 万,
            or 0 when the label contains no number.
        """
        match = Zhihu._UPVOTE_RE.search(text)
        if not match:
            return 0
        number = float(match.group(1))
        if '万' in text:
            # round() rather than int(): 0.57 * 10000 is 5699.999999999999.
            return round(number * 10000)
        return int(number)

    @staticmethod
    def _parse_comment_count(text):
        """Parse a comment button label such as ``'1,234 条评论'`` into an int.

        Args:
            text: The button label.

        Returns:
            The comment count, or 0 when the label contains no number.
        """
        match = Zhihu._COMMENT_RE.search(text)
        if not match:
            return 0
        return int(match.group(1).replace(',', ''))

    def get_topics(self):
        """Fetch question topics from the Zhihu home page.

        Opens a new tab on https://www.zhihu.com, reads every feed card inside
        ``.Topstory-content`` and keeps the cards whose title link is a
        question URL. Errors are logged rather than raised, and the tab is
        always closed.

        Returns:
            A list of ``THotTopic`` objects with ``source``, ``topic`` and
            ``url`` set; empty when nothing could be collected.
        """
        topics_result = []
        self.tab = None
        try:
            self.tab = self.browser.new_tab()
            # Open the Zhihu home page.
            self.tab.get('https://www.zhihu.com')

            # Wait for the feed cards to be displayed.
            self.tab.wait.ele_displayed(self._FEED_CARD)

            # Collect every feed card.
            hot_items = self.tab.ele('.Topstory-content').eles(self._FEED_CARD)

            for item in hot_items:
                try:
                    topic = THotTopic()
                    topic.source = '知乎'
                    # Title text and link both come from the <h2><a> element.
                    title_link = item.ele('tag:h2').ele('tag:a')
                    topic.topic = title_link.text
                    link = title_link.link
                    if not link:
                        # No link on the anchor: skip this card instead of
                        # letting a TypeError abort the remaining cards.
                        continue
                    match = self._QUESTION_URL_RE.match(link)
                    if not match:
                        # Not a question page (e.g. an article or video).
                        continue
                    topic.url = match.group(0)
                    topics_result.append(topic)
                except ElementNotFoundError as e:
                    logger.error(f"元素缺失:{str(e)}")
                except ValueError as e:
                    logger.error(f"热度值转换失败:{str(e)}")

        except ElementNotFoundError as e:
            logger.error(f"热榜容器元素未找到:{str(e)}")
        except Exception as e:
            logger.error(f"获取热榜数据异常:{str(e)}")
        finally:
            self._close_tab()
        return topics_result

    def get_content(self, topic: "THotTopic", db):
        """Fetch a question's metadata and its answers.

        Opens ``topic.url`` in a new tab, scrolls to load more answers, copies
        the question metadata onto *topic*, persists it with
        ``update_hot_topic`` and then reads every ``.List-item`` answer.
        Errors are logged rather than raised, and the tab is always closed.

        Args:
            topic: The topic to visit; it is updated in place.
            db: Database session handed to ``update_hot_topic``.

        Returns:
            A list of ``THotContent`` objects, one per answer that could be
            parsed; empty when the page could not be processed.
        """
        contents_result = []
        self.tab = None
        try:
            self.tab = self.browser.new_tab()
            # Open the question page.
            self.tab.get(topic.url)

            for _ in range(10):
                # Wait for an answer item, then give the page time to settle.
                self.tab.wait.ele_displayed('.List-item')
                self.tab.wait(3)
                # Scroll to the bottom and back up a little; presumably this
                # triggers lazy loading of further answers.
                self.tab.scroll.to_bottom()
                self.tab.wait(1)
                self.tab.scroll.up(100)

            question_page = self.tab.ele('.QuestionPage')
            # The question attributes are read from the first nine <meta>
            # tags of the question page.
            metas = question_page.eles('tag:meta')[:9]

            for field, itemprop in (
                ('content_count', 'answerCount'),
                ('comment_count', 'commentCount'),
                ('follower_count', 'zhihu:followerCount'),
            ):
                raw = get_content_from_meta(metas, itemprop)
                if raw is None:
                    # A missing count leaves the field untouched instead of
                    # aborting the whole collection with int(None).
                    logger.error(f"元素缺失:不存在{itemprop}")
                    continue
                setattr(topic, field, int(raw))
            topic.keywords = get_content_from_meta(metas, 'keywords')
            topic.date_created = get_content_from_meta(metas, 'dateCreated')
            topic.date_modified = get_content_from_meta(metas, 'dateModified')

            try:
                topic.topic_description = question_page.ele('.RichText ztext css-ob6uua').text
            except ElementNotFoundError:
                # The description is optional; log it and carry on.
                logger.error("元素缺失:不存在topic_description")
            update_hot_topic(db, topic)

            # Collect every answer item.
            content_items = self.tab.ele('.Question-mainColumn').eles('.List-item')

            for item in content_items:
                try:
                    content = THotContent()
                    content.topic_id = topic.id
                    content.url = item.ele('.ContentItem-time').ele('tag:a').link
                    upvote_str = item.ele(self._UPVOTE_BUTTON).text
                    content.content_upvote_count = self._parse_upvote_count(upvote_str)
                    comment_str = item.ele(self._COMMENT_BUTTON).text
                    content.content_comment_count = self._parse_comment_count(comment_str)
                    content.content = item.ele('.RichContent-inner').text
                    contents_result.append(content)
                except ElementNotFoundError as e:
                    logger.error(f"元素缺失:{str(e)}")
                except ValueError as e:
                    logger.error(f"热度值转换失败:{str(e)}")

        except ElementNotFoundError as e:
            logger.error(f"热榜容器元素未找到:{str(e)}")
        except Exception as e:
            logger.error(f"获取热榜数据异常:{str(e)}")
        finally:
            self._close_tab()
        return contents_result
|
||
|
||
|
||
|
||
def get_topics() -> list:
    """Return the topics collected by a freshly created ``Zhihu`` scraper."""
    return Zhihu().get_topics()
|
||
|
||
def gather_task():
    """Run one collection pass: scrape topics, store them, then scrape their answers.

    Inside a single database session, the scraped topics are passed to
    ``create_topics_if_url_not_exists``. For each topic it returns, the
    answers are scraped and passed to
    ``create_contents_top3_if_url_not_exists``.
    """
    with get_session() as db:
        scraper = Zhihu()
        new_topics = create_topics_if_url_not_exists(db, scraper.get_topics())
        for topic in new_topics:
            logger.info(f"采集到话题:{topic}")
            answers = scraper.get_content(topic, db)
            create_contents_top3_if_url_not_exists(db, answers)
|
||
|
||
|
||
if __name__ == "__main__":
    # Manual test run: log a start marker, execute one full collection
    # pass, then log completion.
    logger.info('知乎采集测试')
    gather_task()
    logger.info('测试完成')
|
||
|