import peter
This commit is contained in:
0
seek/zhihu_com/__init__.py
Normal file
0
seek/zhihu_com/__init__.py
Normal file
BIN
seek/zhihu_com/__pycache__/__init__.cpython-312.pyc
Normal file
BIN
seek/zhihu_com/__pycache__/__init__.cpython-312.pyc
Normal file
Binary file not shown.
BIN
seek/zhihu_com/__pycache__/zhihu.cpython-312.pyc
Normal file
BIN
seek/zhihu_com/__pycache__/zhihu.cpython-312.pyc
Normal file
Binary file not shown.
BIN
seek/zhihu_com/__pycache__/zhihu_hot.cpython-312.pyc
Normal file
BIN
seek/zhihu_com/__pycache__/zhihu_hot.cpython-312.pyc
Normal file
Binary file not shown.
25
seek/zhihu_com/demo.py
Normal file
25
seek/zhihu_com/demo.py
Normal file
@ -0,0 +1,25 @@
|
||||
from database.database import get_session
|
||||
from database.thotcontent.crud import get_hot_content_by_topic_id
|
||||
from database.thottopic.crud import get_latest_hot_topic
|
||||
|
||||
if __name__ == '__main__':
    # Demo script: print the most recent hot topic and the answers stored for it.
    with get_session() as db:
        # 1. Fetch the latest hot topic.
        latest_hot_topic = get_latest_hot_topic(db)
        print(latest_hot_topic)

        # 2. Fetch the contents stored for that topic.
        hot_contents = get_hot_content_by_topic_id(db, latest_hot_topic.id)
        for hot_content in hot_contents:
            print(hot_content)
            # Character count of this answer's text.
            print(len(hot_content.content))

        topic_content = [hot_content.content for hot_content in hot_contents]
        print(topic_content)
        print(len(topic_content))

        # Show at most the first three answers. Slicing (instead of indexing
        # [0], [1], [2]) avoids an IndexError when fewer than three are stored.
        for answer_text in topic_content[:3]:
            print('---------------------------------------------------------------')
            print(answer_text)
|
||||
116
seek/zhihu_com/demo2.py
Normal file
116
seek/zhihu_com/demo2.py
Normal file
@ -0,0 +1,116 @@
|
||||
import json
|
||||
|
||||
from DrissionPage import Chromium
|
||||
from DrissionPage import ChromiumOptions
|
||||
from DrissionPage.errors import ElementNotFoundError
|
||||
|
||||
from log.log_manager import logger
|
||||
|
||||
|
||||
def get_content_from_meta(metas, itemprop):
    """Return the ``content`` attribute of the last meta element whose
    ``itemprop`` attribute equals *itemprop*.

    Args:
        metas: Iterable of elements exposing an ``attr(name)`` method.
        itemprop: The ``itemprop`` value to look for.

    Returns:
        The ``content`` attribute of the last matching element, or ``None``
        when no element matches.
    """
    matching_contents = [
        element.attr('content')
        for element in metas
        if element.attr('itemprop') == itemprop
    ]
    # Later matches override earlier ones, so the final entry wins.
    return matching_contents[-1] if matching_contents else None
|
||||
|
||||
|
||||
class Zhihu:
    """Scrape a single Zhihu question page through a DrissionPage Chromium browser."""

    def __init__(self):
        """Create the browser handle; tabs are opened per call in get_content()."""
        self.browser = Chromium()
        self.tab = None

    def get_content(self, url):
        """Collect question metadata and a sample of answers from a question page.

        Args:
            url: Address of the question page to open.

        Returns:
            str: A JSON document (non-ASCII characters kept as-is) with the
            keys ``title``, ``answer_count``, ``comment_count``,
            ``topic_description``, ``keywords``, ``date_created``,
            ``date_modified``, ``follower_count`` and ``contents``.
            Metadata that could not be read is ``null``;
            ``topic_description`` defaults to an empty string and
            ``contents`` to an empty list.
        """
        # Plain locals with defaults. The previous ``global`` declaration made
        # the final return raise NameError (or leak values from an earlier
        # call) whenever scraping failed before the assignments were reached.
        title = keywords = date_created = date_modified = None
        follower_count = comment_count = answer_count = None
        topic_description = ""
        contents_result = []
        try:
            self.tab = self.browser.new_tab()
            # Open the topic/question page.
            self.tab.get(url)

            # Ten rounds of: wait for answers, scroll to the bottom, then
            # nudge back up, so that more answers get loaded.
            for _ in range(10):
                self.tab.wait.ele_displayed('.List-item')
                self.tab.wait(3)
                self.tab.scroll.to_bottom()
                self.tab.wait(1)
                self.tab.scroll.up(100)

            # Question attributes are read from the first nine <meta> tags
            # inside the QuestionPage container.
            question_page = self.tab.ele('.QuestionPage')
            metas = question_page.eles('tag:meta')[0:9]
            title = get_content_from_meta(metas, 'name')
            answer_count = get_content_from_meta(metas, 'answerCount')
            comment_count = get_content_from_meta(metas, 'commentCount')
            keywords = get_content_from_meta(metas, 'keywords')
            date_created = get_content_from_meta(metas, 'dateCreated')
            date_modified = get_content_from_meta(metas, 'dateModified')
            follower_count = get_content_from_meta(metas, 'zhihu:followerCount')

            # The description is optional; a missing element is only logged.
            try:
                topic_description = question_page.ele('.RichText ztext css-ob6uua').text
            except ElementNotFoundError:
                logger.error(f"元素缺失:不存在topic_description")

            # Walk over every answer item in the main column.
            content_items = self.tab.ele('.Question-mainColumn').eles('.List-item')

            total_characters = 0
            for item in content_items:
                try:
                    content = item.ele('.RichContent-inner').text
                    # Character count of this answer.
                    content_len = len(content)
                    print(content_len)
                    # Keep only answers between 100 and 1000 characters.
                    if content_len > 1000 or content_len < 100:
                        logger.error(f"skip本条内容,内容长度:{content_len}")
                        continue
                    # Stop once more than 5000 characters have been collected.
                    if total_characters > 5000:
                        logger.error(f"contents_result长度超过5000,跳出循环")
                        break
                    total_characters += content_len
                    contents_result.append(content)
                    logger.info(f"contents_result长度:{len(contents_result)}")
                except ElementNotFoundError as e:
                    logger.error(f"元素缺失:{str(e)}")
                except ValueError as e:
                    logger.error(f"热度值转换失败:{str(e)}")

        except ElementNotFoundError as e:
            logger.error(f"热榜容器元素未找到:{str(e)}")
        except Exception as e:
            logger.error(f"获取热榜数据异常:{str(e)}")
        finally:
            if self.tab:
                self.tab.close()
                # Forget the closed tab so a later failing new_tab() does not
                # cause this handle to be closed a second time.
                self.tab = None

        # Return the collected data as JSON.
        return json.dumps({
            'title': title,
            'answer_count': answer_count,
            'comment_count': comment_count,
            'topic_description': topic_description,
            'keywords': keywords,
            'date_created': date_created,
            'date_modified': date_modified,
            'follower_count': follower_count,
            'contents': contents_result
        }, ensure_ascii=False)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Manual smoke test: scrape one question page and dump the JSON payload.
    logger.info('知乎采集测试')
    payload = Zhihu().get_content('https://www.zhihu.com/question/588507809')
    print(len(payload))
    print(payload)
    logger.info('测试完成')
|
||||
|
||||
85
seek/zhihu_com/hot.py
Normal file
85
seek/zhihu_com/hot.py
Normal file
@ -0,0 +1,85 @@
|
||||
import datetime
|
||||
import re # 添加正则表达式库的导入
|
||||
|
||||
from DrissionPage.errors import ElementNotFoundError
|
||||
|
||||
from database.database import get_session
|
||||
from database.tinformationsource.model import TInformationSource
|
||||
from database.tnews.model import TNews
|
||||
from log.log_manager import logger
|
||||
from seek.seek_base import SeekBase
|
||||
|
||||
|
||||
class ZhihuHot(SeekBase):
    """Collector for the Zhihu hot list page.

    NOTE(review): ``self.tab`` and ``self.information_source`` are not
    assigned in this class; presumably ``SeekBase`` provides them — confirm.
    """

    def get_news(self):
        """Scrape the hot list and return one ``TNews`` per entry.

        Returns:
            list: ``TNews`` objects with ``title``, ``url``, ``heat`` (only
            when it could be extracted), ``source`` and ``occurrence_date``
            set. An entry with a missing element is logged and skipped; any
            other failure is logged and the entries gathered so far are
            returned.
        """
        news_result = []
        try:
            # Open the hot list page and wait for its entries to show up.
            self.tab.get('https://www.zhihu.com/hot')
            self.tab.wait.ele_displayed('.HotItem')

            # All entries of the hot list.
            hot_items = self.tab.ele('.HotList-list').eles('.HotItem')

            for item in hot_items:
                try:
                    news = TNews()
                    # Title and link both come from the entry's first anchor.
                    anchor = item.ele('tag:a')
                    # Store the title verbatim. The earlier ``.title()`` call
                    # was ``str.title()``, which re-capitalises every word and
                    # corrupts Latin text in headlines (e.g. "iPhone").
                    news.title = anchor.attr('title')
                    news.url = anchor.link

                    # Heat text of the entry; only the "<digits>万" part is kept.
                    heat_value = item('.HotItem-metrics HotItem-metrics--bottom').text
                    logger.info(f"热度值:{heat_value}")
                    match = re.search(r'(\d+\s*万)', heat_value)

                    if match:
                        # Drop any space between the digits and the unit.
                        news.heat = match.group(1).replace(' ', '')
                    else:
                        logger.error(f"无法提取热度值:{heat_value}")
                    logger.info(f"提取到的热度值:{news.heat}")

                    news.source = self.information_source.title
                    news.occurrence_date = datetime.datetime.now()
                    news_result.append(news)
                except ElementNotFoundError as e:
                    logger.error(f"元素缺失:{str(e)}")
                except ValueError as e:
                    logger.error(f"热度值转换失败:{str(e)}")

        except ElementNotFoundError as e:
            logger.error(f"热榜容器元素未找到:{str(e)}")
        except Exception as e:
            logger.error(f"获取热榜数据异常:{str(e)}")

        return news_result
|
||||
|
||||
def get_news(information_source: TInformationSource) -> list:
    """Public entry point: build a ``ZhihuHot`` collector for
    *information_source* and return the result of its ``get_news()``."""
    return ZhihuHot(information_source).get_news()
|
||||
|
||||
def news_task(information_source: TInformationSource):
    """Task entry point: collect the news inside a database session and log
    every collected item.

    Args:
        information_source: Source description handed on to ``get_news``.
    """
    with get_session() as db:
        collected = get_news(information_source)
        # Persisting is currently switched off:
        # create_news_list_if_url_not_exists(db, news_list)
        for entry in collected:
            logger.info(f"采集到新闻:{entry}")
|
||||
|
||||
if __name__ == '__main__':
    # Manual smoke test for the hot list collector.
    logger.info('知乎热榜采集测试')

    source = TInformationSource()
    source.is_static = False  # Zhihu needs a real browser to render the page
    source.url = 'https://www.zhihu.com/hot'
    source.title = '热榜_知乎'

    # Run the collection task once.
    news_task(source)
    logger.info('测试完成')
|
||||
|
||||
173
seek/zhihu_com/zhihu.py
Normal file
173
seek/zhihu_com/zhihu.py
Normal file
@ -0,0 +1,173 @@
|
||||
import re
|
||||
|
||||
from DrissionPage import Chromium
|
||||
from DrissionPage import ChromiumOptions
|
||||
from DrissionPage.errors import ElementNotFoundError
|
||||
|
||||
from database.database import get_session
|
||||
from database.thotcontent.crud import create_contents_top3_if_url_not_exists
|
||||
from database.thotcontent.model import THotContent
|
||||
from database.thottopic.crud import create_topics_if_url_not_exists, update_hot_topic
|
||||
from database.thottopic.model import THotTopic
|
||||
from log.log_manager import logger
|
||||
|
||||
def get_content_from_meta(metas, itemprop):
    """Look up a meta element by its ``itemprop`` and return its ``content``.

    Args:
        metas: Iterable of elements exposing an ``attr(name)`` method.
        itemprop: The ``itemprop`` value to search for.

    Returns:
        The ``content`` attribute of the last element whose ``itemprop``
        matches, or ``None`` if there is no such element.
    """
    found = None
    for element in metas:
        if element.attr('itemprop') != itemprop:
            continue
        # No early exit: a later match replaces an earlier one.
        found = element.attr('content')
    return found
|
||||
|
||||
|
||||
class Zhihu:
    """Scrape recommended topics from the Zhihu home page and the answers of
    a question page, using a DrissionPage Chromium browser."""

    def __init__(self):
        """Create the browser handle; tabs are opened per scraping call."""
        self.browser = Chromium()
        self.tab = None

    @staticmethod
    def _parse_upvote_count(text):
        """Turn a vote-button label into an integer; a number followed
        anywhere by 万 is multiplied by 10,000. Returns 0 without a number."""
        match = re.search(r'(\d+\.?\d*)\s*万?', text)
        if not match:
            return 0
        number = float(match.group(1))
        if '万' in text:
            # round() instead of int(): the floating-point product may land
            # just below the intended integer and int() would truncate it.
            return round(number * 10000)
        return int(number)

    @staticmethod
    def _parse_comment_count(text):
        """Extract the first number from a comment-button label, with or
        without thousands separators. Returns 0 without a number."""
        # The former pattern \d{1,3}(?:,\d{3})* cut "1234" down to 123.
        match = re.search(r'\d[\d,]*', text)
        if not match:
            return 0
        return int(match.group(0).replace(',', ''))

    def _close_tab(self):
        """Close the current tab, if any, and forget it so that it is never
        closed a second time by a later call."""
        if self.tab:
            self.tab.close()
            self.tab = None

    def get_topics(self):
        """Collect recommended question topics from the Zhihu home page.

        Returns:
            list: ``THotTopic`` objects with ``source``, ``topic`` and ``url``
            set. Only items whose link starts with a
            ``https://www.zhihu.com/question/<digits>`` address are kept, and
            the url is cut down to that prefix. Failures are logged and the
            topics gathered so far are returned.
        """
        topics_result = []
        try:
            self.tab = self.browser.new_tab()
            # Open the home page and wait for the recommended cards.
            self.tab.get('https://www.zhihu.com')
            self.tab.wait.ele_displayed('.Card TopstoryItem TopstoryItem-isRecommend')

            hot_items = self.tab.ele('.Topstory-content').eles('.Card TopstoryItem TopstoryItem-isRecommend')

            for item in hot_items:
                try:
                    topic = THotTopic()
                    topic.source = '知乎'
                    # Title and link live in the anchor inside the <h2>.
                    link = item.ele('tag:h2').ele('tag:a')
                    topic.topic = link.text
                    topic.url = link.link
                    question_url = re.match(r'^https://www\.zhihu\.com/question/\d+', topic.url)
                    if not question_url:
                        # Not a question link; skip this item.
                        continue
                    topic.url = question_url.group(0)
                    topics_result.append(topic)
                except ElementNotFoundError as e:
                    logger.error(f"元素缺失:{str(e)}")
                except ValueError as e:
                    logger.error(f"热度值转换失败:{str(e)}")

        except ElementNotFoundError as e:
            logger.error(f"热榜容器元素未找到:{str(e)}")
        except Exception as e:
            logger.error(f"获取热榜数据异常:{str(e)}")
        finally:
            self._close_tab()
        return topics_result

    def get_content(self, topic: THotTopic, db):
        """Scrape a question page, update *topic* and collect its answers.

        Args:
            topic: Topic whose ``url`` is opened. Its count, keyword, date and
                description fields are filled from the page and it is passed
                to ``update_hot_topic``.
            db: Database session handed to ``update_hot_topic``.

        Returns:
            list: One ``THotContent`` per answer item that could be parsed,
            with ``topic_id``, ``url``, ``content_upvote_count``,
            ``content_comment_count`` and ``content`` set. Failures are
            logged and the answers gathered so far are returned.
        """
        contents_result = []
        try:
            self.tab = self.browser.new_tab()
            # Open the topic/question page.
            self.tab.get(topic.url)

            # Ten rounds of: wait for answers, scroll to the bottom, then
            # nudge back up, so that more answers get loaded.
            for _ in range(10):
                self.tab.wait.ele_displayed('.List-item')
                self.tab.wait(3)
                self.tab.scroll.to_bottom()
                self.tab.wait(1)
                self.tab.scroll.up(100)

            # Question attributes are read from the first nine <meta> tags
            # inside the QuestionPage container.
            question_page = self.tab.ele('.QuestionPage')
            metas = question_page.eles('tag:meta')[0:9]
            answer_count = get_content_from_meta(metas, 'answerCount')
            comment_count = get_content_from_meta(metas, 'commentCount')
            keywords = get_content_from_meta(metas, 'keywords')
            date_created = get_content_from_meta(metas, 'dateCreated')
            date_modified = get_content_from_meta(metas, 'dateModified')
            follower_count = get_content_from_meta(metas, 'zhihu:followerCount')

            topic.content_count = int(answer_count)
            topic.comment_count = int(comment_count)
            topic.follower_count = int(follower_count)
            topic.keywords = keywords
            topic.date_created = date_created
            topic.date_modified = date_modified
            # The description is optional; a missing element is only logged.
            try:
                topic.topic_description = question_page.ele('.RichText ztext css-ob6uua').text
            except ElementNotFoundError:
                logger.error(f"元素缺失:不存在topic_description")
            update_hot_topic(db, topic)

            # Walk over every answer item in the main column.
            content_items = self.tab.ele('.Question-mainColumn').eles('.List-item')

            for item in content_items:
                try:
                    content = THotContent()
                    content.topic_id = topic.id
                    content.url = item.ele('.ContentItem-time').ele('tag:a').link
                    upvote_str = item.ele('.Button VoteButton VoteButton--up FEfUrdfMIKpQDJDqkjte').text
                    content.content_upvote_count = self._parse_upvote_count(upvote_str)
                    comment_str = item.ele('.Button ContentItem-action FEfUrdfMIKpQDJDqkjte Button--plain Button--withIcon Button--withLabel fEPKGkUK5jyc4fUuT0QP B46v1Ak6Gj5sL2JTS4PY RuuQ6TOh2cRzJr6WlyQp').text
                    content.content_comment_count = self._parse_comment_count(comment_str)
                    content.content = item.ele('.RichContent-inner').text
                    contents_result.append(content)
                except ElementNotFoundError as e:
                    logger.error(f"元素缺失:{str(e)}")
                except ValueError as e:
                    logger.error(f"热度值转换失败:{str(e)}")

        except ElementNotFoundError as e:
            logger.error(f"热榜容器元素未找到:{str(e)}")
        except Exception as e:
            logger.error(f"获取热榜数据异常:{str(e)}")
        finally:
            self._close_tab()
        return contents_result
|
||||
|
||||
|
||||
|
||||
def get_topics() -> list:
    """Create a fresh ``Zhihu`` scraper and return the result of its
    ``get_topics()`` call."""
    return Zhihu().get_topics()
|
||||
|
||||
def gather_task():
    """Task entry point: scrape topics, store them, then scrape and store the
    answers of every topic returned by ``create_topics_if_url_not_exists``."""
    with get_session() as db:
        scraper = Zhihu()
        scraped_topics = scraper.get_topics()
        stored_topics = create_topics_if_url_not_exists(db, scraped_topics)
        for stored_topic in stored_topics:
            logger.info(f"采集到话题:{stored_topic}")
            answers = scraper.get_content(stored_topic, db)
            create_contents_top3_if_url_not_exists(db, answers)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Manual smoke test: run the whole gathering task once, framed by log lines.
    logger.info('知乎采集测试')
    gather_task()
    logger.info('测试完成')
|
||||
|
||||
156
seek/zhihu_com/zhihu_hot.py
Normal file
156
seek/zhihu_com/zhihu_hot.py
Normal file
@ -0,0 +1,156 @@
|
||||
from DrissionPage import Chromium
|
||||
from DrissionPage import ChromiumOptions
|
||||
from DrissionPage.errors import ElementNotFoundError
|
||||
|
||||
from log.log_manager import logger
|
||||
|
||||
|
||||
def get_content_from_meta(metas, itemprop):
    """Fetch the ``content`` of the meta element tagged with *itemprop*.

    Args:
        metas: Iterable of elements exposing an ``attr(name)`` method.
        itemprop: The ``itemprop`` value to match.

    Returns:
        The ``content`` attribute of the last matching element, or ``None``
        when nothing matches.
    """
    result = None
    candidates = (node for node in metas if node.attr('itemprop') == itemprop)
    for node in candidates:
        # Keep overwriting so the last match is the one returned.
        result = node.attr('content')
    return result
|
||||
|
||||
|
||||
class ZhihuHot:
    """Scrape the Zhihu hot list and individual question pages through a
    DrissionPage Chromium browser."""

    def __init__(self):
        """Create the browser handle; every scraping call opens its own tab."""
        self.browser = Chromium()

    def get_topic_url_list(self) -> list:
        """Return the link of every entry on the Zhihu hot list page.

        Returns:
            list: Links taken from the first anchor of each hot list entry.
            Failures are logged and the links gathered so far are returned.
        """
        _topic_url_list = []
        _tab = None
        try:
            _tab = self.browser.new_tab()
            # Open the hot list page and wait for its entries to show up.
            _tab.get('https://www.zhihu.com/hot')
            _tab.wait.ele_displayed('.HotItem')

            hot_items = _tab.ele('.HotList-list').eles('.HotItem')

            for item in hot_items:
                try:
                    # Only the link of the entry's first anchor is needed.
                    url = item.ele('tag:a').link
                    _topic_url_list.append(url)
                except ElementNotFoundError as e:
                    logger.error(f"元素缺失:{str(e)}")
                except ValueError as e:
                    logger.error(f"热度值转换失败:{str(e)}")

        except ElementNotFoundError as e:
            logger.error(f"热榜容器元素未找到:{str(e)}")
        except Exception as e:
            logger.error(f"获取热榜数据异常:{str(e)}")
        finally:
            if _tab:
                _tab.close()

        return _topic_url_list

    def get_content(self, url):
        """Collect question metadata and a sample of answers from a question page.

        Args:
            url: Address of the question page to open.

        Returns:
            dict: Keys ``title``, ``topic_description``, ``keywords``,
            ``url``, ``contents``, ``date_created``, ``date_modified``,
            ``follower_count``, ``answer_count`` and ``comment_count``.
            Metadata that could not be read is ``None``;
            ``topic_description`` defaults to an empty string and
            ``contents`` to an empty list.
        """
        # Plain locals with defaults. The previous ``global`` declaration made
        # the final return raise NameError (or leak values from an earlier
        # call) whenever scraping failed before the assignments were reached.
        title = keywords = date_created = date_modified = None
        follower_count = comment_count = answer_count = None
        topic_description = ""
        contents_result = []
        _tab = None
        try:
            _tab = self.browser.new_tab()
            # Open the topic/question page.
            _tab.get(url)

            # Ten rounds of: wait for answers, scroll to the bottom, then
            # nudge back up, so that more answers get loaded.
            for _ in range(10):
                _tab.wait.ele_displayed('.List-item')
                _tab.wait(3)
                _tab.scroll.to_bottom()
                _tab.wait(1)
                _tab.scroll.up(100)

            # Question attributes are read from the first nine <meta> tags
            # inside the QuestionPage container.
            question_page = _tab.ele('.QuestionPage')
            metas = question_page.eles('tag:meta')[0:9]
            title = get_content_from_meta(metas, 'name')
            answer_count = get_content_from_meta(metas, 'answerCount')
            comment_count = get_content_from_meta(metas, 'commentCount')
            keywords = get_content_from_meta(metas, 'keywords')
            date_created = get_content_from_meta(metas, 'dateCreated')
            date_modified = get_content_from_meta(metas, 'dateModified')
            follower_count = get_content_from_meta(metas, 'zhihu:followerCount')

            try:
                # Click the "more" button first, when present, so that the
                # description text is read after unfolding.
                unfold_topic_description = question_page.ele('.^Button QuestionRichText-more')
                if unfold_topic_description:
                    unfold_topic_description.click()
                topic_description = question_page.ele('.^QuestionRichText').text
            except ElementNotFoundError:
                logger.error(f"元素缺失:不存在topic_description")

            # Walk over every answer item in the main column.
            content_items = _tab.ele('.Question-mainColumn').eles('.List-item')

            total_characters = 0
            for item in content_items:
                try:
                    content = item.ele('.RichContent-inner').text
                    # Character count of this answer.
                    content_len = len(content)
                    print(content_len)
                    # Keep only answers between 100 and 1000 characters.
                    if content_len > 1000 or content_len < 100:
                        logger.error(f"skip本条内容,内容长度:{content_len}")
                        continue
                    # Stop once more than 5000 characters have been collected.
                    if total_characters > 5000:
                        logger.error(f"contents_result长度超过5000,跳出循环")
                        break
                    total_characters += content_len
                    contents_result.append(content)
                    logger.info(f"contents_result长度:{len(contents_result)}")
                except ElementNotFoundError as e:
                    logger.error(f"元素缺失:{str(e)}")
                except ValueError as e:
                    logger.error(f"热度值转换失败:{str(e)}")

        except ElementNotFoundError as e:
            logger.error(f"热榜容器元素未找到:{str(e)}")
        except Exception as e:
            logger.error(f"获取热榜数据异常:{str(e)}")
        finally:
            if _tab:
                _tab.close()
        return {
            'title': title,
            'topic_description': topic_description,
            'keywords': keywords,
            'url': url,
            'contents': contents_result,
            'date_created': date_created,
            'date_modified': date_modified,
            'follower_count': follower_count,
            'answer_count': answer_count,
            'comment_count': comment_count
        }
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Manual smoke test: scrape one question page and print the result dict.
    logger.info('知乎采集测试')
    scraper = ZhihuHot()
    question_data = scraper.get_content('https://www.zhihu.com/question/14351228309')
    print(len(question_data))
    print(question_data)
    # The hot list crawl is currently disabled:
    # topic_url_list = zhihu_hot.get_topic_url_list()
    # print(topic_url_list)
    logger.info('测试完成')
|
||||
|
||||
Reference in New Issue
Block a user