# peter/seek/zhihu_com/zhihu_hot.py

from DrissionPage import Chromium
from DrissionPage import ChromiumOptions
from DrissionPage.errors import ElementNotFoundError

from log.log_manager import logger


def get_content_from_meta(metas, itemprop):
    """Return the ``content`` attribute of the meta element tagged *itemprop*.

    Each element in *metas* is queried through its ``attr(name)`` method.
    When several elements carry the requested ``itemprop`` the last one
    wins; when none does, ``None`` is returned.
    """
    matching_contents = [
        element.attr('content')
        for element in metas
        if element.attr('itemprop') == itemprop
    ]
    if not matching_contents:
        return None
    return matching_contents[-1]


class ZhihuHot:
    """Scrape the Zhihu hot list and question pages through a DrissionPage browser.

    One browser object is created per instance. Every public method opens
    its own tab and closes it again before returning, including on failure.
    """

    # Number of wait/scroll passes used to make the question page load answers.
    SCROLL_ROUNDS = 10
    # Answers shorter than the minimum or longer than the maximum are skipped.
    MIN_ANSWER_LENGTH = 100
    MAX_ANSWER_LENGTH = 1000
    # Collection stops once the accumulated answer text exceeds this size.
    MAX_TOTAL_CHARACTERS = 5000

    # (result key, meta ``itemprop``) pairs read from the question page header.
    _META_FIELDS = (
        ('title', 'name'),
        ('answer_count', 'answerCount'),
        ('comment_count', 'commentCount'),
        ('keywords', 'keywords'),
        ('date_created', 'dateCreated'),
        ('date_modified', 'dateModified'),
        ('follower_count', 'zhihu:followerCount'),
    )

    def __init__(self):
        """Attach to (or start) the browser using DrissionPage's default options."""
        # The original built a ChromiumOptions object but never used it; to
        # customise the browser, pass a ChromiumOptions instance to Chromium().
        self.browser = Chromium()

    def get_topic_url_list(self) -> list:
        """Collect the link of every entry on the Zhihu hot list.

        Returns:
            A list with the ``link`` of the first ``<a>`` inside each
            ``.HotItem`` under ``.HotList-list``. Errors are logged rather
            than raised, so the list may be partial or empty.
        """
        _topic_url_list = []
        _tab = None
        try:
            _tab = self.browser.new_tab()
            _tab.get('https://www.zhihu.com/hot')

            # Wait until at least one hot-list entry is displayed.
            _tab.wait.ele_displayed('.HotItem')

            hot_items = _tab.ele('.HotList-list').eles('.HotItem')

            for item in hot_items:
                # A broken entry is logged and skipped so the rest still load.
                try:
                    url = item.ele('tag:a').link
                    _topic_url_list.append(url)
                except ElementNotFoundError as e:
                    logger.error(f"元素缺失：{str(e)}")
                except ValueError as e:
                    logger.error(f"热度值转换失败：{str(e)}")

        except ElementNotFoundError as e:
            logger.error(f"热榜容器元素未找到：{str(e)}")
        except Exception as e:
            # Top-level boundary: scraping is best-effort, so log and return
            # whatever was collected so far.
            logger.error(f"获取热榜数据异常：{str(e)}")
        finally:
            if _tab:
                _tab.close()

        return _topic_url_list

    def get_content(self, url):
        """Scrape one question page: its metadata and a sample of its answers.

        Args:
            url: Address of the question page to open.

        Returns:
            A dict with the keys ``title``, ``topic_description``,
            ``keywords``, ``url``, ``contents``, ``date_created``,
            ``date_modified``, ``follower_count``, ``answer_count`` and
            ``comment_count``. Errors are logged rather than raised; fields
            that could not be read keep their defaults (``None``, an empty
            string for ``topic_description``, an empty list for
            ``contents``).
        """
        # Every field starts with a local default. The original declared them
        # ``global``, so a failure before assignment either raised NameError
        # at the return or leaked values from a previously scraped question.
        result = {
            'title': None,
            'topic_description': "",
            'keywords': None,
            'url': url,
            'contents': [],
            'date_created': None,
            'date_modified': None,
            'follower_count': None,
            'answer_count': None,
            'comment_count': None,
        }
        contents_result = result['contents']
        _tab = None
        try:
            _tab = self.browser.new_tab()
            _tab.get(url)

            # Repeatedly scroll to the bottom so more answers get loaded; the
            # small scroll back up after each pass is kept from the original.
            for _ in range(self.SCROLL_ROUNDS):
                _tab.wait.ele_displayed('.List-item')
                _tab.wait(3)
                _tab.scroll.to_bottom()
                _tab.wait(1)
                _tab.scroll.up(100)

            # Question attributes are read from the first nine <meta> tags
            # under .QuestionPage.
            question_page = _tab.ele('.QuestionPage')
            metas = question_page.eles('tag:meta')[0:9]
            for key, itemprop in self._META_FIELDS:
                result[key] = get_content_from_meta(metas, itemprop)

            try:
                # Expand the description first when a "show more" button exists.
                unfold_topic_description = question_page.ele('.^Button QuestionRichText-more')
                if unfold_topic_description:
                    unfold_topic_description.click()
                result['topic_description'] = question_page.ele('.^QuestionRichText').text
            except ElementNotFoundError:
                logger.error("元素缺失：不存在topic_description")

            content_items = _tab.ele('.Question-mainColumn').eles('.List-item')

            total_characters = 0
            for item in content_items:
                try:
                    content = item.ele('.RichContent-inner').text
                    content_len = len(content)
                    print(content_len)
                    if content_len > self.MAX_ANSWER_LENGTH or content_len < self.MIN_ANSWER_LENGTH:
                        logger.error(f"skip本条内容，内容长度：{content_len}")
                        continue
                    # Checked before adding, so the total may overshoot the
                    # limit by at most one answer.
                    if total_characters > self.MAX_TOTAL_CHARACTERS:
                        logger.error(f"contents_result长度超过{self.MAX_TOTAL_CHARACTERS}，跳出循环")
                        break
                    total_characters += content_len
                    contents_result.append(content)
                    logger.info(f"contents_result长度：{len(contents_result)}")
                except ElementNotFoundError as e:
                    logger.error(f"元素缺失：{str(e)}")
                except ValueError as e:
                    # Message corrected: no hot-list heat value is parsed here.
                    logger.error(f"内容解析失败：{str(e)}")

        except ElementNotFoundError as e:
            # Messages corrected: this method reads a question page, not the
            # hot list.
            logger.error(f"话题页面元素未找到：{str(e)}")
        except Exception as e:
            # Top-level boundary: log and return whatever was collected.
            logger.error(f"获取话题内容异常：{str(e)}")
        finally:
            if _tab:
                _tab.close()
        return result


if __name__ == '__main__':
    # Manual smoke test: scrape a single question page and dump the result.
    logger.info('知乎采集测试')
    collector = ZhihuHot()
    question = collector.get_content('https://www.zhihu.com/question/14351228309')
    print(len(question))
    print(question)
    # To exercise the hot-list path instead, call collector.get_topic_url_list().
    logger.info('测试完成')