import json
from time import sleep

from DrissionPage import Chromium, ChromiumOptions

from config.database import SessionLocal
from models.source_content import SourceContent
from utils import logger


class DoubanGroupSeek:
    """Crawl the topic list of one Douban group and store new topics.

    A browser is attached through DrissionPage on a fixed local port; each
    call to :meth:`seek` opens a tab, scrapes the topics that are not yet
    stored, and writes them to the database as ``SourceContent`` rows.
    """

    # Local port handed to ChromiumOptions.set_local_port().
    BROWSER_PORT = 9333
    # How many of the most recent stored rows are used for de-duplication.
    RECENT_LIMIT = 100
    # Fixed delay (seconds) after opening a topic page before parsing it;
    # tune to the real page load time.
    PAGE_WAIT_SECONDS = 30

    def __init__(self, group_id):
        """Attach to the browser and remember which group to crawl.

        Args:
            group_id: Identifier interpolated into the group URL
                ``https://www.douban.com/group/{group_id}``.
        """
        co = ChromiumOptions().set_local_port(self.BROWSER_PORT)
        self.browser = Chromium(addr_or_opts=co)
        self.group_id = group_id

    def seek(self):
        """Crawl topics of the group that are not stored yet and persist them.

        Topics whose URL matches the ``link`` of one of the most recent
        ``RECENT_LIMIT`` stored rows are skipped. A topic page that fails to
        parse is logged and skipped; every successfully parsed topic is
        added to the session and committed in a single transaction.

        The browser tab and the database session are always released, even
        when loading a page or committing raises; such exceptions still
        propagate to the caller.
        """
        db = SessionLocal()
        try:
            # Only the latest rows are loaded: they act as a cheap filter
            # against re-crawling and re-storing the same URLs.
            recent_contents = (
                db.query(SourceContent)
                .order_by(SourceContent.id.desc())
                .limit(self.RECENT_LIMIT)
                .all()
            )
            existing_urls = {content.link for content in recent_contents}

            tab = self.browser.new_tab()
            try:
                topics = [
                    (topic_title, topic_url)
                    for topic_title, topic_url in self._list_topics(tab)
                    if topic_url not in existing_urls
                ]

                # Log the topics that are about to be crawled.
                logger.info(f"Found {len(topics)} new topics to crawl:")
                for topic_title, topic_url in topics:
                    logger.info(f"标题:{topic_title} 链接:{topic_url}\n")

                results = []
                for topic_title, topic_url in topics:
                    logger.info(f"fetch 标题:{topic_title} 链接:{topic_url}\n")
                    tab.get(topic_url)
                    tab.wait(self.PAGE_WAIT_SECONDS)
                    try:
                        payload = self._parse_topic(tab, topic_title, topic_url)
                    except Exception as e:
                        # Best effort: one broken page must not abort the run.
                        logger.error(f"Error processing topic {topic_title}:{topic_url}: {str(e)}")
                        continue
                    results.append((topic_url, payload))

                # Persist everything that was parsed, in one commit.
                for topic_url, data in results:
                    db.add(SourceContent(
                        link=topic_url,
                        platform='douban',
                        content=data,
                    ))
                db.commit()
            finally:
                tab.close()
        finally:
            # NOTE(review): assumes SessionLocal yields a SQLAlchemy-style
            # session whose close() releases the connection - confirm.
            db.close()

    def _list_topics(self, tab):
        """Load the group page in *tab* and return ``(title, url)`` pairs."""
        tab.get(f'https://www.douban.com/group/{self.group_id}')
        ele_table = tab.ele('tag:table@class=olt')
        topics = []
        # Rows carrying class "th" are excluded by the selector.
        for ele_tr in ele_table.eles('tag:tr@!class=th'):
            ele_link = ele_tr.ele('tag:a')
            topics.append((ele_link.text, ele_link.attr('href')))
        return topics

    def _parse_topic(self, tab, topic_title, topic_url):
        """Serialize the topic page currently shown in *tab* to a JSON string.

        ``topic_title`` and ``topic_url`` are only used for log messages.
        Any lookup failure on the main post propagates to the caller.
        """
        title = tab.title
        ele_article = tab.ele('.article')

        # Post body, creation time, IP location and author.
        ele_topic_doc = ele_article.ele('#topic-content').ele('.topic-doc')
        content = ele_topic_doc.ele('.topic-content').text
        post_time = ele_topic_doc.ele('.create-time').text
        ip_location = ele_topic_doc.ele('.ip-location').text
        author = ele_topic_doc.ele('.from').text

        comments = self._parse_comments(ele_article, topic_title, topic_url)

        return json.dumps({
            "title": title,
            "content": content,
            "post_time": post_time,
            "ip_location": ip_location,
            "author": author,
            "comments": comments
        }, ensure_ascii=False)

    def _parse_comments(self, ele_article, topic_title, topic_url):
        """Return the comments found under *ele_article* as a list of dicts.

        Comments may be absent. Any failure is logged as a warning and the
        comments collected up to that point are returned.
        """
        comments = []
        try:
            ele_comments = ele_article.ele('#comments')
            for ele_comment in ele_comments.eles('tag:li'):
                comment_content = ele_comment.ele('.reply-content').text
                comment_time = ele_comment.ele('.pubtime').text
                comment_author = ele_comment.ele('tag:h4').child().text
                comments.append({
                    "comment_content": comment_content,
                    "comment_time": comment_time,
                    "comment_author": comment_author
                })
        except Exception as e:
            logger.warning(f"No comments found for topic {topic_title}:{topic_url}: {str(e)}")
        return comments