seek: douban group
All checks were successful
Gitea Actions Demo / deploy (push) Successful in 33s

This commit is contained in:
konjacpotato
2026-02-15 12:37:48 +08:00
parent 72b117b57c
commit 51d1c403f5
10 changed files with 131 additions and 18 deletions

View File

@ -0,0 +1,98 @@
import json
from time import sleep
from DrissionPage import Chromium, ChromiumOptions
from config.database import SessionLocal
from models.source_content import SourceContent
from utils import logger
class DoubanGroupSeek:
    """Crawl the topic list of one Douban group and store unseen topics.

    A Chromium instance is driven through DrissionPage. ``seek`` opens the
    group page, collects the topic links from the listing table, skips links
    that already appear among the most recently stored ``SourceContent`` rows,
    visits every remaining topic and stores one JSON document per topic.
    """

    # Number of most recent SourceContent rows consulted for de-duplication.
    RECENT_LOOKBACK = 100

    def __init__(self, group_id, *, local_port=9333, page_wait_seconds=30):
        """Create the browser handle used for crawling.

        Args:
            group_id: Identifier placed in the ``/group/<id>`` URL.
            local_port: Local port handed to ``ChromiumOptions.set_local_port``.
            page_wait_seconds: Fixed pause after opening each topic page.
        """
        options = ChromiumOptions().set_local_port(local_port)
        self.browser = Chromium(addr_or_opts=options)
        self.group_id = group_id
        self.page_wait_seconds = page_wait_seconds

    def seek(self):
        """Crawl new topics of the group and persist them.

        Topics whose page cannot be parsed are logged and skipped. All rows
        are added and committed in one transaction after crawling finishes.
        The browser tab and the database session are released even when an
        error interrupts the run.
        """
        db = SessionLocal()
        try:
            # Only the newest rows are checked, so a link older than the
            # look-back window can be crawled again.
            recent_contents = (
                db.query(SourceContent)
                .order_by(SourceContent.id.desc())
                .limit(self.RECENT_LOOKBACK)
                .all()
            )
            existing_urls = {row.link for row in recent_contents}

            tab = self.browser.new_tab()
            try:
                topics = self._select_new_topics(
                    self._list_topics(tab), existing_urls
                )

                # Log the topics that are about to be crawled.
                logger.info(f"Found {len(topics)} new topics to crawl:")
                for topic_title, topic_url in topics:
                    logger.info(f"标题:{topic_title} 链接:{topic_url}\n")

                results = self._crawl_topics(tab, topics)
            finally:
                # Close the tab on every exit path, not only on success.
                tab.close()

            # Persist everything in a single transaction.
            for topic_url, data in results:
                db.add(SourceContent(
                    link=topic_url,
                    platform='douban',
                    content=data,
                ))
            db.commit()
        finally:
            # NOTE(review): assumes SessionLocal is a SQLAlchemy sessionmaker,
            # whose close() releases the connection and discards any
            # uncommitted work - confirm against config.database.
            db.close()

    def _list_topics(self, tab):
        """Open the group page and return ``(title, url)`` for each listed topic."""
        tab.get(f'https://www.douban.com/group/{self.group_id}')
        table = tab.ele('tag:table@class=olt')
        topics = []
        # Rows carrying class "th" are excluded by the locator (header rows).
        for row in table.eles('tag:tr@!class=th'):
            link = row.ele('tag:a')  # look the anchor up once per row
            topics.append((link.text, link.attr('href')))
        return topics

    @staticmethod
    def _select_new_topics(topics, existing_urls):
        """Return the topics whose URL is not in ``existing_urls``, order kept."""
        return [(title, url) for title, url in topics if url not in existing_urls]

    def _crawl_topics(self, tab, topics):
        """Visit every topic and return ``(url, json_text)`` for the parsed ones."""
        results = []
        for topic_title, topic_url in topics:
            logger.info(f"fetch 标题:{topic_title} 链接:{topic_url}\n")
            tab.get(topic_url)
            # Fixed pause so the page can finish loading; tune as needed.
            tab.wait(self.page_wait_seconds)
            try:
                results.append(
                    (topic_url, self._parse_topic(tab, topic_title, topic_url))
                )
            except Exception as e:
                # A single unparsable topic must not abort the whole crawl.
                logger.error(f"Error processing topic {topic_title}:{topic_url}: {str(e)}")
                continue
        return results

    def _parse_topic(self, tab, topic_title, topic_url):
        """Extract the currently loaded topic page into a JSON string."""
        title = tab.title
        article = tab.ele('.article')
        # Post body, creation time, IP location and author.
        topic_doc = article.ele('#topic-content').ele('.topic-doc')
        content = topic_doc.ele('.topic-content').text
        post_time = topic_doc.ele('.create-time').text
        ip_location = topic_doc.ele('.ip-location').text
        author = topic_doc.ele('.from').text
        comments = self._parse_comments(article, topic_title, topic_url)
        return self._build_payload(
            title, content, post_time, ip_location, author, comments
        )

    def _parse_comments(self, article, topic_title, topic_url):
        """Return the comment dicts found under ``#comments``.

        Comments are optional. Any lookup error is logged as a warning and the
        comments gathered up to that point are returned.
        """
        comments = []
        try:
            for item in article.ele('#comments').eles('tag:li'):
                comment_content = item.ele('.reply-content').text
                comment_time = item.ele('.pubtime').text
                comment_author = item.ele('tag:h4').child().text
                comments.append({
                    "comment_content": comment_content,
                    "comment_time": comment_time,
                    "comment_author": comment_author
                })
        except Exception as e:
            logger.warning(f"No comments found for topic {topic_title}:{topic_url}: {str(e)}")
        return comments

    @staticmethod
    def _build_payload(title, content, post_time, ip_location, author, comments):
        """Serialize one topic to JSON text, keeping non-ASCII characters as-is."""
        return json.dumps({
            "title": title,
            "content": content,
            "post_time": post_time,
            "ip_location": ip_location,
            "author": author,
            "comments": comments
        }, ensure_ascii=False)