refactor(web_parser): 优化URL提取和抖音解析器逻辑

重构URL提取逻辑:合并所有文本段以处理被分割的链接,并清理链接末尾的标点
简化抖音解析器实现:移除冗余的请求头,并改进重定向后URL的校验
删除未使用的鸭子示例代码文件
This commit is contained in:
2026-01-23 09:51:28 +08:00
parent bbfc17d6e6
commit 489dd8c77d
3 changed files with 26 additions and 70 deletions

View File

@@ -1,32 +0,0 @@
class 真鸭子:
def (self):
print("嘎嘎嘎")
def (self):
print("鸭子摇摇摆摆跑")
class 玩具鸭子:
def (self):
print("玩具鸭发出嘎嘎声")
def (self):
print("玩具鸭轮子咕噜噜跑")
class 小猫:
def (self):
print("喵喵喵")
def (self):
print("猫咪跑跑")
def 逗鸭子(鸭子一样的东西):
鸭子一样的东西.()
鸭子一样的东西.()
逗鸭子(真鸭子())
逗鸭子(玩具鸭子())
逗鸭子(小猫())
鸭子 = 1

View File

@@ -117,7 +117,7 @@ class BaseParser(metaclass=abc.ABCMeta):
def extract_url_from_text_segments(self, segments):
"""
从消息的文本段中提取URL
从消息的文本段中提取URL,会合并所有文本段来处理被分割的链接。
Args:
segments: 消息段列表
@@ -125,14 +125,19 @@ class BaseParser(metaclass=abc.ABCMeta):
Returns:
Optional[str]: 提取到的URL或None
"""
for segment in segments:
if segment.type == "text":
text_content = segment.data.get("text", "")
match = self.url_pattern.search(text_content)
# 1. 拼接所有文本段内容,保留空格
full_text = "".join([segment.data.get("text", "") for segment in segments if segment.type == "text"])
# 2. 使用解析器自身的url_pattern进行匹配,通常匹配到第一个空格为止
match = self.url_pattern.search(full_text)
if match:
extracted_url = match.group(0)
logger.success(f"[{self.name}] 成功从文本中提取到链接: {extracted_url}")
# 清理一下链接末尾可能误包含的标点符号
extracted_url = re.sub(r'[,.!?]$', '', extracted_url)
logger.success(f"[{self.name}] 成功从合并后的文本中提取到链接: {extracted_url}")
return extracted_url
return None
async def process_url(self, event: MessageEvent, url: str):

View File

@@ -18,8 +18,8 @@ class DouyinParser(BaseParser):
def __init__(self):
super().__init__()
self.name = "抖音解析器"
self.url_pattern = re.compile(r"https?://v\.douyin\.com/[a-zA-Z0-9_]+/?", re.IGNORECASE)
self.short_pattern = re.compile(r"(?:https?://)?v\.douyin\.com/[a-zA-Z0-9_]+/?", re.IGNORECASE)
self.url_pattern = re.compile(r"https?://v\.douyin\.com/[a-zA-Z0-9_-]+/?", re.IGNORECASE)
self.short_pattern = re.compile(r"(?:https?://)?v\.douyin\.com/[a-zA-Z0-9_-]+/?", re.IGNORECASE)
self.nickname = "抖音视频解析"
# 消息去重缓存
self.processed_messages: TTLCache[int, bool] = TTLCache(maxsize=100, ttl=10)
@@ -94,34 +94,17 @@ class DouyinParser(BaseParser):
Optional[str]: 真实URL,如果失败则返回None
"""
try:
# 首先尝试获取重定向后的URL
async with aiohttp.ClientSession() as session:
# 添加更多头部信息模拟移动端访问
mobile_headers = self.HEADERS.copy()
mobile_headers.update({
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Cache-Control': 'max-age=0',
# 模拟移动设备的额外头部
'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1',
'X-Requested-With': 'XMLHttpRequest',
'Referer': 'https://www.douyin.com/'
})
async with session.get(short_url, headers=mobile_headers, allow_redirects=True, timeout=aiohttp.ClientTimeout(total=10)) as response:
session = self.get_session()
async with session.get(short_url, allow_redirects=True, timeout=aiohttp.ClientTimeout(total=10)) as response:
redirected_url = str(response.url)
# 检查重定向后的URL是否包含视频ID
if 'video/' in redirected_url or '/note/' in redirected_url:
logger.info(f"[{self.name}] 重定向后的视频URL: {redirected_url}")
return redirected_url
elif 'share_item' in redirected_url:
logger.info(f"[{self.name}] 重定向后的分享URL: {redirected_url}")
# 检查重定向后的URL是否是有效的视频或图文页
if 'douyin.com/video/' in redirected_url or 'douyin.com/note/' in redirected_url:
logger.info(f"[{self.name}] 成功获取真实URL: {redirected_url}")
return redirected_url
else:
logger.warning(f"[{self.name}] 重定向到了非预期页面: {redirected_url}")
return redirected_url
logger.warning(f"[{self.name}] 短链接 {short_url} 重定向到了非预期页面: {redirected_url}")
return None
except Exception as e:
logger.error(f"[{self.name}] 获取真实URL失败: {e}")