refactor(web_parser): 优化URL提取和抖音解析器逻辑
重构URL提取逻辑,合并所有文本段处理分割链接并清理末尾标点;简化抖音解析器实现,移除冗余头部信息并改进URL验证;删除未使用的鸭子示例代码文件。
This commit is contained in:
@@ -18,8 +18,8 @@ class DouyinParser(BaseParser):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.name = "抖音解析器"
|
||||
self.url_pattern = re.compile(r"https?://v\.douyin\.com/[a-zA-Z0-9_]+/?", re.IGNORECASE)
|
||||
self.short_pattern = re.compile(r"(?:https?://)?v\.douyin\.com/[a-zA-Z0-9_]+/?", re.IGNORECASE)
|
||||
self.url_pattern = re.compile(r"https?://v\.douyin\.com/[a-zA-Z0-9_-]+/?", re.IGNORECASE)
|
||||
self.short_pattern = re.compile(r"(?:https?://)?v\.douyin\.com/[a-zA-Z0-9_-]+/?", re.IGNORECASE)
|
||||
self.nickname = "抖音视频解析"
|
||||
# 消息去重缓存
|
||||
self.processed_messages: TTLCache[int, bool] = TTLCache(maxsize=100, ttl=10)
|
||||
@@ -94,34 +94,17 @@ class DouyinParser(BaseParser):
|
||||
Optional[str]: 真实URL,如果失败则返回None
|
||||
"""
|
||||
try:
|
||||
# 首先尝试获取重定向后的URL
|
||||
async with aiohttp.ClientSession() as session:
|
||||
# 添加更多头部信息模拟移动端访问
|
||||
mobile_headers = self.HEADERS.copy()
|
||||
mobile_headers.update({
|
||||
'Sec-Fetch-Dest': 'document',
|
||||
'Sec-Fetch-Mode': 'navigate',
|
||||
'Sec-Fetch-Site': 'none',
|
||||
'Cache-Control': 'max-age=0',
|
||||
# 模拟移动设备的额外头部
|
||||
'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1',
|
||||
'X-Requested-With': 'XMLHttpRequest',
|
||||
'Referer': 'https://www.douyin.com/'
|
||||
})
|
||||
session = self.get_session()
|
||||
async with session.get(short_url, allow_redirects=True, timeout=aiohttp.ClientTimeout(total=10)) as response:
|
||||
redirected_url = str(response.url)
|
||||
|
||||
async with session.get(short_url, headers=mobile_headers, allow_redirects=True, timeout=aiohttp.ClientTimeout(total=10)) as response:
|
||||
redirected_url = str(response.url)
|
||||
|
||||
# 检查重定向后的URL是否包含视频ID
|
||||
if 'video/' in redirected_url or '/note/' in redirected_url:
|
||||
logger.info(f"[{self.name}] 重定向后的视频URL: {redirected_url}")
|
||||
return redirected_url
|
||||
elif 'share_item' in redirected_url:
|
||||
logger.info(f"[{self.name}] 重定向后的分享URL: {redirected_url}")
|
||||
return redirected_url
|
||||
else:
|
||||
logger.warning(f"[{self.name}] 重定向到了非预期页面: {redirected_url}")
|
||||
return redirected_url
|
||||
# 检查重定向后的URL是否是有效的视频或图文页
|
||||
if 'douyin.com/video/' in redirected_url or 'douyin.com/note/' in redirected_url:
|
||||
logger.info(f"[{self.name}] 成功获取真实URL: {redirected_url}")
|
||||
return redirected_url
|
||||
else:
|
||||
logger.warning(f"[{self.name}] 短链接 {short_url} 重定向到了非预期的页面: {redirected_url}")
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"[{self.name}] 获取真实URL失败: {e}")
|
||||
|
||||
Reference in New Issue
Block a user