refactor(web_parser): 优化URL提取和抖音解析器逻辑
重构URL提取逻辑:合并所有文本段以处理被分割的链接,并清理链接末尾的标点。简化抖音解析器实现:移除冗余头部信息并改进URL验证。删除未使用的鸭子示例代码文件。
This commit is contained in:
@@ -1,32 +0,0 @@
|
|||||||
|
|
||||||
class 真鸭子:
|
|
||||||
def 叫(self):
|
|
||||||
print("嘎嘎嘎")
|
|
||||||
|
|
||||||
def 跑(self):
|
|
||||||
print("鸭子摇摇摆摆跑")
|
|
||||||
|
|
||||||
class 玩具鸭子:
|
|
||||||
def 叫(self):
|
|
||||||
print("玩具鸭发出嘎嘎声")
|
|
||||||
|
|
||||||
def 跑(self):
|
|
||||||
print("玩具鸭轮子咕噜噜跑")
|
|
||||||
|
|
||||||
class 小猫:
|
|
||||||
def 叫(self):
|
|
||||||
print("喵喵喵")
|
|
||||||
def 跑(self):
|
|
||||||
print("猫咪跑跑")
|
|
||||||
|
|
||||||
def 逗鸭子(鸭子一样的东西):
|
|
||||||
鸭子一样的东西.叫()
|
|
||||||
鸭子一样的东西.跑()
|
|
||||||
|
|
||||||
逗鸭子(真鸭子())
|
|
||||||
|
|
||||||
逗鸭子(玩具鸭子())
|
|
||||||
|
|
||||||
逗鸭子(小猫())
|
|
||||||
|
|
||||||
鸭子 = 1
|
|
||||||
@@ -117,7 +117,7 @@ class BaseParser(metaclass=abc.ABCMeta):
|
|||||||
|
|
||||||
def extract_url_from_text_segments(self, segments):
|
def extract_url_from_text_segments(self, segments):
|
||||||
"""
|
"""
|
||||||
从消息的文本段中提取URL
|
从消息的文本段中提取URL,会合并所有文本段来处理被分割的链接。
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
segments: 消息段列表
|
segments: 消息段列表
|
||||||
@@ -125,14 +125,19 @@ class BaseParser(metaclass=abc.ABCMeta):
|
|||||||
Returns:
|
Returns:
|
||||||
Optional[str]: 提取到的URL或None
|
Optional[str]: 提取到的URL或None
|
||||||
"""
|
"""
|
||||||
for segment in segments:
|
# 1. 拼接所有文本段内容,保留空格
|
||||||
if segment.type == "text":
|
full_text = "".join([segment.data.get("text", "") for segment in segments if segment.type == "text"])
|
||||||
text_content = segment.data.get("text", "")
|
|
||||||
match = self.url_pattern.search(text_content)
|
# 2. 使用解析器自身的url_pattern进行匹配,通常是匹配到第一个空格为止
|
||||||
|
match = self.url_pattern.search(full_text)
|
||||||
|
|
||||||
if match:
|
if match:
|
||||||
extracted_url = match.group(0)
|
extracted_url = match.group(0)
|
||||||
logger.success(f"[{self.name}] 成功从文本中提取到链接: {extracted_url}")
|
# 清理一下链接末尾可能误包含的标点符号
|
||||||
|
extracted_url = re.sub(r'[,.!?]$', '', extracted_url)
|
||||||
|
logger.success(f"[{self.name}] 成功从合并后的文本中提取到链接: {extracted_url}")
|
||||||
return extracted_url
|
return extracted_url
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
async def process_url(self, event: MessageEvent, url: str):
|
async def process_url(self, event: MessageEvent, url: str):
|
||||||
|
|||||||
@@ -18,8 +18,8 @@ class DouyinParser(BaseParser):
|
|||||||
def __init__(self):
|
def __init__(self):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.name = "抖音解析器"
|
self.name = "抖音解析器"
|
||||||
self.url_pattern = re.compile(r"https?://v\.douyin\.com/[a-zA-Z0-9_]+/?", re.IGNORECASE)
|
self.url_pattern = re.compile(r"https?://v\.douyin\.com/[a-zA-Z0-9_-]+/?", re.IGNORECASE)
|
||||||
self.short_pattern = re.compile(r"(?:https?://)?v\.douyin\.com/[a-zA-Z0-9_]+/?", re.IGNORECASE)
|
self.short_pattern = re.compile(r"(?:https?://)?v\.douyin\.com/[a-zA-Z0-9_-]+/?", re.IGNORECASE)
|
||||||
self.nickname = "抖音视频解析"
|
self.nickname = "抖音视频解析"
|
||||||
# 消息去重缓存
|
# 消息去重缓存
|
||||||
self.processed_messages: TTLCache[int, bool] = TTLCache(maxsize=100, ttl=10)
|
self.processed_messages: TTLCache[int, bool] = TTLCache(maxsize=100, ttl=10)
|
||||||
@@ -94,34 +94,17 @@ class DouyinParser(BaseParser):
|
|||||||
Optional[str]: 真实URL,如果失败则返回None
|
Optional[str]: 真实URL,如果失败则返回None
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
# 首先尝试获取重定向后的URL
|
session = self.get_session()
|
||||||
async with aiohttp.ClientSession() as session:
|
async with session.get(short_url, allow_redirects=True, timeout=aiohttp.ClientTimeout(total=10)) as response:
|
||||||
# 添加更多头部信息模拟移动端访问
|
|
||||||
mobile_headers = self.HEADERS.copy()
|
|
||||||
mobile_headers.update({
|
|
||||||
'Sec-Fetch-Dest': 'document',
|
|
||||||
'Sec-Fetch-Mode': 'navigate',
|
|
||||||
'Sec-Fetch-Site': 'none',
|
|
||||||
'Cache-Control': 'max-age=0',
|
|
||||||
# 模拟移动设备的额外头部
|
|
||||||
'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1',
|
|
||||||
'X-Requested-With': 'XMLHttpRequest',
|
|
||||||
'Referer': 'https://www.douyin.com/'
|
|
||||||
})
|
|
||||||
|
|
||||||
async with session.get(short_url, headers=mobile_headers, allow_redirects=True, timeout=aiohttp.ClientTimeout(total=10)) as response:
|
|
||||||
redirected_url = str(response.url)
|
redirected_url = str(response.url)
|
||||||
|
|
||||||
# 检查重定向后的URL是否包含视频ID
|
# 检查重定向后的URL是否是有效的视频或图文页
|
||||||
if 'video/' in redirected_url or '/note/' in redirected_url:
|
if 'douyin.com/video/' in redirected_url or 'douyin.com/note/' in redirected_url:
|
||||||
logger.info(f"[{self.name}] 重定向后的视频URL: {redirected_url}")
|
logger.info(f"[{self.name}] 成功获取真实URL: {redirected_url}")
|
||||||
return redirected_url
|
|
||||||
elif 'share_item' in redirected_url:
|
|
||||||
logger.info(f"[{self.name}] 重定向后的分享URL: {redirected_url}")
|
|
||||||
return redirected_url
|
return redirected_url
|
||||||
else:
|
else:
|
||||||
logger.warning(f"[{self.name}] 重定向到了非预期页面: {redirected_url}")
|
logger.warning(f"[{self.name}] 短链接 {short_url} 重定向到了非预期的页面: {redirected_url}")
|
||||||
return redirected_url
|
return None
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"[{self.name}] 获取真实URL失败: {e}")
|
logger.error(f"[{self.name}] 获取真实URL失败: {e}")
|
||||||
|
|||||||
Reference in New Issue
Block a user