refactor(web_parser): 优化URL提取和抖音解析器逻辑

重构URL提取逻辑，合并所有文本段以处理被分割的链接，并清理末尾标点；简化抖音解析器实现，移除冗余头部信息并改进URL验证；删除未使用的鸭子示例代码文件。
2026-01-23 09:51:28 +08:00
parent bbfc17d6e6
commit 489dd8c77d
3 changed files with 26 additions and 70 deletions
--- a/core/managers/1.py
+++ b/core/managers/1.py
@@ -1,32 +0,0 @@
-
-class 真鸭子:
-    def 叫(self):
-        print("嘎嘎嘎")
-    
-    def 跑(self):
-        print("鸭子摇摇摆摆跑")
-
-class 玩具鸭子:
-    def 叫(self):
-        print("玩具鸭发出嘎嘎声")
-    
-    def 跑(self):
-        print("玩具鸭轮子咕噜噜跑")
-
-class 小猫:
-    def 叫(self):
-        print("喵喵喵")
-    def 跑(self):
-        print("猫咪跑跑")
-
-def 逗鸭子(鸭子一样的东西):
-    鸭子一样的东西.叫()
-    鸭子一样的东西.跑()
-
-逗鸭子(真鸭子())
-
-逗鸭子(玩具鸭子())
-
-逗鸭子(小猫())
-
-鸭子 = 1
--- a/plugins/web_parser/base.py
+++ b/plugins/web_parser/base.py
@@ -117,7 +117,7 @@ class BaseParser(metaclass=abc.ABCMeta):
    
    def extract_url_from_text_segments(self, segments):
        """
-        从消息的文本段中提取URL
+        从消息的文本段中提取URL，会合并所有文本段来处理被分割的链接。
        
        Args:
            segments: 消息段列表
@@ -125,14 +125,19 @@ class BaseParser(metaclass=abc.ABCMeta):
        Returns:
            Optional[str]: 提取到的URL或None
        """
-        for segment in segments:
-            if segment.type == "text":
-                text_content = segment.data.get("text", "")
-                match = self.url_pattern.search(text_content)
+        # 1. 拼接所有文本段内容，保留空格
+        full_text = "".join([segment.data.get("text", "") for segment in segments if segment.type == "text"])
+        
+        # 2. 使用解析器自身的url_pattern进行匹配，通常是匹配到第一个空格为止
+        match = self.url_pattern.search(full_text)
+        
        if match:
            extracted_url = match.group(0)
-                    logger.success(f"[{self.name}] 成功从文本中提取到链接: {extracted_url}")
+            # 清理一下链接末尾可能误包含的标点符号
+            extracted_url = re.sub(r'[,.!?]$', '', extracted_url)
+            logger.success(f"[{self.name}] 成功从合并后的文本中提取到链接: {extracted_url}")
            return extracted_url
+            
        return None
    
    async def process_url(self, event: MessageEvent, url: str):
--- a/plugins/web_parser/parsers/douyin.py
+++ b/plugins/web_parser/parsers/douyin.py
@@ -18,8 +18,8 @@ class DouyinParser(BaseParser):
    def __init__(self):
        super().__init__()
        self.name = "抖音解析器"
-        self.url_pattern = re.compile(r"https?://v\.douyin\.com/[a-zA-Z0-9_]+/?", re.IGNORECASE)
-        self.short_pattern = re.compile(r"(?:https?://)?v\.douyin\.com/[a-zA-Z0-9_]+/?", re.IGNORECASE)
+        self.url_pattern = re.compile(r"https?://v\.douyin\.com/[a-zA-Z0-9_-]+/?", re.IGNORECASE)
+        self.short_pattern = re.compile(r"(?:https?://)?v\.douyin\.com/[a-zA-Z0-9_-]+/?", re.IGNORECASE)
        self.nickname = "抖音视频解析"
        # 消息去重缓存
        self.processed_messages: TTLCache[int, bool] = TTLCache(maxsize=100, ttl=10)
@@ -94,34 +94,17 @@ class DouyinParser(BaseParser):
            Optional[str]: 真实URL，如果失败则返回None
        """
        try:
-            # 首先尝试获取重定向后的URL
-            async with aiohttp.ClientSession() as session:
-                # 添加更多头部信息模拟移动端访问
-                mobile_headers = self.HEADERS.copy()
-                mobile_headers.update({
-                    'Sec-Fetch-Dest': 'document',
-                    'Sec-Fetch-Mode': 'navigate',
-                    'Sec-Fetch-Site': 'none',
-                    'Cache-Control': 'max-age=0',
-                    # 模拟移动设备的额外头部
-                    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1',
-                    'X-Requested-With': 'XMLHttpRequest',
-                    'Referer': 'https://www.douyin.com/'
-                })
-                
-                async with session.get(short_url, headers=mobile_headers, allow_redirects=True, timeout=aiohttp.ClientTimeout(total=10)) as response:
+            session = self.get_session()
+            async with session.get(short_url, allow_redirects=True, timeout=aiohttp.ClientTimeout(total=10)) as response:
                redirected_url = str(response.url)
                
-                    # 检查重定向后的URL是否包含视频ID
-                    if 'video/' in redirected_url or '/note/' in redirected_url:
-                        logger.info(f"[{self.name}] 重定向后的视频URL: {redirected_url}")
-                        return redirected_url
-                    elif 'share_item' in redirected_url:
-                        logger.info(f"[{self.name}] 重定向后的分享URL: {redirected_url}")
+                # 检查重定向后的URL是否是有效的视频或图文页
+                if 'douyin.com/video/' in redirected_url or 'douyin.com/note/' in redirected_url:
+                    logger.info(f"[{self.name}] 成功获取真实URL: {redirected_url}")
                    return redirected_url
                else:
-                        logger.warning(f"[{self.name}] 重定向到了非预期页面: {redirected_url}")
-                        return redirected_url
+                    logger.warning(f"[{self.name}] 短链接 {short_url} 重定向到了非预期的页面: {redirected_url}")
+                    return None
                        
        except Exception as e:
            logger.error(f"[{self.name}] 获取真实URL失败: {e}")