refactor(web_parser): 优化URL提取和抖音解析器逻辑
重构URL提取逻辑:合并所有文本段以处理被分割的链接,并清理链接末尾的标点;简化抖音解析器实现,移除冗余头部信息并改进URL验证;删除未使用的鸭子示例代码文件。
This commit is contained in:
@@ -117,7 +117,7 @@ class BaseParser(metaclass=abc.ABCMeta):
|
||||
|
||||
def extract_url_from_text_segments(self, segments):
|
||||
"""
|
||||
从消息的文本段中提取URL
|
||||
从消息的文本段中提取URL,会合并所有文本段来处理被分割的链接。
|
||||
|
||||
Args:
|
||||
segments: 消息段列表
|
||||
@@ -125,14 +125,19 @@ class BaseParser(metaclass=abc.ABCMeta):
|
||||
Returns:
|
||||
Optional[str]: 提取到的URL或None
|
||||
"""
|
||||
for segment in segments:
|
||||
if segment.type == "text":
|
||||
text_content = segment.data.get("text", "")
|
||||
match = self.url_pattern.search(text_content)
|
||||
if match:
|
||||
extracted_url = match.group(0)
|
||||
logger.success(f"[{self.name}] 成功从文本中提取到链接: {extracted_url}")
|
||||
return extracted_url
|
||||
# 1. 拼接所有文本段内容,保留空格
|
||||
full_text = "".join([segment.data.get("text", "") for segment in segments if segment.type == "text"])
|
||||
|
||||
# 2. 使用解析器自身的url_pattern进行匹配,通常是匹配到第一个空格为止
|
||||
match = self.url_pattern.search(full_text)
|
||||
|
||||
if match:
|
||||
extracted_url = match.group(0)
|
||||
# 清理一下链接末尾可能误包含的标点符号
|
||||
extracted_url = re.sub(r'[,.!?]$', '', extracted_url)
|
||||
logger.success(f"[{self.name}] 成功从合并后的文本中提取到链接: {extracted_url}")
|
||||
return extracted_url
|
||||
|
||||
return None
|
||||
|
||||
async def process_url(self, event: MessageEvent, url: str):
|
||||
|
||||
Reference in New Issue
Block a user