refactor(WS): 使用连接池上下文管理器简化连接管理
重构 WS 类中的连接获取和释放逻辑,使用 connection 上下文管理器确保连接正确释放。 同时改进消息处理循环中的异常处理和连接管理。 refactor(ws_pool): 增强连接池的健壮性和管理能力 1. 添加连接上下文管理器支持 2. 改进连接获取和释放逻辑,增加连接计数和锁保护 3. 优化连接健康检查和清理机制 4. 增强错误处理和日志记录 fix(bot_status): 增加系统信息获取和渲染的错误处理 1. 为系统信息获取添加超时和错误处理 2. 为Redis数据获取添加异常捕获 3. 为图片渲染添加异常处理 4. 改进日志记录和用户反馈
This commit is contained in:
63
core/WS.py
63
core/WS.py
@@ -127,17 +127,14 @@ class WS:
|
||||
while True:
|
||||
try:
|
||||
# 从连接池获取一个连接
|
||||
conn = await self.pool.get_connection()
|
||||
|
||||
try:
|
||||
# 监听连接上的消息
|
||||
async for message in conn.conn:
|
||||
await self._handle_message(message, conn)
|
||||
except Exception as e:
|
||||
self.logger.error(f"连接 {conn.conn_id} 监听异常: {e}")
|
||||
finally:
|
||||
# 释放连接回连接池
|
||||
await self.pool.release_connection(conn)
|
||||
# 使用 connection 上下文管理器确保释放
|
||||
async with self.pool.connection() as conn:
|
||||
try:
|
||||
# 监听连接上的消息
|
||||
async for message in conn.conn:
|
||||
await self._handle_message(message, conn)
|
||||
except Exception as e:
|
||||
self.logger.error(f"连接 {conn.conn_id} 监听异常: {e}")
|
||||
except Exception as e:
|
||||
self.logger.error(f"连接池监听循环异常: {e}")
|
||||
await asyncio.sleep(self.reconnect_interval)
|
||||
@@ -324,27 +321,33 @@ class WS:
|
||||
)
|
||||
|
||||
# 从连接池获取一个连接
|
||||
conn = await self.pool.get_connection()
|
||||
try:
|
||||
echo_id = str(uuid.uuid4())
|
||||
payload = {"action": action, "params": params or {}, "echo": echo_id}
|
||||
async with self.pool.connection() as conn:
|
||||
echo_id = str(uuid.uuid4())
|
||||
payload = {"action": action, "params": params or {}, "echo": echo_id}
|
||||
|
||||
await conn.send(orjson.dumps(payload))
|
||||
await conn.send(orjson.dumps(payload))
|
||||
|
||||
# 在当前连接上等待特定 echo 的响应,并设置超时
|
||||
try:
|
||||
async def wait_for_response():
|
||||
async for message in conn.conn:
|
||||
data = orjson.loads(message)
|
||||
if data.get("echo") == echo_id:
|
||||
return data
|
||||
|
||||
return await asyncio.wait_for(wait_for_response(), timeout=30.0)
|
||||
# 在当前连接上等待特定 echo 的响应,并设置超时
|
||||
try:
|
||||
async def wait_for_response():
|
||||
async for message in conn.conn:
|
||||
data = orjson.loads(message)
|
||||
|
||||
# 检查是否是我们要的响应
|
||||
if data.get("echo") == echo_id:
|
||||
return data
|
||||
|
||||
# 如果不是,可能是事件,需要分发
|
||||
if "post_type" in data:
|
||||
asyncio.create_task(self.on_event(data))
|
||||
|
||||
return await asyncio.wait_for(wait_for_response(), timeout=30.0)
|
||||
|
||||
except asyncio.TimeoutError:
|
||||
raise # 重新抛出超时异常
|
||||
except Exception as e:
|
||||
raise WebSocketError(f"在等待API响应时连接出错: {e}")
|
||||
except asyncio.TimeoutError:
|
||||
raise # 重新抛出超时异常
|
||||
except Exception as e:
|
||||
raise WebSocketError(f"在等待API响应时连接出错: {e}")
|
||||
|
||||
except asyncio.TimeoutError:
|
||||
self.logger.warning(f"API 调用超时: action={action}, params={params}")
|
||||
@@ -360,9 +363,6 @@ class WS:
|
||||
message=f"API调用异常: {str(e)}",
|
||||
data={"action": action, "params": params}
|
||||
)
|
||||
finally:
|
||||
# 释放连接回连接池
|
||||
await self.pool.release_connection(conn)
|
||||
else:
|
||||
# 单连接模式
|
||||
if not self.ws:
|
||||
@@ -409,4 +409,3 @@ class WS:
|
||||
message=f"API调用异常: {str(e)}",
|
||||
data={"action": action, "params": params}
|
||||
)
|
||||
|
||||
|
||||
171
core/ws_pool.py
171
core/ws_pool.py
@@ -7,9 +7,10 @@ WebSocket 连接池模块
|
||||
import asyncio
|
||||
import websockets
|
||||
from websockets.legacy.client import WebSocketClientProtocol
|
||||
from typing import Optional, Dict, Any, cast, Union
|
||||
from typing import Optional, Dict, Any, cast, Union, AsyncGenerator
|
||||
import uuid
|
||||
from loguru import logger
|
||||
import contextlib
|
||||
|
||||
from .config_loader import global_config
|
||||
from .utils.exceptions import WebSocketError, WebSocketConnectionError
|
||||
@@ -64,9 +65,11 @@ class WSConnection:
|
||||
if not self.is_active:
|
||||
return False
|
||||
try:
|
||||
await asyncio.wait_for(self.conn.ping(), timeout=timeout)
|
||||
# 使用 wait_for 包装 ping
|
||||
pong_waiter = await self.conn.ping()
|
||||
await asyncio.wait_for(pong_waiter, timeout=timeout)
|
||||
return True
|
||||
except (asyncio.TimeoutError, websockets.exceptions.ConnectionClosed):
|
||||
except (asyncio.TimeoutError, websockets.exceptions.ConnectionClosed, Exception):
|
||||
self.is_active = False
|
||||
return False
|
||||
|
||||
@@ -76,7 +79,10 @@ class WSConnection:
|
||||
"""
|
||||
if self.is_active:
|
||||
self.is_active = False
|
||||
await self.conn.close()
|
||||
try:
|
||||
await self.conn.close()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
class WSConnectionPool:
|
||||
@@ -97,6 +103,8 @@ class WSConnectionPool:
|
||||
self.pool: asyncio.Queue[WSConnection] = asyncio.Queue(maxsize=pool_size)
|
||||
self._closed = False
|
||||
self._cleanup_task: Optional[asyncio.Task] = None
|
||||
self._current_size = 0 # 当前管理的连接数(包括池中和借出的)
|
||||
self._lock = asyncio.Lock() # 用于保护 _current_size 的修改
|
||||
|
||||
# 从全局配置读取参数
|
||||
self.url = global_config.napcat_ws.uri
|
||||
@@ -115,14 +123,17 @@ class WSConnectionPool:
|
||||
# 启动连接清理任务
|
||||
self._cleanup_task = asyncio.create_task(self._cleanup_idle_connections())
|
||||
|
||||
# 创建初始连接
|
||||
# 预热连接池
|
||||
for _ in range(self.pool_size):
|
||||
try:
|
||||
conn = await self._create_connection()
|
||||
await self.pool.put(conn)
|
||||
async with self._lock:
|
||||
self._current_size += 1
|
||||
logger.info(f"WebSocket 连接 {conn.conn_id} 已创建并加入连接池")
|
||||
except Exception as e:
|
||||
logger.error(f"创建初始连接失败: {e}")
|
||||
# 初始连接失败不抛出异常,允许后续动态创建
|
||||
|
||||
async def _create_connection(self) -> WSConnection:
|
||||
"""
|
||||
@@ -143,6 +154,17 @@ class WSConnectionPool:
|
||||
except Exception as e:
|
||||
raise WebSocketConnectionError(f"创建 WebSocket 连接失败: {e}")
|
||||
|
||||
@contextlib.asynccontextmanager
|
||||
async def connection(self) -> AsyncGenerator[WSConnection, None]:
|
||||
"""
|
||||
获取连接的上下文管理器
|
||||
"""
|
||||
conn = await self.get_connection()
|
||||
try:
|
||||
yield conn
|
||||
finally:
|
||||
await self.release_connection(conn)
|
||||
|
||||
async def get_connection(self) -> WSConnection:
|
||||
"""
|
||||
从连接池获取一个健康的连接,包含健康检查。
|
||||
@@ -150,25 +172,64 @@ class WSConnectionPool:
|
||||
if self._closed:
|
||||
raise WebSocketError("连接池已关闭")
|
||||
|
||||
try:
|
||||
# 尝试从连接池获取连接
|
||||
conn = await asyncio.wait_for(self.pool.get(), timeout=5)
|
||||
|
||||
# 健康检查
|
||||
if await conn.ping():
|
||||
logger.debug(f"连接 {conn.conn_id} 健康检查通过")
|
||||
return conn
|
||||
else:
|
||||
logger.warning(f"连接 {conn.conn_id} 健康检查失败,丢弃并获取新连接")
|
||||
await conn.close()
|
||||
return await self.get_connection() # 递归获取下一个
|
||||
start_time = asyncio.get_event_loop().time()
|
||||
timeout = 10 # 获取连接的总超时时间
|
||||
|
||||
except asyncio.TimeoutError:
|
||||
# 连接池为空,创建新连接
|
||||
logger.warning("连接池在5秒内无可用连接,创建新连接")
|
||||
return await self._create_connection()
|
||||
except Exception as e:
|
||||
raise WebSocketError(f"获取连接时发生未知错误: {e}")
|
||||
while True:
|
||||
if asyncio.get_event_loop().time() - start_time > timeout:
|
||||
raise WebSocketError("获取连接超时")
|
||||
|
||||
try:
|
||||
# 1. 尝试从池中获取
|
||||
conn = self.pool.get_nowait()
|
||||
|
||||
# 健康检查
|
||||
if await conn.ping():
|
||||
logger.debug(f"连接 {conn.conn_id} 健康检查通过")
|
||||
return conn
|
||||
else:
|
||||
logger.warning(f"连接 {conn.conn_id} 健康检查失败,丢弃")
|
||||
await conn.close()
|
||||
async with self._lock:
|
||||
self._current_size -= 1
|
||||
# 继续循环,尝试获取下一个或创建新的
|
||||
continue
|
||||
|
||||
except asyncio.QueueEmpty:
|
||||
# 池为空,检查是否可以创建新连接
|
||||
async with self._lock:
|
||||
if self._current_size < self.pool_size:
|
||||
# 有配额,创建新连接
|
||||
self._current_size += 1 # 先占位
|
||||
create_new = True
|
||||
else:
|
||||
create_new = False
|
||||
|
||||
if create_new:
|
||||
try:
|
||||
conn = await self._create_connection()
|
||||
return conn
|
||||
except Exception as e:
|
||||
async with self._lock:
|
||||
self._current_size -= 1 # 回滚占位
|
||||
logger.error(f"创建新连接失败: {e}")
|
||||
await asyncio.sleep(1) # 避免快速失败循环
|
||||
continue
|
||||
else:
|
||||
# 没有配额,等待池中有可用连接
|
||||
try:
|
||||
conn = await asyncio.wait_for(self.pool.get(), timeout=1.0)
|
||||
# 获取到了,进行健康检查(在下一次循环中处理,或者这里直接处理)
|
||||
# 为了代码复用,我们把 conn 放回去(或者直接用),这里直接用
|
||||
if await conn.ping():
|
||||
return conn
|
||||
else:
|
||||
await conn.close()
|
||||
async with self._lock:
|
||||
self._current_size -= 1
|
||||
continue
|
||||
except asyncio.TimeoutError:
|
||||
continue
|
||||
|
||||
async def release_connection(self, conn: WSConnection):
|
||||
"""
|
||||
@@ -180,19 +241,26 @@ class WSConnectionPool:
|
||||
|
||||
if not conn.is_active:
|
||||
logger.warning(f"连接 {conn.conn_id} 已失效,不返回连接池")
|
||||
await conn.close()
|
||||
async with self._lock:
|
||||
self._current_size -= 1
|
||||
return
|
||||
|
||||
try:
|
||||
if self.pool.full():
|
||||
# 连接池已满,关闭该连接
|
||||
await conn.close()
|
||||
logger.info(f"连接池已满,关闭连接 {conn.conn_id}")
|
||||
else:
|
||||
await self.pool.put(conn)
|
||||
logger.debug(f"连接 {conn.conn_id} 已返回连接池")
|
||||
# 尝试放回池中
|
||||
self.pool.put_nowait(conn)
|
||||
logger.debug(f"连接 {conn.conn_id} 已返回连接池")
|
||||
except asyncio.QueueFull:
|
||||
# 理论上不应该发生,除非 _current_size 逻辑有误
|
||||
logger.warning(f"连接池已满,关闭多余连接 {conn.conn_id}")
|
||||
await conn.close()
|
||||
async with self._lock:
|
||||
self._current_size -= 1
|
||||
except Exception as e:
|
||||
logger.error(f"释放连接失败: {e}")
|
||||
await conn.close()
|
||||
async with self._lock:
|
||||
self._current_size -= 1
|
||||
|
||||
async def _cleanup_idle_connections(self):
|
||||
"""
|
||||
@@ -202,23 +270,33 @@ class WSConnectionPool:
|
||||
await asyncio.sleep(60) # 每分钟检查一次
|
||||
|
||||
try:
|
||||
# 检查连接池中的连接
|
||||
new_pool = asyncio.Queue(maxsize=self.pool_size)
|
||||
current_time = asyncio.get_event_loop().time()
|
||||
# 我们不替换队列,而是取出检查再放回
|
||||
# 这样比较安全,但可能会暂时清空池子
|
||||
# 更好的做法是只检查队头的连接
|
||||
|
||||
while not self.pool.empty():
|
||||
conn = await self.pool.get()
|
||||
# 获取当前队列大小
|
||||
qsize = self.pool.qsize()
|
||||
for _ in range(qsize):
|
||||
try:
|
||||
conn = self.pool.get_nowait()
|
||||
except asyncio.QueueEmpty:
|
||||
break
|
||||
|
||||
current_time = asyncio.get_event_loop().time()
|
||||
if current_time - conn.last_used > self.max_idle_time:
|
||||
# 连接空闲时间过长,关闭
|
||||
await conn.close()
|
||||
logger.info(f"清理空闲连接 {conn.conn_id}")
|
||||
await conn.close()
|
||||
async with self._lock:
|
||||
self._current_size -= 1
|
||||
else:
|
||||
# 放回新队列
|
||||
await new_pool.put(conn)
|
||||
|
||||
# 替换原连接池
|
||||
self.pool = new_pool
|
||||
# 还没过期,放回去
|
||||
try:
|
||||
self.pool.put_nowait(conn)
|
||||
except asyncio.QueueFull:
|
||||
# 竞争条件下可能满了
|
||||
await conn.close()
|
||||
async with self._lock:
|
||||
self._current_size -= 1
|
||||
except Exception as e:
|
||||
logger.error(f"清理空闲连接失败: {e}")
|
||||
|
||||
@@ -241,7 +319,10 @@ class WSConnectionPool:
|
||||
|
||||
# 关闭所有连接
|
||||
while not self.pool.empty():
|
||||
conn = await self.pool.get()
|
||||
await conn.close()
|
||||
try:
|
||||
conn = self.pool.get_nowait()
|
||||
await conn.close()
|
||||
except asyncio.QueueEmpty:
|
||||
break
|
||||
|
||||
logger.info("WebSocket 连接池已关闭")
|
||||
logger.info("WebSocket 连接池已关闭")
|
||||
|
||||
@@ -6,6 +6,7 @@ Bot 状态查询插件
|
||||
import os
|
||||
import psutil
|
||||
import time
|
||||
import asyncio
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from core.bot import Bot
|
||||
@@ -32,15 +33,23 @@ def _get_system_info():
|
||||
"""
|
||||
同步函数:使用 psutil 获取系统信息,避免阻塞事件循环。
|
||||
"""
|
||||
# interval=1 会阻塞1秒,必须在线程池中运行
|
||||
cpu_percent = psutil.cpu_percent(interval=1)
|
||||
mem_info = psutil.virtual_memory()
|
||||
bot_mem_mb = PROCESS.memory_info().rss / (1024 * 1024)
|
||||
return {
|
||||
"cpu_percent": f"{cpu_percent:.1f}",
|
||||
"mem_percent": f"{mem_info.percent:.1f}",
|
||||
"bot_mem_mb": f"{bot_mem_mb:.2f}",
|
||||
}
|
||||
try:
|
||||
# interval=1 会阻塞1秒,必须在线程池中运行
|
||||
cpu_percent = psutil.cpu_percent(interval=1)
|
||||
mem_info = psutil.virtual_memory()
|
||||
bot_mem_mb = PROCESS.memory_info().rss / (1024 * 1024)
|
||||
return {
|
||||
"cpu_percent": f"{cpu_percent:.1f}",
|
||||
"mem_percent": f"{mem_info.percent:.1f}",
|
||||
"bot_mem_mb": f"{bot_mem_mb:.2f}",
|
||||
}
|
||||
except Exception as e:
|
||||
logger.error(f"获取系统信息失败: {e}")
|
||||
return {
|
||||
"cpu_percent": "N/A",
|
||||
"mem_percent": "N/A",
|
||||
"bot_mem_mb": "N/A",
|
||||
}
|
||||
|
||||
@matcher.command("status", "状态")
|
||||
async def handle_status(bot: Bot, event: MessageEvent, args: list[str]):
|
||||
@@ -93,26 +102,51 @@ async def handle_status(bot: Bot, event: MessageEvent, args: list[str]):
|
||||
}
|
||||
|
||||
# 3. 获取统计数据
|
||||
msgs_recv = await redis_manager.get("neobot:stats:messages_received") or 0
|
||||
msgs_sent = await redis_manager.get("neobot:stats:messages_sent") or 0
|
||||
command_stats_raw = await redis_manager.redis.hgetall("neobot:command_stats")
|
||||
|
||||
total_commands = sum(int(v) for v in command_stats_raw.values())
|
||||
|
||||
stats_data = {
|
||||
"messages_received": int(msgs_recv),
|
||||
"messages_sent": int(msgs_sent),
|
||||
"total_commands": total_commands,
|
||||
}
|
||||
try:
|
||||
msgs_recv = await redis_manager.get("neobot:stats:messages_received") or 0
|
||||
msgs_sent = await redis_manager.get("neobot:stats:messages_sent") or 0
|
||||
command_stats_raw = await redis_manager.redis.hgetall("neobot:command_stats")
|
||||
|
||||
total_commands = sum(int(v) for v in command_stats_raw.values())
|
||||
|
||||
stats_data = {
|
||||
"messages_received": int(msgs_recv),
|
||||
"messages_sent": int(msgs_sent),
|
||||
"total_commands": total_commands,
|
||||
}
|
||||
|
||||
command_stats_data = sorted(
|
||||
[{"name": k, "count": int(v)} for k, v in command_stats_raw.items()],
|
||||
key=lambda x: x["count"],
|
||||
reverse=True
|
||||
)
|
||||
command_stats_data = sorted(
|
||||
[{"name": k, "count": int(v)} for k, v in command_stats_raw.items()],
|
||||
key=lambda x: x["count"],
|
||||
reverse=True
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"获取Redis统计数据失败: {e}")
|
||||
stats_data = {
|
||||
"messages_received": 0,
|
||||
"messages_sent": 0,
|
||||
"total_commands": 0,
|
||||
}
|
||||
command_stats_data = []
|
||||
|
||||
# 4. 异步获取系统信息
|
||||
system_data = await run_in_thread_pool(_get_system_info)
|
||||
# 设置超时,防止 psutil 阻塞过久
|
||||
try:
|
||||
system_data = await asyncio.wait_for(run_in_thread_pool(_get_system_info), timeout=5.0)
|
||||
except asyncio.TimeoutError:
|
||||
logger.error("获取系统信息超时")
|
||||
system_data = {
|
||||
"cpu_percent": "Timeout",
|
||||
"mem_percent": "Timeout",
|
||||
"bot_mem_mb": "Timeout",
|
||||
}
|
||||
except Exception as e:
|
||||
logger.error(f"获取系统信息异常: {e}")
|
||||
system_data = {
|
||||
"cpu_percent": "Error",
|
||||
"mem_percent": "Error",
|
||||
"bot_mem_mb": "Error",
|
||||
}
|
||||
|
||||
# 5. 准备模板所需的所有数据
|
||||
template_data = {
|
||||
@@ -125,18 +159,22 @@ async def handle_status(bot: Bot, event: MessageEvent, args: list[str]):
|
||||
}
|
||||
|
||||
# 6. 渲染图片
|
||||
base64_str = await image_manager.render_template_to_base64(
|
||||
template_name="status.html",
|
||||
data=template_data,
|
||||
output_name="status.png",
|
||||
image_type="png"
|
||||
)
|
||||
try:
|
||||
base64_str = await image_manager.render_template_to_base64(
|
||||
template_name="status.html",
|
||||
data=template_data,
|
||||
output_name="status.png",
|
||||
image_type="png"
|
||||
)
|
||||
|
||||
if base64_str:
|
||||
await event.reply(MessageSegment.image(base64_str))
|
||||
else:
|
||||
# 如果渲染失败,image_manager 内部会记录错误,这里给用户一个通用提示
|
||||
await event.reply("状态图片生成失败,可能是渲染服务出现问题,请联系管理员。")
|
||||
if base64_str:
|
||||
await event.reply(MessageSegment.image(base64_str))
|
||||
else:
|
||||
# 如果渲染失败,image_manager 内部会记录错误,这里给用户一个通用提示
|
||||
await event.reply("状态图片生成失败,可能是渲染服务出现问题,请联系管理员。")
|
||||
except Exception as e:
|
||||
logger.error(f"渲染图片失败: {e}")
|
||||
await event.reply("状态图片渲染过程中发生错误。")
|
||||
|
||||
except Exception as e:
|
||||
logger.exception(f"生成状态图时发生意外错误, 用户: {event.user_id}")
|
||||
|
||||
Reference in New Issue
Block a user