import logging
from scrapy import Request
from scrapy.exceptions import IgnoreRequest
class ErrorHandlingMiddleware:
    """Scrapy middleware that retries transient request failures and drops the rest.

    Behaviour implemented by the two hooks below:

    1. A response whose status is listed in ``ALLOWED_RETRY_STATUS_CODES``, or
       an exception that is an instance of one of ``ALLOWED_RETRY_EXCEPTIONS``,
       is re-issued as a copy of the request, at most ``MAX_RETRY_TIMES``
       times per request.
    2. Every other response with status >= 400 is abandoned by raising
       ``IgnoreRequest``. This covers all 4xx codes *and* any 5xx code that is
       not listed for retry (500 included).
    3. Each retry / give-up decision is logged through ``spider.logger`` so
       failures can be investigated afterwards.

    The retry counter lives in ``request.meta["retry_times"]``.
    NOTE(review): Scrapy's built-in RetryMiddleware appears to use the same
    meta key -- confirm that sharing the counter is intended when both
    middlewares are enabled.
    """

    # Status codes treated as transient. 500 is left out on purpose: it usually
    # signals a server-side logic error that may not be temporary.
    ALLOWED_RETRY_STATUS_CODES = [502, 503, 504]

    # Exception types that trigger a retry.
    # NOTE(review): these are Python's builtin exception classes. Scrapy's
    # Twisted-based downloader presumably raises twisted.internet.error
    # classes of the same names, which would not match these -- confirm, and
    # extend this list if timeouts turn out not to be retried.
    ALLOWED_RETRY_EXCEPTIONS = [
        TimeoutError,  # request timed out
        ConnectionRefusedError,  # connection was refused
    ]

    # Upper bound on retries per request. Kept separate from the RETRY_TIMES
    # setting in settings.py so this middleware can be tuned independently.
    MAX_RETRY_TIMES = 3

    def _next_retry_count(self, request):
        """Return the 1-based number of the next retry, or ``None`` if exhausted.

        Reads the retries already performed from ``request.meta["retry_times"]``
        (0 when the key is absent) and compares it with ``MAX_RETRY_TIMES``.
        """
        retry_times = request.meta.get("retry_times", 0)
        if retry_times >= self.MAX_RETRY_TIMES:
            return None
        return retry_times + 1

    def _build_retry_request(self, request, retry_times):
        """Return a copy of *request* prepared for retry number *retry_times*."""
        new_request = request.copy()
        # Persist the counter so the next pass through this middleware sees it.
        new_request.meta["retry_times"] = retry_times
        # dont_filter=True keeps the duplicate filter from discarding the
        # repeated URL.
        new_request.dont_filter = True
        return new_request

    def process_response(self, request, response, spider):
        """Handle a response that arrived normally but may carry a bad status.

        Args:
            request: The request that produced *response*.
            response: The downloaded response; only ``status`` is inspected.
            spider: The running spider; its ``logger`` receives the messages.

        Returns:
            *response* unchanged when its status is below 400, or a new
            request (a copy of *request*) when the status is retryable and
            retries remain.

        Raises:
            IgnoreRequest: When a retryable status has used up all retries, or
                when the status is >= 400 and not listed for retry.
        """
        if response.status in self.ALLOWED_RETRY_STATUS_CODES:
            retry_times = self._next_retry_count(request)
            if retry_times is None:
                spider.logger.error(
                    f"❌ 放弃重试(超过{self.MAX_RETRY_TIMES}次):状态码{response.status} | URL:{request.url}"
                )
                raise IgnoreRequest
            spider.logger.warning(
                f"🔄 状态码{response.status}触发重试:第{retry_times}次 | URL:{request.url}"
            )
            return self._build_retry_request(request, retry_times)

        # Anything else at 400 or above is not retried: 4xx client errors and
        # the 5xx codes that are absent from ALLOWED_RETRY_STATUS_CODES.
        if response.status >= 400:
            spider.logger.warning(
                f"🚫 放弃请求(客户端/不可恢复服务端错误):状态码{response.status} | URL:{request.url}"
            )
            raise IgnoreRequest

        # Status is fine; hand the response on unchanged.
        return response

    def process_exception(self, request, exception, spider):
        """Handle an exception raised while the request was being processed.

        Args:
            request: The request that failed.
            exception: The exception instance that was raised.
            spider: The running spider; its ``logger`` receives the messages.

        Returns:
            A new request (a copy of *request*) when *exception* is one of
            ``ALLOWED_RETRY_EXCEPTIONS`` and retries remain; otherwise
            ``None`` after logging why the request was given up.
        """
        # tuple() is evaluated per call so changes made to the class-level
        # list (e.g. by a subclass) are honoured.
        if not isinstance(exception, tuple(self.ALLOWED_RETRY_EXCEPTIONS)):
            # Not a known transient failure: record it and give up.
            spider.logger.error(
                f"🚫 放弃请求(未知异常):{type(exception).__name__} | 详情:{str(exception)} | URL:{request.url}"
            )
            return None

        retry_times = self._next_retry_count(request)
        if retry_times is None:
            spider.logger.error(
                f"❌ 放弃重试(超过{self.MAX_RETRY_TIMES}次):{type(exception).__name__} | URL:{request.url}"
            )
            return None

        spider.logger.warning(
            f"🔄 异常触发重试:{type(exception).__name__} | 第{retry_times}次 | URL:{request.url}"
        )
        return self._build_retry_request(request, retry_times)