#Downloader Middleware完全指南 - 请求响应拦截与反爬策略详解
📂 所属阶段:第三阶段 — 攻防演练(中间件与反爬篇)
🔗 相关章节:Spider 实战 · Pipeline管道实战
#目录
#Middleware基础概念
Downloader Middleware是Scrapy框架中的核心组件,位于Engine和Downloader之间,用于拦截请求和响应。它是实现反爬策略的关键工具。
#Middleware的主要作用
- 请求拦截:修改请求头、添加认证信息、设置代理等
- 响应处理:处理响应内容、解密数据、重试失败请求等
- 异常处理:处理下载异常、重试机制、错误恢复等
- 反爬策略:实现User-Agent轮换、IP代理、请求频率控制等
#Scrapy请求处理流程
Engine -> Scheduler -> Engine -> Downloader Middleware -> Downloader ->
Downloader Middleware -> Engine -> Spider Middleware -> Spider
#Middleware生命周期
Downloader Middleware具有完整的生命周期方法,可以在不同阶段对请求和响应进行处理。
#基础Middleware结构
class BaseMiddleware:
    """Minimal skeleton of a Scrapy downloader middleware.

    Every hook here is a pass-through, so enabling this class does not change
    how requests, responses or download errors are handled. Copy it and fill
    in only the hooks you need.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware instance for ``crawler``.

        The crawler is not inspected in this skeleton; the instance is
        created with no arguments.
        """
        return cls()

    def process_request(self, request, spider):
        """Hook called for a request on its way to the downloader.

        Allowed outcomes:
          * return ``None``       -- keep processing this request normally
          * return a ``Response`` -- stop here and use that response directly
          * return a ``Request``  -- issue the returned request instead
          * raise ``IgnoreRequest`` -- drop this request
        """
        return None

    def process_response(self, request, response, spider):
        """Hook called for a response on its way back from the downloader.

        Allowed outcomes:
          * return a ``Response`` -- hand it on (this skeleton returns the
            response unchanged)
          * return a ``Request``  -- issue the returned request instead
          * raise ``IgnoreRequest`` -- drop the request/response pair
        Returning ``None`` is not a valid outcome for this hook.
        """
        return response

    def process_exception(self, request, exception, spider):
        """Hook called when downloading ``request`` raised ``exception``.

        Allowed outcomes:
          * return ``None``       -- the exception was not handled here
          * return a ``Response`` -- the exception is considered handled
          * return a ``Request``  -- issue the returned request instead
        """
        return None

#Middleware配置
# settings.py
# Registers the project's downloader middlewares and their order numbers.
DOWNLOADER_MIDDLEWARES = {
# Format: 'dotted.path.to.MiddlewareClass': order number
'myproject.middlewares.UserAgentMiddleware': 543,
'myproject.middlewares.ProxyMiddleware': 544,
'myproject.middlewares.CookiesMiddleware': 545,
}
# Notes on the order number:
# The smaller the number, the higher the priority and the earlier it runs
# Scrapy's built-in middlewares use order numbers in the range 0-1000
# Custom middlewares are best given a number between 500 and 1000
# NOTE(review): "runs earlier" presumably describes the request path only;
# confirm the order used for process_response against the Scrapy docs.

#核心处理方法
#请求处理
process_request 方法在请求被Downloader处理之前调用,是最常用的中间件方法之一。
class BasicRequestMiddleware:
    """Downloader middleware that fills in common headers and a bearer token."""

    # Applied with ``setdefault`` so a header already present on the request
    # is never overwritten.
    DEFAULT_HEADERS = (
        ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
        ('Accept-Language', 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3'),
    )

    def process_request(self, request, spider):
        """Add default headers and, when configured, an Authorization header.

        Args:
            request: outgoing request; its ``headers`` mapping is modified
                in place.
            spider: running spider; ``spider.crawler.settings`` is read for
                the ``AUTH_TOKEN`` value.

        Returns:
            None, so the request continues through the remaining processing.
        """
        for header_name, header_value in self.DEFAULT_HEADERS:
            request.headers.setdefault(header_name, header_value)

        # The token is optional: a missing or empty AUTH_TOKEN leaves the
        # request without an Authorization header.
        auth_token = spider.crawler.settings.get('AUTH_TOKEN')
        if auth_token:
            request.headers['Authorization'] = f'Bearer {auth_token}'
        return None

#响应处理
process_response 方法在响应被Downloader处理后、Spider处理前调用,用于处理响应内容。
class RetryResponseMiddleware:
    """Retry requests whose response body looks like an anti-crawling page."""

    # Markers are matched against the lower-cased body, so the ASCII entries
    # must stay lower-case.
    ANTI_CRAWL_INDICATORS = (
        '访问过于频繁', '请稍后重试', 'blocked', 'forbidden',
        '验证码', 'captcha', 'rate limit', 'too many requests',
    )

    def process_response(self, request, response, spider):
        """Return a retry request when the body contains a block-page marker.

        Args:
            request: the request that produced ``response``; its
                ``meta['retry_times']`` counter is read (default 0).
            response: the downloaded response.
            spider: running spider; ``MAX_RETRY_TIMES`` is read from
                ``spider.crawler.settings`` (default 3).

        Returns:
            A copy of ``request`` with an incremented ``retry_times`` and
            ``dont_filter`` set when a marker is found and retries remain;
            otherwise the original ``response``.
        """
        try:
            response_text = response.text.lower()
        except AttributeError:
            # A response without text content (e.g. a binary download) has
            # nothing to scan; pass it through instead of crashing.
            return response

        if not any(marker in response_text for marker in self.ANTI_CRAWL_INDICATORS):
            return response

        retry_times = request.meta.get('retry_times', 0)
        max_retries = spider.crawler.settings.getint('MAX_RETRY_TIMES', 3)
        if retry_times >= max_retries:
            # Retry budget exhausted: let the block page through.
            return response

        spider.logger.info(f"Retrying {request.url}, attempt {retry_times + 1}")
        new_request = request.copy()
        new_request.meta['retry_times'] = retry_times + 1
        # The URL was already requested once, so the retry must opt out of
        # duplicate filtering.
        new_request.dont_filter = True
        return new_request

#异常处理
process_exception 方法在下载过程中发生异常时调用,用于处理各种网络异常。
from twisted.internet.error import TimeoutError, DNSLookupError
class BasicExceptionMiddleware:
    """Downloader middleware that retries requests which timed out."""

    def process_exception(self, request, exception, spider):
        """Retry on timeout; leave every other exception unhandled.

        ``TimeoutError`` is resolved from the enclosing module's scope.

        Returns:
            A retry request when ``exception`` is a ``TimeoutError`` and
            retries remain; otherwise None (exception not handled here).
        """
        if not isinstance(exception, TimeoutError):
            return None
        spider.logger.warning(f"Timeout for {request.url}")
        return self.handle_retry(request, exception, spider)

    def handle_retry(self, request, exception, spider):
        """Build the retry request, or return None once the budget is spent.

        Args:
            request: the failed request; ``meta['retry_times']`` is read
                (default 0).
            exception: the triggering exception (not inspected here).
            spider: running spider; ``MAX_RETRY_TIMES`` is read from
                ``spider.crawler.settings`` (default 2).
        """
        retry_times = request.meta.get('retry_times', 0)
        max_retries = spider.crawler.settings.getint('MAX_RETRY_TIMES', 2)
        if retry_times >= max_retries:
            return None

        new_request = request.copy()
        new_request.meta['retry_times'] = retry_times + 1
        # Same URL as before, so bypass the duplicate filter.
        new_request.dont_filter = True
        return new_request

#User-Agent轮换策略
User-Agent轮换是反爬的基础策略之一,可以有效避免被识别为爬虫。
import random
class UserAgentMiddleware:
    """Downloader middleware that sets a random User-Agent on every request."""

    # Built-in pool used when no custom list is supplied.
    DEFAULT_USER_AGENTS = (
        # Chrome on Windows
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        # Chrome on Mac
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        # Firefox on Windows
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0',
        # Safari on Mac
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',
        # Mobile - iPhone
        'Mozilla/5.0 (iPhone; CPU iPhone OS 17_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Mobile/15E148 Safari/604.1',
    )

    def __init__(self, user_agents=None):
        """Create the middleware.

        Args:
            user_agents: optional iterable of User-Agent strings to rotate
                through. When omitted or empty, ``DEFAULT_USER_AGENTS`` is
                used.
        """
        self.user_agents = list(user_agents) if user_agents else list(self.DEFAULT_USER_AGENTS)

    def process_request(self, request, spider):
        """Overwrite the request's ``User-Agent`` header with a random entry.

        Returns:
            None, so the request continues through the remaining processing.
        """
        request.headers['User-Agent'] = random.choice(self.user_agents)
        return None

#代理IP管理
代理IP管理是应对IP封禁的重要策略,包括代理池管理和智能切换。
import random
import time
from collections import defaultdict
class ProxyMiddleware:
    """Downloader middleware that assigns proxies and tracks their health."""

    # A proxy stops being preferred once it reaches either threshold.
    MAX_FAILURES = 5
    MAX_BANS = 3

    def __init__(self):
        # Proxy pool (placeholder entries; load from configuration or an API
        # in real use).
        self.proxy_pool = [
            'http://proxy1:port',
            'http://proxy2:port',
            'http://proxy3:port',
        ]
        # Per-proxy usage counters, created on first access.
        self.proxy_stats = defaultdict(lambda: {
            'success_count': 0,
            'failure_count': 0,
            'ban_count': 0
        })

    def process_request(self, request, spider):
        """Assign a proxy unless the request already carries one.

        Returns:
            None, so the request continues through the remaining processing.
        """
        if request.meta.get('proxy'):
            return None
        proxy = self._select_proxy()
        if proxy:
            request.meta['proxy'] = proxy
        return None

    def process_response(self, request, response, spider):
        """Update the counters of the proxy that served ``response``.

        Returns:
            The unchanged ``response``.
        """
        proxy = request.meta.get('proxy')
        if proxy:
            if response.status in [200, 301, 302]:
                self.proxy_stats[proxy]['success_count'] += 1
            # NOTE(review): 404 is counted as a proxy failure here; it may
            # reflect the target URL rather than the proxy -- confirm intent.
            elif response.status in [403, 404, 429, 503]:
                self.proxy_stats[proxy]['failure_count'] += 1
                if response.status == 403:
                    self.proxy_stats[proxy]['ban_count'] += 1
        return response

    def process_exception(self, request, exception, spider):
        """Count a download exception as a failure of the proxy in use.

        Returns:
            None (the exception itself is not handled here).
        """
        proxy = request.meta.get('proxy')
        if proxy:
            self.proxy_stats[proxy]['failure_count'] += 1
        return None

    def _select_proxy(self):
        """Pick a random healthy proxy from the pool.

        Candidates are drawn from ``proxy_pool`` rather than from the keys of
        ``proxy_stats``: a proxy that has never been used has no stats entry
        yet and must still be eligible. When no pool member is healthy, any
        pool member may be returned; an empty pool yields None.
        """
        healthy_proxies = [p for p in self.proxy_pool if self._is_healthy(p)]
        candidates = healthy_proxies or self.proxy_pool
        return random.choice(candidates) if candidates else None

    def _is_healthy(self, proxy):
        """Return True when ``proxy`` is below both failure thresholds."""
        # .get() avoids creating a defaultdict entry just by asking.
        stats = self.proxy_stats.get(proxy)
        if stats is None:
            return True
        return (stats['failure_count'] < self.MAX_FAILURES
                and stats['ban_count'] < self.MAX_BANS)

#Cookies管理
Cookies管理对于维持会话和处理需要登录的网站很重要。
import time
from urllib.parse import urlparse
class CookiesMiddleware:
    """Downloader middleware that stores cookies per domain and replays them."""

    def __init__(self):
        # Maps netloc -> {cookie name -> {'value': ..., 'timestamp': ...}}.
        self.domain_cookies = {}

    def process_request(self, request, spider):
        """Attach the cookies stored for the request's domain.

        The ``Cookie`` header is only written when at least one cookie is
        stored for that domain.

        Returns:
            None, so the request continues through the remaining processing.
        """
        cookies = self.domain_cookies.get(self._extract_domain(request.url))
        if cookies:
            request.headers['Cookie'] = '; '.join(
                f"{name}={data['value']}" for name, data in cookies.items()
            )
        return None

    def process_response(self, request, response, spider):
        """Record every parsable ``Set-Cookie`` header of ``response``.

        Cookies are stored under the netloc of the request URL together with
        the time they were seen.

        Returns:
            The unchanged ``response``.
        """
        domain = self._extract_domain(request.url)
        for raw_header in response.headers.getlist('Set-Cookie'):
            if isinstance(raw_header, bytes):
                # Undecodable bytes must not abort response processing.
                raw_header = raw_header.decode('utf-8', errors='replace')
            cookie_data = self._parse_cookie(raw_header)
            if cookie_data is None:
                continue
            self.domain_cookies.setdefault(domain, {})[cookie_data['name']] = {
                'value': cookie_data['value'],
                'timestamp': time.time()
            }
        return response

    def _extract_domain(self, url):
        """Return the netloc part of ``url``."""
        return urlparse(url).netloc

    def _parse_cookie(self, cookie_str):
        """Parse the leading ``name=value`` pair of a Set-Cookie string.

        Attributes after the first ``;`` are ignored.

        Returns:
            ``{'name': ..., 'value': ...}``, or None when there is no ``=``
            in the first segment or the cookie name is empty.
        """
        first_segment = cookie_str.split(';', 1)[0]
        name, separator, value = first_segment.partition('=')
        name = name.strip()
        if not separator or not name:
            return None
        return {
            'name': name,
            'value': value.strip()
        }

#请求延迟与限速
请求延迟和限速是避免被反爬检测的重要策略。
import time
import random
from collections import defaultdict
class RateLimitMiddleware:
    """Downloader middleware that spaces out requests to the same domain."""

    def __init__(self):
        # netloc -> wall-clock time of the last request; 0.0 until first use.
        self.domain_last_request = defaultdict(float)

    def process_request(self, request, spider):
        """Wait until the per-domain delay has elapsed, then let the request go.

        The delay comes from ``DOWNLOAD_DELAY`` (fallback 1). When
        ``RANDOMIZE_DOWNLOAD_DELAY`` is true (fallback True) it is drawn
        uniformly from 0.5x to 1.5x of that value.

        NOTE(review): ``time.sleep`` blocks the calling thread. If this hook
        runs on the framework's event-loop thread, all other downloads stall
        for the duration -- confirm, and prefer a non-blocking delay there.

        Returns:
            None, so the request continues through the remaining processing.
        """
        settings = spider.crawler.settings
        domain = self._extract_domain(request.url)

        delay = settings.getfloat('DOWNLOAD_DELAY', 1)
        if settings.getbool('RANDOMIZE_DOWNLOAD_DELAY', True):
            delay = random.uniform(delay * 0.5, delay * 1.5)

        # Time still to wait before this domain may be hit again.
        wait_time = self.domain_last_request[domain] + delay - time.time()
        if wait_time > 0:
            time.sleep(wait_time)

        self.domain_last_request[domain] = time.time()
        return None

    def _extract_domain(self, url):
        """Return the netloc part of ``url``."""
        from urllib.parse import urlparse
        return urlparse(url).netloc

#常见问题与解决方案
#问题1: 中间件不生效
现象: 配置了中间件但没有执行
解决方案:
# 1. Check that the dotted path configured in settings.py is correct
DOWNLOADER_MIDDLEWARES = {
'myproject.middlewares.MyMiddleware': 543,
}
# 2. Make sure the middleware class name is correct and the class is importable
# 3. Check for syntax errors

#问题2: 请求被无限重定向
现象: 请求在中间件中被不断重定向
解决方案:
class SafeRedirectMiddleware:
    """Template for middleware-driven redirects with a hard redirect cap."""

    # Requests whose redirect counter exceeds this value are not redirected.
    MAX_REDIRECTS = 5

    def process_request(self, request, spider):
        """Redirect ``request`` when ``get_redirect_url`` supplies a new URL.

        The counter in ``meta['redirect_times']`` is read (default 0) and
        incremented on the replacement request.

        Returns:
            A replacement request for the new URL, or None when the cap was
            exceeded or no different URL was supplied.
        """
        redirect_times = request.meta.get('redirect_times', 0)
        if redirect_times > self.MAX_REDIRECTS:
            return None

        new_url = self.get_redirect_url(request, spider)
        # Redirecting to the same URL would only re-enter this hook.
        if not new_url or new_url == request.url:
            return None

        new_request = request.replace(url=new_url)
        new_request.meta['redirect_times'] = redirect_times + 1
        return new_request

    def get_redirect_url(self, request, spider):
        """Return the URL to redirect ``request`` to, or None for no redirect.

        Override this with your own redirect logic; the default never
        redirects.
        """
        return None

#问题3: 代理切换不及时
现象: 代理IP失效后仍继续使用
解决方案:
class SmartProxyMiddleware:
    """Retry through a different proxy as soon as a proxy looks blocked."""

    # Status codes treated as "the current proxy no longer works".
    FAILURE_STATUSES = frozenset({403, 429, 503})
    # Upper bound on proxy switches for one request, so a page that always
    # answers with a failure status cannot retry forever.
    MAX_PROXY_SWITCHES = 5

    def __init__(self):
        # Proxies that have been reported as failed.
        self.failed_proxies = set()

    def process_response(self, request, response, spider):
        """Mark the proxy as failed and reschedule the request without it.

        Returns:
            A copy of ``request`` with the stale ``proxy`` removed from its
            meta, ``change_proxy`` set and ``dont_filter`` enabled, when the
            status is a failure status, a proxy was in use and the switch
            budget is not spent; otherwise the original ``response``.
        """
        if response.status not in self.FAILURE_STATUSES:
            return response

        current_proxy = request.meta.get('proxy')
        if not current_proxy:
            # No proxy was involved, so there is nothing to switch.
            return response
        self.mark_proxy_failed(current_proxy)

        switch_times = request.meta.get('proxy_switch_times', 0)
        if switch_times >= self.MAX_PROXY_SWITCHES:
            return response

        new_request = request.copy()
        # Drop the failed proxy so that whichever component assigns proxies
        # picks a new one instead of reusing the value copied from meta.
        new_request.meta.pop('proxy', None)
        new_request.meta['change_proxy'] = True
        new_request.meta['proxy_switch_times'] = switch_times + 1
        new_request.dont_filter = True
        return new_request

    def mark_proxy_failed(self, proxy):
        """Remember ``proxy`` as failed."""
        self.failed_proxies.add(proxy)

#最佳实践建议
#设计原则
- 模块化: 将不同功能分离到不同的中间件中
- 可配置: 通过settings.py配置中间件参数
- 健壮性: 妥善处理异常,避免影响整个爬虫
- 性能考虑: 避免在中间件中进行耗时操作
#安全考虑
- 隐私保护: 注意处理敏感信息
- 合规性: 遵守网站的robots.txt和使用条款
- 频率控制: 合理控制请求频率,避免对目标服务器造成压力
💡 核心要点: Downloader Middleware是Scrapy反爬策略的核心组件,通过合理配置和使用中间件,可以有效应对各种反爬机制。记住要根据目标网站的特点选择合适的策略。

