#Scrapy反爬对抗实战完全指南 - 验证码破解与全方位反检测技术详解
📂 所属阶段:第三阶段 — 攻防演练(中间件与反爬篇)
🔗 相关章节:Downloader Middleware · Selenium与Playwright集成 · 代理IP池集成
#反爬机制概述
现代网站采用了四层反爬体系,我们需要针对性设计攻防方案:
| 层级 | 核心检测项 |
|---|---|
| 请求特征层 | User-Agent/IP频率/请求头完整性/Cookie验证 |
| 行为特征层 | 访问频率/页面停留时间/鼠标轨迹/点击模式检测 |
| 技术指纹层 | JS执行检测/浏览器指纹识别/设备指纹/网络栈特征 |
| 内容验证层 | 动态内容生成/验证码挑战/人机验证 |
#核心攻防技术实战
#一、智能IP轮换与封禁规避
痛点:IP被封禁是触发反爬机制后最常见的后果,传统的轮询/随机轮换方式效率低。
#实现代码
import time
import random
from collections import defaultdict, deque
class IntelligentIPManager:
    """Proxy manager combining scoring, a cool-down penalty and auto-unban.

    Every proxy has a stats record holding its success/failure counters, a
    base score between 0 and ``MAX_SCORE``, the time it was last used and its
    ban state.
    """

    # Seconds a ban lasts before the proxy becomes eligible again (30 min).
    BAN_DURATION = 1800
    # A proxy used within this many seconds is weighted by COOL_DOWN_FACTOR.
    COOL_DOWN_WINDOW = 300
    COOL_DOWN_FACTOR = 0.7
    MAX_SCORE = 100
    # Base score restored when a ban expires; without it the score stays at 0
    # and the unbanned proxy could never outrank any other proxy.
    PROBATION_SCORE = 50

    def __init__(self, proxy_list=None):
        """Create a manager.

        Args:
            proxy_list: Optional list of proxy identifiers (hashable values)
                that are candidates from the start.
        """
        self.proxy_list = proxy_list or []
        self.ip_stats = defaultdict(lambda: {
            'success': 0, 'failure': 0, 'score': 100,
            'last_used': 0, 'banned': False, 'ban_time': 0
        })

    def get_best_proxy(self):
        """Return the proxy with the highest weighted score.

        The weight is ``score * success_rate * cool_down``. Proxies from
        ``proxy_list`` that have no stats yet are considered as well, and a
        proxy without any recorded request counts as fully successful so new
        proxies are not ranked below failing ones.

        Returns:
            The best proxy, or ``None`` when no proxy is known or every known
            proxy is currently banned.
        """
        now = time.time()
        # Configured proxies first, then any proxy that only exists in the
        # stats; dict.fromkeys removes duplicates while keeping the order.
        candidates = list(dict.fromkeys([*self.proxy_list, *self.ip_stats]))

        scored = []
        for proxy in candidates:
            stats = self.ip_stats[proxy]
            if stats['banned']:
                if now - stats['ban_time'] <= self.BAN_DURATION:
                    continue
                # Ban expired: lift it and give the proxy a probation score.
                stats['banned'] = False
                stats['score'] = max(stats['score'], self.PROBATION_SCORE)

            total = stats['success'] + stats['failure']
            success_rate = stats['success'] / total if total else 1.0
            recently_used = now - stats['last_used'] <= self.COOL_DOWN_WINDOW
            cool_down = self.COOL_DOWN_FACTOR if recently_used else 1.0
            scored.append((proxy, stats['score'] * success_rate * cool_down))

        if not scored:
            return None
        # max() keeps the first of equally scored proxies.
        return max(scored, key=lambda item: item[1])[0]

    def mark_banned(self, proxy):
        """Flag ``proxy`` as banned now and drop its base score to 0."""
        stats = self.ip_stats[proxy]
        stats['banned'] = True
        stats['ban_time'] = time.time()
        stats['score'] = 0

    def update_stats(self, proxy, success):
        """Record the outcome of a request made through ``proxy``.

        Args:
            proxy: Proxy identifier the request used.
            success: Truthy when the request succeeded. A success adds 5 to
                the base score (capped at ``MAX_SCORE``); a failure subtracts
                10 (floored at 0).
        """
        stats = self.ip_stats[proxy]
        stats['last_used'] = time.time()
        if success:
            stats['success'] += 1
            stats['score'] = min(self.MAX_SCORE, stats['score'] + 5)
        else:
            stats['failure'] += 1
            stats['score'] = max(0, stats['score'] - 10)
#二、请求头与浏览器指纹反检测
#1. 动态请求头生成器
from fake_useragent import UserAgent
import random
class DynamicHeaders:
    """Build browser-like request headers with randomised variable fields.

    The User-Agent string comes from ``fake_useragent.UserAgent``; the Accept
    and Accept-Language values are drawn from the pools set up in
    ``__init__``.
    """

    def __init__(self):
        self.ua = UserAgent()
        # Pool of Accept header values to pick from.
        self.accepts = [
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        ]
        # Pool of Accept-Language header values to pick from.
        self.languages = [
            'zh-CN,zh;q=0.9,en;q=0.8',
            'en-US,en;q=0.9',
        ]

    def generate(self, url=None):
        """Return a fresh header dict with randomly chosen variable fields.

        Args:
            url: Not used by the current implementation.

        Returns:
            dict mapping header names to values.
        """
        user_agent = self.ua.random
        accept = random.choice(self.accepts)
        language = random.choice(self.languages)

        headers = {
            'User-Agent': user_agent,
            'Accept': accept,
            'Accept-Language': language,
        }
        # Fixed headers that are the same for every request.
        headers.update({
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
        })
        return headers
#2. Selenium/Playwright基础反检测脚本
// Generic browser anti-detection script: hides navigator.webdriver and
// overrides a few other properties that detection scripts inspect.
// Make navigator.webdriver read as undefined.
Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
// navigator.plugins reports a single PDF-viewer-like entry; the getter builds
// a new array on every access.
Object.defineProperty(navigator, 'plugins', {
get: () => [
{ filename: 'internal-pdf-viewer', description: 'Portable Document Format' }
]
});
// navigator.languages reports a fixed Chinese-first language list.
Object.defineProperty(navigator, 'languages', { get: () => ['zh-CN', 'zh', 'en'] });
// Wrap Function.prototype.toString: the one special-cased object is reported
// as a native Array function, everything else goes to the saved original.
// NOTE(review): window.cdc_adoQpoasnfa76pfcZLmcfl_Array is presumably the
// marker ChromeDriver injects into the page - confirm for the driver version in use.
const originalToString = Function.prototype.toString;
Function.prototype.toString = function() {
if (this === window.cdc_adoQpoasnfa76pfcZLmcfl_Array) {
return 'function Array() { [native code] }';
}
return originalToString.call(this);
};#三、验证码识别快速入门
#1. 简单字符验证码预处理+OCR
使用pytesseract+OpenCV快速处理常规验证码(非复杂扭曲/旋转):
import cv2
import numpy as np
import pytesseract
def preprocess_captcha(img_path):
    """Prepare a captcha image for OCR.

    Pipeline: grayscale -> median-blur denoise -> Otsu binarisation ->
    morphological closing with a 2x2 kernel.

    Args:
        img_path: Path of the image file to load.

    Returns:
        The processed single-channel image as returned by
        ``cv2.morphologyEx``.

    Raises:
        ValueError: If the image cannot be read from ``img_path``.
    """
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports a missing or undecodable file by returning None;
        # fail here with a clear message instead of inside cvtColor.
        raise ValueError(f"Cannot read captcha image: {img_path!r}")

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    denoised = cv2.medianBlur(gray, 3)
    # With THRESH_OTSU the threshold is chosen automatically, so the 0 passed
    # as threshold value is ignored.
    _, binary = cv2.threshold(
        denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )
    kernel = np.ones((2, 2), np.uint8)
    return cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)
def ocr_captcha(img_path):
    """Preprocess the captcha at ``img_path``, OCR it and return the stripped text."""
    # --psm 8 makes Tesseract treat the image as a single word.
    tesseract_config = '--psm 8 --oem 3'
    cleaned = preprocess_captcha(img_path)
    raw_text = pytesseract.image_to_string(cleaned, config=tesseract_config)
    return raw_text.strip()
#2. 滑块验证码快速处理思路
使用Selenium的ActionChains,按“先加速、后减速并带随机纵向抖动”的类人轨迹拖动滑块:
from selenium.webdriver.common.action_chains import ActionChains
import time
import random
def generate_track(distance):
    """Build a list of integer step offsets that add up to ``distance``.

    The movement accelerates over the first 4/5 of the distance and
    decelerates afterwards. The time slice per step is drawn once at random,
    so tracks differ between calls.

    Args:
        distance: Total horizontal distance to cover.

    Returns:
        List of rounded per-step offsets; the last entry compensates for
        rounding and overshoot so the total matches ``distance``.
    """
    switch_point = distance * 4/5
    dt = random.uniform(0.2, 0.3)
    steps = []
    travelled = 0
    speed = 0
    while travelled < distance:
        # Accelerate before the switch point, brake after it.
        if travelled < switch_point:
            accel = 2
        else:
            accel = -3
        # Uniformly accelerated motion over one time slice.
        delta = speed * dt + 0.5 * accel * dt * dt
        speed = speed + accel * dt
        travelled += delta
        steps.append(round(delta))
    # Final correction step for whatever rounding/overshoot left over.
    remainder = distance - sum(steps)
    steps.append(round(remainder))
    return steps
def slide_captcha(driver, slider, track):
    """Drag ``slider`` along ``track`` and release it.

    Args:
        driver: WebDriver instance passed to ``ActionChains``.
        slider: Element to press and hold.
        track: Iterable of horizontal offsets, one per move.
    """
    # Press and hold the slider handle.
    ActionChains(driver).click_and_hold(slider).perform()
    for step in track:
        # Each move gets a vertical jitter of -1, 0 or 1.
        jitter = random.randint(-1, 1)
        ActionChains(driver).move_by_offset(xoffset=step, yoffset=jitter).perform()
        pause = random.uniform(0.01, 0.02)
        time.sleep(pause)
    # Short pause before letting go of the slider.
    time.sleep(0.5)
    ActionChains(driver).release().perform()
#四、频率限制与人类行为模拟
#实现代码
import time
import random
from datetime import datetime
class HumanSimulator:
    """Simulate human browsing pace: hour-dependent delays and page stays."""

    # Maps an hour range (start, end) to (activity level, delay multiplier).
    # ``end`` is exclusive and a range with start > end wraps past midnight.
    # A multiplier above 1 lengthens the delay, below 1 shortens it.
    # get_delay only reads the multiplier.
    activity_patterns = {
        (6, 9): (0.3, 1.2),    # early morning: low activity, 1.2x delay
        (9, 18): (0.8, 0.9),   # working hours: high activity, 0.9x delay
        (18, 22): (0.6, 1.1),  # evening: medium activity, 1.1x delay
        (22, 6): (0.2, 1.5),   # late night: low activity, 1.5x delay
    }

    @classmethod
    def get_delay(cls, base=1, hour=None):
        """Return a delay in seconds adjusted for the hour of day.

        Args:
            base: Base delay in seconds.
            hour: Hour of day (0-23) to evaluate; defaults to the current
                local hour.

        Returns:
            ``base`` times the multiplier of the matching hour range plus a
            random jitter of up to +/-0.3 seconds, never less than 0.1.
            ``base`` unchanged if no range matches.

        Raises:
            ValueError: If ``hour`` is given and is not within 0-23.
        """
        if hour is None:
            hour = datetime.now().hour
        elif not 0 <= hour <= 23:
            raise ValueError(f"hour must be in 0..23, got {hour!r}")

        for (start, end), (_, multiplier) in cls.activity_patterns.items():
            if start <= end:
                in_range = start <= hour < end
            else:
                # Range wraps past midnight, e.g. (22, 6).
                in_range = hour >= start or hour < end
            if in_range:
                jitter = random.uniform(-0.3, 0.3)
                return max(0.1, base * multiplier + jitter)
        return base

    @classmethod
    def simulate_stay(cls, min_seconds=10, max_seconds=60):
        """Block for a random page-stay time.

        Args:
            min_seconds: Lower bound of the stay in seconds (default 10).
            max_seconds: Upper bound of the stay in seconds (default 60).

        Raises:
            ValueError: If the bounds are negative or ``max_seconds`` is
                smaller than ``min_seconds``.
        """
        if min_seconds < 0 or max_seconds < min_seconds:
            raise ValueError(
                "expected 0 <= min_seconds <= max_seconds, got "
                f"{min_seconds!r} and {max_seconds!r}"
            )
        time.sleep(random.uniform(min_seconds, max_seconds))
#法律合规与最佳实践
#合规红线
- 尊重版权:仅抓取公开数据,避免商业滥用
- 遵守协议:严格遵循robots.txt、服务条款
- 数据安全:遵守《个人信息保护法》,不存储敏感信息
- 资源约束:控制请求频率,避免服务器压力过大
#最佳实践
- 优先API:使用官方公开API替代爬虫
- 明确身份:User-Agent中添加爬虫标识和联系方式
- 智能重试:遇到429/503自动延长间隔
- 持续监控:记录错误率、响应时间,及时调整策略
#总结
反爬对抗是一场攻防博弈,没有万能方案,需要:
- 分层防护:IP→请求头→行为→指纹多层反检测
- 动态调整:根据监控结果实时切换策略
- 法律底线:始终遵守法律法规和道德准则
💡 核心工具推荐:
fake_useragent(请求头)、redis(分布式/IP池)、playwright-stealth(浏览器反检测)、ddddocr(更强大的验证码OCR)
🏷️ 标签云: Scrapy 反爬虫 验证码破解 IP轮换 请求头伪造 浏览器指纹 反检测 爬虫安全

