#Spider实战指南 - Request、Response、yield深度解析与爬虫逻辑实现
📂 所属阶段:第一阶段 — 初出茅庐(框架核心篇)
🔗 相关章节:创建你的首个工程 · Selector 选择器
Spider是Scrapy框架的核心组件,负责解析响应、提取数据和发起新请求。本文将深入解析Spider的核心要素,助你掌握高效编写爬虫的技巧。
#目录
#Spider基础结构
一个完整的Spider包含几个基本元素,下面是最常用的模板:
#基础Spider模板
import scrapy
class ExampleSpider(scrapy.Spider):
    """Basic spider template: scrape listing items, then follow pagination."""

    # Spider name (must be unique)
    name = 'example'
    # Restricts the crawl scope
    allowed_domains = ['example.com']
    # Starting URLs
    start_urls = ['http://example.com']

    def parse(self, response):
        """Yield one dict per ``div.item`` node and a request for the next page, if any."""
        page_url = response.url

        # Extract the data
        for node in response.css('div.item'):
            title = node.css('h2::text').get()
            price = node.css('span.price::text').get()
            yield {'title': title, 'price': price, 'url': page_url}

        # Extract the next-page link and keep crawling with the same callback
        next_href = response.css('a.next::attr(href)').get()
        if next_href:
            yield response.follow(next_href, callback=self.parse)
#Spider核心属性
- name: 爬虫唯一标识符,用于启动爬虫
- allowed_domains: 限制爬虫只爬取指定域名,防止爬取外部链接
- start_urls: 爬虫开始爬取的URL列表
- custom_settings: 爬虫特定配置,覆盖全局设置
#Request详解
Request对象是Scrapy中发起HTTP请求的核心组件,包含了请求的所有必要信息。
#Request基本用法
import scrapy
class RequestSpider(scrapy.Spider):
    """Demonstrates building GET and POST requests by hand in ``start_requests``."""

    name = 'request_example'

    def start_requests(self):
        """Yield one GET request with custom headers/meta and one JSON POST request."""
        # Plain GET request
        yield scrapy.Request(
            url='http://example.com/api/data',
            callback=self.parse_get_data,
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                'Authorization': 'Bearer token123',
            },
            meta={'request_type': 'api_call'},
        )
        # POST request carrying a JSON body
        yield scrapy.Request(
            url='http://example.com/login',
            callback=self.parse_login,
            method='POST',
            headers={'Content-Type': 'application/json'},
            body='{"username": "user", "password": "pass"}',
        )

    def parse_get_data(self, response):
        """Yield the payload: parsed JSON when the Content-Type says JSON, raw text otherwise."""
        content_type = response.headers.get('Content-Type', b'').decode()
        if 'application/json' in content_type:
            data = response.json()
        else:
            data = response.text
        yield {'api_data': data}

    def parse_login(self, response):
        """Yield the status code and URL of the login response.

        ``start_requests`` registers this method as the callback of the POST
        request, so it must exist on the class; without it the attribute
        lookup ``self.parse_login`` fails before the request is even built.
        """
        yield {'login_status': response.status, 'url': response.url}
#Response详解
Response对象包含了HTTP响应的所有信息,是数据提取的主要来源。
#Response常用属性与方法
def parse_response_attributes(self, response):
    """Tour of commonly used ``Response`` attributes and helper methods.

    Nothing is yielded or returned; each assignment only shows how a value
    is obtained from the response.
    """
    # Basic attributes
    url = response.url          # response URL
    status = response.status    # HTTP status code
    headers = response.headers  # response headers
    text = response.text        # response body (string)
    meta = response.meta        # metadata (passed along from the Request)

    # Selector methods
    titles = response.css('h1::text').getall()    # CSS selector
    links = response.xpath('//a/@href').getall()  # XPath selector

    # Link following
    next_page = response.follow('next.html', callback=self.parse_next)

    # URL handling
    absolute_url = response.urljoin('/relative/path')

    # JSON data extraction, only when the Content-Type header says JSON
    content_type = response.headers.get('Content-Type', b'').decode()
    if 'application/json' in content_type:
        json_data = response.json()
#yield的使用技巧
yield是Python生成器的关键字,在Scrapy中用于返回数据和请求。
#yield的主要用途
def parse_with_yield_examples(self, response):
    """Show the three kinds of yield in a callback: an item, a Request, followed links."""
    # 1. Return structured data
    heading = response.css('h1::text').get()
    yield {'title': heading, 'url': response.url}

    # 2. Return a new request, carrying the incremented page number in meta
    next_href = response.css('a.next::attr(href)').get()
    if next_href:
        current_page = response.meta.get('page', 1)
        yield scrapy.Request(
            url=response.urljoin(next_href),
            callback=self.parse,
            meta={'page': current_page + 1},
        )

    # 3. Follow links with response.follow
    for href in response.css('a.product-link::attr(href)').getall():
        yield response.follow(href, callback=self.parse_product)
#爬虫解析逻辑
#多层级解析策略
对于复杂网站,通常需要多层级解析:分类 -> 产品列表 -> 产品详情。
class MultiLevelParsingSpider(scrapy.Spider):
    """Three-level crawl: category index -> product listings -> product details."""

    name = 'multi_level_parsing'

    def start_requests(self):
        """Start the crawl from the category index page."""
        yield scrapy.Request('http://example.com/categories', callback=self.parse_categories)

    def parse_categories(self, response):
        """Follow every category link, tagging each request with that link's category name."""
        # The page heading is the same for every link on the index page, so
        # it is only a fallback for links that carry no text of their own.
        page_heading = response.css('h1::text').get()
        for anchor in response.css('a.category-link'):
            href = anchor.css('::attr(href)').get()
            if href is None:
                # An anchor without href has nothing to follow.
                continue
            link_text = anchor.xpath('normalize-space(.)').get()
            yield response.follow(
                href,
                callback=self.parse_products,
                meta={'category': link_text or page_heading},
            )

    def parse_products(self, response):
        """Follow each product link on a listing page, then the listing's next page."""
        category = response.meta['category']
        for product_link in response.css('a.product-link::attr(href)').getall():
            yield response.follow(
                product_link,
                callback=self.parse_product_detail,
                meta={'category': category},
            )

        # Pagination: the next listing page keeps the same category.
        next_page = response.css('a.next::attr(href)').get()
        if next_page:
            yield response.follow(
                next_page,
                callback=self.parse_products,
                meta={'category': category},
            )

    def parse_product_detail(self, response):
        """Yield the full product record scraped from a detail page."""
        yield {
            'category': response.meta['category'],
            'url': response.url,
            'name': response.css('h1.product-title::text').get(),
            'price': response.css('.price::text').get(),
            'description': response.css('.description::text').get(),
        }
#链接跟随策略
#response.follow() vs scrapy.Request
- response.follow(): 推荐使用,自动处理相对URL,更简洁
- scrapy.Request: 需要手动处理URL,更灵活
def comparison_of_link_following(self, response):
    """Yield the next-page request twice, once per link-following API, for comparison."""
    # Approach 1: response.follow() - recommended
    href = response.css('a.next::attr(href)').get()
    if href:
        yield response.follow(href, callback=self.parse)

    # Approach 2: scrapy.Request - the URL has to be made absolute by hand
    href = response.css('a.next::attr(href)').get()
    if href:
        absolute = response.urljoin(href)
        yield scrapy.Request(url=absolute, callback=self.parse)
#使用LinkExtractor
from scrapy.linkextractors import LinkExtractor
class AdvancedLinkExtractionSpider(scrapy.Spider):
    """Follows category links selected from each page with a ``LinkExtractor``."""

    name = 'advanced_links'

    # Built once for the class instead of on every ``parse`` call: the
    # configuration does not depend on the response being parsed.
    link_extractor = LinkExtractor(
        allow=r'/category/\w+',        # URL patterns to accept
        deny=r'/admin/',               # URL patterns to reject
        restrict_css='.main-content',  # only search inside this CSS selector
    )

    def parse(self, response):
        """Extract the matching links from the response and follow each one."""
        for link in self.link_extractor.extract_links(response):
            yield response.follow(link.url, callback=self.parse_category)

    def parse_category(self, response):
        """Yield the URL and heading of a category page.

        ``parse`` registers this method as the callback for every extracted
        link, so it must exist on the class.
        """
        yield {
            'url': response.url,
            'title': response.css('h1::text').get(),
        }
#数据提取技术
#数据清洗与验证
import re
class DataCleaningSpider(scrapy.Spider):
    """Scrapes products and yields only records that pass cleaning and validation."""

    name = 'data_cleaning'

    # Compiled once at class creation; both patterns are reused for every product.
    _WHITESPACE_RE = re.compile(r'\s+')
    # The decimal alternative comes first so a leading-decimal price such as
    # ".99" is read as 0.99 instead of losing its decimal point.
    _NUMBER_RE = re.compile(r'\d*\.\d+|\d+')

    def parse(self, response):
        """Yield a cleaned ``title``/``price``/``url`` dict for each valid ``div.product``."""
        for product in response.css('div.product'):
            # Extract the raw values
            raw_title = product.css('.title::text').get()
            raw_price = product.css('.price::text').get()

            # Clean them
            cleaned_title = self.clean_text(raw_title)
            cleaned_price = self.clean_price(raw_price)

            # Emit only validated records
            if self.validate_data(cleaned_title, cleaned_price):
                yield {
                    'title': cleaned_title,
                    'price': cleaned_price,
                    'url': response.url,
                }

    def clean_text(self, text):
        """Return ``text`` stripped, with whitespace runs collapsed to one space.

        Returns ``''`` for ``None`` or an empty string.
        """
        if not text:
            return ''
        return self._WHITESPACE_RE.sub(' ', text.strip())

    def clean_price(self, price_str):
        """Return the first number found in ``price_str`` as a float.

        Commas are removed before matching. Returns ``None`` when the input
        is empty or contains no number.
        """
        if not price_str:
            return None
        match = self._NUMBER_RE.search(price_str.replace(',', ''))
        return float(match.group()) if match else None

    def validate_data(self, title, price):
        """Return True when the title has at least 2 characters and the price is positive."""
        if not title or len(title) < 2:
            return False
        if price is None or price <= 0:
            return False
        return True
#错误处理与异常捕获
#请求错误处理
class ErrorHandlingSpider(scrapy.Spider):
name = 'error_handling'
def start_requests(self):
urls = [
'http://example.com/valid-page',
'http://example.com/404-page',
]
for url in urls:
yield scrapy.Request(
url=url,
callback=self.parse,
errback=self.handle_error
)
def parse(self, response):
if response.status == 200:
yield {
'url': response.url,
'status': response.status,
'title': response.css('title::text').get(),
'success': True
}
def handle_error(self, failure):
request = failure.request
self.logger.error(f"Request failed: {request.url}, Error: {failure.value}")
yield {
'url': request.url,
'error': str(failure.value),
'success': False
}💡 核心要点: Spider是Scrapy的核心组件,掌握Request、Response、yield的使用是编写高效爬虫的关键。理解各种解析技术和错误处理方法,能够帮助你构建稳定可靠的爬虫系统。
🔗 相关教程推荐
- 创建你的首个工程 - 项目初始化
- Selector 选择器 - 数据提取技术
- Pipeline管道实战 - 数据处理管道

