#Scrapy电商全站爬取实战项目 - 从零构建多层级电商数据抓取系统
📂 所属阶段:第四阶段 — 实战演练(项目开发篇)
🔗 相关章节:Spider实战 · Selector选择器 · Pipeline管道实战 · 反爬对抗实战
#目录
#项目背景与目标
电商网站爬取是爬虫领域的经典应用,涉及多级分类、深度翻页、数据提取等挑战。本项目构建的系统具备以下核心能力:
- 多级分类处理:递归爬取多层级分类结构
- 深度翻页:智能处理商品列表页翻页
- 精准数据提取:提取商品标题、价格、规格等关键信息
- 反爬虫对抗:集成IP轮换、请求头伪装等策略
- 数据质量保证:实现去重、清洗、验证机制
#项目技术栈
- 爬虫框架:Scrapy 2.x
- 数据处理:Pydantic
- 存储方案:MongoDB/MySQL
- 代理管理:Scrapy-Proxy-Pool
#电商网站架构分析
典型电商网站结构可分为四层:
- 首页:包含导航菜单,链接到各级分类
- 分类页:展示子分类和该分类下的商品列表
- 列表页:展示商品卡片,支持翻页和筛选
- 详情页:展示商品完整信息
爬取难点主要集中在反爬虫机制、动态内容加载和网站结构变化上。
#项目架构设计
#项目目录结构
ecommerce_spider/
├── ecommerce_spider/
│ ├── spiders/ # 爬虫文件
│ ├── items/ # 数据模型
│ ├── pipelines/ # 管道(清洗、存储)
│ ├── middlewares/ # 中间件(反爬、代理)
│ ├── utils/ # 工具函数
│ └── settings.py # 配置
├── scrapy.cfg
└── requirements.txt
#核心组件关系
graph TD
A[起始URL] --> B[分类解析]
B --> C[列表页解析]
C --> D[详情页解析]
D --> E[数据清洗]
E --> F[数据存储]
G[反爬中间件] -.-> B
H[代理中间件] -.-> C
#数据模型定义
使用Scrapy的Item定义数据模型,结合ItemLoader简化数据处理:
# ecommerce_spider/items/product_item.py
import scrapy
from itemloaders.processors import TakeFirst, MapCompose, Join
from w3lib.html import remove_tags
import re
def clean_price(value):
    """Convert a scraped price into a float.

    Currency symbols, whitespace and any other non-numeric characters are
    removed, and commas are treated as thousands separators
    (``'¥1,299.00'`` -> ``1299.0``).

    Args:
        value: Raw price text, an already-numeric price, or ``None``.

    Returns:
        The price as a ``float``, or ``None`` when the input is empty or
        contains no parseable number.
    """
    if value is None:
        return None
    # Already-numeric prices (e.g. an item cleaned twice) pass straight
    # through instead of failing on ``.strip()``. ``bool`` is excluded
    # because it is an ``int`` subclass but never a price.
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return float(value)

    cleaned = re.sub(r'[^\d.,]', '', str(value).strip())
    try:
        return float(cleaned.replace(',', ''))
    except ValueError:
        # Nothing numeric was left (e.g. '' or '免费'), or the text is
        # malformed such as '1.2.3'.
        return None
class ProductItem(scrapy.Item):
    """Product record produced by the crawler.

    The processors declared on each field only take effect when the item is
    populated through an ``ItemLoader``; assigning ``item['field'] = value``
    directly stores the value untouched.
    """

    # Identity
    product_id = scrapy.Field(output_processor=TakeFirst())
    title = scrapy.Field(output_processor=TakeFirst())
    brand = scrapy.Field(output_processor=TakeFirst())
    # Category names are collected as a list and joined into one path string.
    category_path = scrapy.Field(output_processor=Join(' > '))

    # Prices: every collected value goes through clean_price, first result wins.
    current_price = scrapy.Field(
        input_processor=MapCompose(clean_price),
        output_processor=TakeFirst(),
    )
    original_price = scrapy.Field(
        input_processor=MapCompose(clean_price),
        output_processor=TakeFirst(),
    )

    # Media
    main_image = scrapy.Field(output_processor=TakeFirst())
    gallery_images = scrapy.Field()  # no processor declared here

    # Reviews
    rating = scrapy.Field(output_processor=TakeFirst())
    review_count = scrapy.Field(output_processor=TakeFirst())

    # Remaining attributes
    specifications = scrapy.Field()  # no processor declared here
    url = scrapy.Field(output_processor=TakeFirst())
    crawled_at = scrapy.Field()  # no processor declared here

#核心功能实现
#主爬虫框架
主爬虫负责串联分类、列表页、详情页的爬取流程:
# ecommerce_spider/spiders/ecommerce_spider.py
import re
from datetime import datetime
from urllib.parse import urljoin

import scrapy

from ecommerce_spider.items import ProductItem
class EcommerceSpider(scrapy.Spider):
    """Crawl category pages, paginated product lists and product detail pages.

    Spider arguments (``-a name=value``):
        start_urls: Category index URL(s). Either a list or a comma-separated
            string, which is how the value arrives from the command line.
        max_pages: Maximum number of list pages followed per category
            (default 100).
    """

    name = 'ecommerce'
    custom_settings = {
        'DOWNLOAD_DELAY': 2,
        'CONCURRENT_REQUESTS': 8,
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        start_urls = kwargs.get('start_urls', ['https://example.com/categories'])
        if isinstance(start_urls, str):
            # A plain string would otherwise be iterated one character at a
            # time in start_requests.
            start_urls = [u.strip() for u in start_urls.split(',') if u.strip()]
        self.start_urls = list(start_urls)
        self.max_pages = int(kwargs.get('max_pages', 100))

    def start_requests(self):
        """Schedule every start URL for category parsing."""
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse_categories)

    def parse_categories(self, response):
        """Follow each category link found on the category index page."""
        category_links = response.css('a.category-link::attr(href)').getall()
        for link in category_links:
            category_url = urljoin(response.url, link)
            yield scrapy.Request(category_url, callback=self.parse_product_list)

    def parse_product_list(self, response):
        """Follow product links and the next list page, up to ``max_pages``."""
        product_links = response.css('a.product-link::attr(href)').getall()
        for link in product_links:
            product_url = urljoin(response.url, link)
            yield scrapy.Request(product_url, callback=self.parse_product_detail)

        # The page counter travels in request meta; the first list page of a
        # category has no entry and counts as page 1.
        page = response.meta.get('page', 1)
        next_page = response.css('a.next::attr(href)').get()
        if next_page and page < self.max_pages:
            next_url = urljoin(response.url, next_page)
            yield scrapy.Request(
                next_url,
                callback=self.parse_product_list,
                meta={'page': page + 1},
            )

    def parse_product_detail(self, response):
        """Build a ProductItem from a product detail page."""
        item = ProductItem()
        item['product_id'] = self._extract_product_id(response)
        # A page without <h1> yields an empty title instead of raising on
        # ``None.strip()``.
        item['title'] = (response.css('h1::text').get() or '').strip()
        item['current_price'] = response.css('.price::text').get()
        # urljoin(base, None) returns the base URL, so a missing image must be
        # skipped rather than joined; otherwise the page URL is stored as the
        # image.
        image_src = response.css('.main-image img::attr(src)').get()
        if image_src:
            item['main_image'] = urljoin(response.url, image_src)
        item['url'] = response.url
        item['crawled_at'] = datetime.now().isoformat()
        yield item

    def _extract_product_id(self, response):
        """Return the trailing numeric segment of the URL, or ``'unknown'``."""
        match = re.search(r'/(\d+)/?$', response.url)
        return match.group(1) if match else 'unknown'

#多级分类解析
使用面包屑导航和URL结构解析分类层级:
# ecommerce_spider/utils/category_parser.py
from urllib.parse import urljoin
class CategoryParser:
    """Derive the category hierarchy of a page from its breadcrumb links."""

    def parse_hierarchy(self, response):
        """Return one entry per ``.breadcrumb a`` element, in document order.

        Args:
            response: Object exposing ``url`` and a ``css()`` selector method.

        Returns:
            A list of dicts with keys ``name`` (stripped link text, ``''``
            when the link has no direct text), ``url`` (absolute URL, or
            ``None`` when the link has no ``href``) and ``level`` (0-based
            position in the breadcrumb trail).
        """
        hierarchy = []
        for level, crumb in enumerate(response.css('.breadcrumb a')):
            name = (crumb.css('::text').get() or '').strip()
            href = crumb.css('::attr(href)').get()
            # urljoin(base, None) would return the current page URL, which is
            # misleading; report a missing href as None.
            url = urljoin(response.url, href) if href else None
            hierarchy.append({'name': name, 'url': url, 'level': level})
        return hierarchy

#智能翻页处理
支持多种翻页方式,包括链接翻页和URL参数翻页:
# ecommerce_spider/utils/pagination_handler.py
import re
from urllib.parse import urlparse, parse_qs, urlencode
class PaginationHandler:
    """Work out the URL of the next list page.

    Two pagination styles are recognised: a ``page`` query parameter
    (``?page=3``) and a ``/page/<n>`` path segment.
    """

    _PATH_PAGE_RE = re.compile(r'/page/(\d+)')

    def handle(self, response, max_pages):
        """Return the next page URL, or ``None`` once ``max_pages`` is reached.

        An explicit "next" link on the page is preferred. When there is none,
        the next URL is generated from the current one, so the caller must
        still stop when a generated page turns out to be empty.

        Args:
            response: Response exposing ``url``, ``css()`` and ``urljoin()``.
            max_pages: Highest page number that may be visited.
        """
        current_page = self._extract_current_page(response.url)
        if current_page >= max_pages:
            return None

        next_link = response.css('a[rel="next"]::attr(href), a.next::attr(href)').get()
        if next_link:
            # response.urljoin resolves against the response URL and needs no
            # extra import.
            return response.urljoin(next_link)

        return self._generate_next_url(response.url, current_page)

    def _extract_current_page(self, url):
        """Return the page number encoded in ``url``, defaulting to 1."""
        parsed = urlparse(url)
        query = parse_qs(parsed.query)
        if 'page' in query:
            try:
                return int(query['page'][0])
            except ValueError:
                # Non-numeric value such as ?page=abc: fall through to the
                # path check, then to the default.
                pass
        match = self._PATH_PAGE_RE.search(parsed.path)
        return int(match.group(1)) if match else 1

    def _generate_next_url(self, url, current_page):
        """Return ``url`` rewritten to point at ``current_page + 1``."""
        parsed = urlparse(url)
        next_page = current_page + 1
        # keep_blank_values preserves parameters such as ``?q=`` that
        # parse_qs would otherwise drop from the regenerated URL.
        query = parse_qs(parsed.query, keep_blank_values=True)

        if 'page' not in query and self._PATH_PAGE_RE.search(parsed.path):
            # Path-style pagination: rewrite the /page/<n> segment instead of
            # appending a query parameter the site does not use.
            new_path = self._PATH_PAGE_RE.sub(f'/page/{next_page}', parsed.path, count=1)
            return parsed._replace(path=new_path).geturl()

        query['page'] = [next_page]
        return parsed._replace(query=urlencode(query, doseq=True)).geturl()

#反爬虫对抗策略
#反爬虫中间件
集成User-Agent随机、请求头伪装和随机延迟:
# ecommerce_spider/middlewares/anti_crawler_middleware.py
import random
import time
from fake_useragent import UserAgent
class AntiCrawlerMiddleware:
    """Downloader middleware that disguises requests as browser traffic.

    Each request gets a random User-Agent plus fixed browser-like ``Accept``
    and ``Accept-Language`` headers.

    Request pacing is deliberately not done here. Scrapy runs on a
    single-threaded event loop, so ``time.sleep()`` inside a middleware stalls
    every in-flight request rather than just the current one. Use the
    ``DOWNLOAD_DELAY`` / ``RANDOMIZE_DOWNLOAD_DELAY`` settings (or AutoThrottle)
    for randomized delays.
    """

    def __init__(self):
        self.ua = UserAgent()

    def process_request(self, request, spider):
        """Set the disguise headers and let the request continue.

        Returns:
            ``None``, so Scrapy keeps processing the request through the
            remaining middlewares.
        """
        # Random User-Agent per request.
        request.headers['User-Agent'] = self.ua.random
        # Fixed browser-like content negotiation headers.
        request.headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
        request.headers['Accept-Language'] = 'zh-CN,zh;q=0.9,en;q=0.8'
        return None

#代理中间件
简单的代理列表轮换:
# ecommerce_spider/middlewares/proxy_middleware.py
import random
class ProxyMiddleware:
    """Downloader middleware that assigns a random proxy to every request.

    The proxy list can be passed to the constructor or supplied through the
    ``PROXY_LIST`` setting (read in ``from_crawler``). With neither, the
    placeholder entries in ``DEFAULT_PROXIES`` are used; replace them with
    real ``scheme://host:port`` URLs before crawling.
    """

    DEFAULT_PROXIES = (
        'http://proxy1:port',
        'http://proxy2:port',
    )

    def __init__(self, proxies=None):
        """Store the proxy pool.

        Args:
            proxies: Iterable of proxy URLs. ``None`` selects
                ``DEFAULT_PROXIES``; an empty iterable disables proxying.
        """
        self.proxies = list(self.DEFAULT_PROXIES if proxies is None else proxies)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from the ``PROXY_LIST`` setting, if present."""
        configured = crawler.settings.getlist('PROXY_LIST')
        return cls(proxies=configured or None)

    def process_request(self, request, spider):
        """Pick a random proxy for ``request`` when the pool is not empty.

        Returns:
            ``None``, so Scrapy keeps processing the request.
        """
        if self.proxies:
            request.meta['proxy'] = random.choice(self.proxies)
        return None

#数据去重与清洗
#数据清洗管道
清洗数据并验证必需字段:
# ecommerce_spider/pipelines/cleaning_pipeline.py
from ecommerce_spider.items import ProductItem
from itemloaders import ItemLoader
class CleaningPipeline:
    """Normalize title and price, and drop items that lack either one."""

    def process_item(self, item, spider):
        """Clean ``title`` and ``current_price`` in place and return the item.

        All other fields (product_id, url, images, ...) are left untouched so
        that later pipelines still see them.

        Raises:
            DropItem: When the cleaned title is empty or no price could be
                parsed. Returning ``None`` would not drop the item; it would
                hand ``None`` to the next pipeline.
        """
        # Local import keeps this snippet self-contained.
        from scrapy.exceptions import DropItem

        # A scratch loader is used only to run the ProductItem field
        # processors over the raw values.
        loader = ItemLoader(item=ProductItem())
        loader.add_value('title', (item.get('title') or '').strip())
        loader.add_value('current_price', item.get('current_price'))

        title = loader.get_output_value('title')
        price = loader.get_output_value('current_price')
        # Price is compared against None so a legitimate price of 0 is kept.
        if not title or price is None:
            raise DropItem(f"Missing required fields for item: {item.get('url')}")

        item['title'] = title
        item['current_price'] = price
        return item

#去重管道
基于商品ID和URL去重:
# ecommerce_spider/pipelines/deduplication_pipeline.py
class DeduplicationPipeline:
    """Drop items whose (product_id, url) pair was already seen in this run."""

    def __init__(self):
        # In-memory only: the set lives for one crawl and grows with the
        # number of distinct items.
        self.seen = set()

    def process_item(self, item, spider):
        """Return ``item`` the first time its key is seen.

        Raises:
            DropItem: For every later item with the same product_id and url.
                Returning ``None`` would not drop the item; it would pass
                ``None`` on to the next pipeline.
        """
        # A tuple key cannot collide the way an underscore-joined string can
        # ('a_b' + 'c' versus 'a' + 'b_c').
        identifier = (item.get('product_id'), item.get('url'))
        if identifier in self.seen:
            # Imported only on the duplicate path, which is the only place
            # it is needed.
            from scrapy.exceptions import DropItem
            raise DropItem(f"Duplicate item: {item.get('title')}")
        self.seen.add(identifier)
        return item

#性能优化与部署
#性能优化配置
调整Scrapy设置以提高性能:
# ecommerce_spider/settings.py
# Global and per-domain concurrency limits.
CONCURRENT_REQUESTS = 32
CONCURRENT_REQUESTS_PER_DOMAIN = 8

# Base delay (seconds) between requests to the same site.
# RANDOMIZE_DOWNLOAD_DELAY is a boolean switch, not a factor: when enabled,
# Scrapy waits a random 0.5x-1.5x of DOWNLOAD_DELAY.
DOWNLOAD_DELAY = 1
RANDOMIZE_DOWNLOAD_DELAY = True

# AutoThrottle adapts the delay to the observed latency of the site.
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_TARGET_CONCURRENCY = 4.0

# Components are inactive until registered here. Both middlewares must run
# before the built-in HttpProxyMiddleware (priority 750) so that
# request.meta['proxy'] is already set when it runs.
DOWNLOADER_MIDDLEWARES = {
    'ecommerce_spider.middlewares.anti_crawler_middleware.AntiCrawlerMiddleware': 543,
    'ecommerce_spider.middlewares.proxy_middleware.ProxyMiddleware': 544,
}

# Lower numbers run first: clean the item, then deduplicate it.
ITEM_PIPELINES = {
    'ecommerce_spider.pipelines.cleaning_pipeline.CleaningPipeline': 300,
    'ecommerce_spider.pipelines.deduplication_pipeline.DeduplicationPipeline': 400,
}

#Docker部署
使用Docker简化部署:
# Dockerfile
FROM python:3.9-slim
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
CMD ["scrapy", "crawl", "ecommerce"]
# docker-compose.yml
version: '3.8'
services:
  spider:
    build: .
    volumes:
      - ./output:/app/output
  mongodb:
    image: mongo:4.4
    ports:
      - "27017:27017"
    volumes:
      - mongodb_data:/data/db
volumes:
  mongodb_data:

通过本项目的实施,你将掌握电商全站爬取的核心技术,构建出高效、稳定的数据采集系统,为后续的数据分析提供有力支撑。

