Scrapy电商全站爬取实战项目 - 从零构建多层级电商数据抓取系统

📂 所属阶段:第四阶段 — 实战演练(项目开发篇)
🔗 相关章节:Spider实战 · Selector选择器 · Pipeline管道实战 · 反爬对抗实战

目录

  • 项目背景与目标
  • 电商网站架构分析
  • 项目架构设计
  • 数据模型定义
  • 核心功能实现
  • 反爬虫对抗策略
  • 数据去重与清洗
  • 性能优化与部署

项目背景与目标

电商网站爬取是爬虫领域的经典应用,涉及多级分类、深度翻页、数据提取等挑战。本项目构建的系统具备以下核心能力:

  • 多级分类处理:递归爬取多层级分类结构
  • 深度翻页:智能处理商品列表页翻页
  • 精准数据提取:提取商品标题、价格、规格等关键信息
  • 反爬虫对抗:集成IP轮换、请求头伪装等策略
  • 数据质量保证:实现去重、清洗、验证机制

项目技术栈

  • 爬虫框架:Scrapy 2.x
  • 数据处理:Scrapy Item + ItemLoader(可按需配合 Pydantic 做数据校验)
  • 存储方案:MongoDB/MySQL
  • 代理管理:Scrapy-Proxy-Pool

电商网站架构分析

典型电商网站结构可分为四层:

  1. 首页:包含导航菜单,链接到各级分类
  2. 分类页:展示子分类和该分类下的商品列表
  3. 列表页:展示商品卡片,支持翻页和筛选
  4. 详情页:展示商品完整信息

爬取难点主要集中在反爬虫机制、动态内容加载和网站结构变化上。

项目架构设计

项目目录结构

ecommerce_spider/
├── ecommerce_spider/
│   ├── spiders/         # 爬虫文件
│   ├── items/           # 数据模型
│   ├── pipelines/       # 管道(清洗、存储)
│   ├── middlewares/     # 中间件(反爬、代理)
│   ├── utils/           # 工具函数
│   └── settings.py      # 配置
├── scrapy.cfg
└── requirements.txt

核心组件关系

graph TD
    A[起始URL] --> B[分类解析]
    B --> C[列表页解析]
    C --> D[详情页解析]
    D --> E[数据清洗]
    E --> F[数据存储]
    G[反爬中间件] -.-> B
    H[代理中间件] -.-> C

数据模型定义

使用Scrapy的Item定义数据模型,结合ItemLoader简化数据处理:

# ecommerce_spider/items/product_item.py
import scrapy
from itemloaders.processors import TakeFirst, MapCompose, Join
from w3lib.html import remove_tags
import re

def clean_price(value):
    """Convert a scraped price into a float.

    Accepts raw text such as ``'$1,299.00'`` as well as values that are
    already numeric (for example when an item passes through a loader a
    second time). Commas are treated as thousands separators.

    Args:
        value: Price text, a number, or ``None``.

    Returns:
        The price as a ``float``, or ``None`` when no number can be parsed.
    """
    # bool is a subclass of int; a True/False flag is never a price.
    if isinstance(value, bool):
        return None
    if isinstance(value, (int, float)):
        return float(value)
    if not isinstance(value, str):
        return None

    # Drop currency symbols, whitespace and any other non-numeric characters,
    # then remove the thousands separators.
    digits = re.sub(r'[^\d.,]', '', value).replace(',', '')
    if not digits:
        return None
    try:
        return float(digits)
    except ValueError:
        # e.g. '1.2.3' or a lone '.'
        return None

def _single_value_field(**processors):
    """Build a field whose output processor is ``TakeFirst()``."""
    return scrapy.Field(output_processor=TakeFirst(), **processors)


def _price_field():
    """Build a ``TakeFirst`` field whose input is mapped through ``clean_price``."""
    return _single_value_field(input_processor=MapCompose(clean_price))


class ProductItem(scrapy.Item):
    """Data model for one scraped product.

    The processors declared on each field only take effect when the item is
    populated through an ``ItemLoader``.
    """

    # Identity
    product_id = _single_value_field()
    title = _single_value_field()
    brand = _single_value_field()
    # Collected path segments are joined into one ' > '-separated string.
    category_path = scrapy.Field(output_processor=Join(' > '))

    # Pricing
    current_price = _price_field()
    original_price = _price_field()

    # Media
    main_image = _single_value_field()
    gallery_images = scrapy.Field()

    # Reviews
    rating = _single_value_field()
    review_count = _single_value_field()

    # Misc
    specifications = scrapy.Field()
    url = _single_value_field()
    crawled_at = scrapy.Field()

核心功能实现

主爬虫框架

主爬虫负责串联分类、列表页、详情页的爬取流程:

# ecommerce_spider/spiders/ecommerce_spider.py
import scrapy
from urllib.parse import urljoin
from ecommerce_spider.items import ProductItem
from datetime import datetime

class EcommerceSpider(scrapy.Spider):
    """Crawl category pages, then product listings, then product detail pages.

    Spider arguments:
        start_urls: Entry URL(s), either a list or a comma-separated string
            (the form produced by ``scrapy crawl ecommerce -a start_urls=...``).
        max_pages: Maximum number of listing pages followed per category
            (default 100).
    """

    name = 'ecommerce'
    custom_settings = {
        'DOWNLOAD_DELAY': 2,
        'CONCURRENT_REQUESTS': 8,
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        start_urls = kwargs.get('start_urls') or ['https://example.com/categories']
        # Command-line arguments arrive as plain strings; iterating a string
        # would yield single characters instead of URLs.
        if isinstance(start_urls, str):
            start_urls = [u.strip() for u in start_urls.split(',') if u.strip()]
        self.start_urls = list(start_urls)
        self.max_pages = int(kwargs.get('max_pages', 100))

    def start_requests(self):
        """Schedule every start URL for category parsing."""
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse_categories)

    def parse_categories(self, response):
        """Follow each category link found on a category index page."""
        for link in response.css('a.category-link::attr(href)').getall():
            yield scrapy.Request(response.urljoin(link), callback=self.parse_product_list)

    def parse_product_list(self, response):
        """Follow each product on a listing page, then the next listing page."""
        for link in response.css('a.product-link::attr(href)').getall():
            yield scrapy.Request(response.urljoin(link), callback=self.parse_product_detail)

        # The page counter travels in request meta; the first listing page of
        # a category has no counter yet and counts as page 1.
        page = response.meta.get('page', 1)
        next_page = response.css('a.next::attr(href)').get()
        if next_page and page < self.max_pages:
            yield scrapy.Request(
                response.urljoin(next_page),
                callback=self.parse_product_list,
                meta={'page': page + 1},
            )

    def parse_product_detail(self, response):
        """Build a ``ProductItem`` from a product detail page."""
        item = ProductItem()
        item['product_id'] = self._extract_product_id(response)
        # .get() returns None when nothing matches; fall back to '' so a page
        # without <h1> does not crash the callback.
        item['title'] = (response.css('h1::text').get() or '').strip()
        item['current_price'] = response.css('.price::text').get()
        image_src = response.css('.main-image img::attr(src)').get()
        if image_src:
            # Only set the field when an image exists; joining a missing src
            # would store the page URL itself as the image.
            item['main_image'] = response.urljoin(image_src)
        item['url'] = response.url
        item['crawled_at'] = datetime.now().isoformat()
        yield item

    def _extract_product_id(self, response):
        """Return the trailing numeric path segment of the URL, or 'unknown'."""
        # Ignore fragment and query string so '/item/123?ref=x' still yields '123'.
        path = response.url.split('#', 1)[0].split('?', 1)[0]
        last_segment = path.rstrip('/').rsplit('/', 1)[-1]
        return last_segment if last_segment.isdecimal() else 'unknown'

多级分类解析

通过页面上的面包屑导航解析分类层级(下例仅实现面包屑解析,基于URL结构的解析可按需扩展):

# ecommerce_spider/utils/category_parser.py
from urllib.parse import urljoin

class CategoryParser:
    """Extract a page's category hierarchy from its breadcrumb links."""

    def parse_hierarchy(self, response):
        """Return the breadcrumb trail as an ordered list of category dicts.

        Args:
            response: A response object offering ``.css()`` and ``.url``.

        Returns:
            A list of ``{'name', 'url', 'level'}`` dicts in page order, where
            ``level`` counts the kept breadcrumbs from 0.
        """
        hierarchy = []
        for crumb in response.css('.breadcrumb a'):
            # Join every text node: the first one alone may be missing or
            # whitespace-only when the label is wrapped in nested markup.
            name = ''.join(crumb.css('::text').getall()).strip()
            if not name:
                # A link without any text carries no category name.
                continue
            href = crumb.css('::attr(href)').get()
            # A crumb without href resolves to the page's own URL.
            url = urljoin(response.url, href) if href else response.url
            hierarchy.append({'name': name, 'url': url, 'level': len(hierarchy)})
        return hierarchy

智能翻页处理

支持多种翻页方式,包括链接翻页和URL参数翻页:

# ecommerce_spider/utils/pagination_handler.py
import re
from urllib.parse import urlparse, parse_qs, urlencode

class PaginationHandler:
    """Work out the URL of the next listing page.

    Two pagination styles are understood: a ``page`` query parameter
    (``?page=3``) and a ``/page/3`` path segment.
    """

    # Matches the page number of a path-style pagination segment.
    _PATH_PAGE_RE = re.compile(r'/page/(\d+)')

    def handle(self, response, max_pages):
        """Return the next page URL, or ``None`` once ``max_pages`` is reached.

        An explicit "next" link on the page is preferred; when there is none
        the next URL is generated from the current one.

        Args:
            response: A response object offering ``.url``, ``.css()`` and
                ``.urljoin()``.
            max_pages: Highest page number that may be visited.
        """
        current_page = self._extract_current_page(response.url)
        if current_page >= max_pages:
            return None

        next_link = response.css('a[rel="next"]::attr(href), a.next::attr(href)').get()
        if next_link:
            # Resolve relative hrefs against the page URL.
            return response.urljoin(next_link)

        return self._generate_next_url(response.url, current_page)

    def _extract_current_page(self, url):
        """Return the page number encoded in ``url``, defaulting to 1."""
        page_values = parse_qs(urlparse(url).query).get('page')
        if page_values:
            try:
                return int(page_values[0])
            except ValueError:
                # Non-numeric ?page= value: fall through to the path check.
                pass
        match = self._PATH_PAGE_RE.search(url)
        return int(match.group(1)) if match else 1

    def _generate_next_url(self, url, current_page):
        """Return ``url`` rewritten to point at page ``current_page + 1``."""
        parsed = urlparse(url)
        # Keep blank parameters (e.g. '?sort=') so the rebuilt URL stays
        # equivalent to the original apart from the page number.
        query = parse_qs(parsed.query, keep_blank_values=True)

        # Path-style pagination: bump the number inside the path instead of
        # appending a query parameter the site does not use.
        if 'page' not in query and self._PATH_PAGE_RE.search(parsed.path):
            new_path = self._PATH_PAGE_RE.sub(
                f'/page/{current_page + 1}', parsed.path, count=1
            )
            return parsed._replace(path=new_path).geturl()

        query['page'] = [current_page + 1]
        new_query = urlencode(query, doseq=True)
        return parsed._replace(query=new_query).geturl()

反爬虫对抗策略

反爬虫中间件

集成User-Agent随机、请求头伪装和随机延迟:

# ecommerce_spider/middlewares/anti_crawler_middleware.py
import random
import time
from fake_useragent import UserAgent

class AntiCrawlerMiddleware:
    """Downloader middleware that makes requests look like browser traffic.

    Request pacing is deliberately not done here. Scrapy runs middlewares on
    its single event-loop thread, so a blocking ``time.sleep()`` in
    ``process_request`` would freeze every in-flight request. Use the
    ``DOWNLOAD_DELAY``, ``RANDOMIZE_DOWNLOAD_DELAY`` and ``AUTOTHROTTLE_*``
    settings to slow the crawl down instead.
    """

    def __init__(self):
        self.ua = UserAgent()

    def process_request(self, request, spider):
        """Set browser-like headers on ``request`` and let processing continue.

        Returns:
            ``None``, so the request goes on to the next middleware.
        """
        # A different User-Agent is drawn for every request.
        request.headers['User-Agent'] = self.ua.random
        # Fixed browser-style content negotiation headers.
        request.headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
        request.headers['Accept-Language'] = 'zh-CN,zh;q=0.9,en;q=0.8'
        return None

代理中间件

简单的代理列表轮换:

# ecommerce_spider/middlewares/proxy_middleware.py
import random

class ProxyMiddleware:
    """Downloader middleware that assigns a random proxy to every request.

    The proxy pool can be passed to the constructor or configured through a
    ``PROXY_LIST`` project setting; without either, the built-in placeholder
    list is used.
    """

    # Placeholder entries; replace with real proxy URLs.
    DEFAULT_PROXIES = (
        'http://proxy1:port',
        'http://proxy2:port',
    )

    def __init__(self, proxies=None):
        """Store the proxy pool.

        Args:
            proxies: Iterable of proxy URLs. ``None`` selects
                ``DEFAULT_PROXIES``; an empty iterable disables proxying.
        """
        self.proxies = list(self.DEFAULT_PROXIES if proxies is None else proxies)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from the crawler's ``PROXY_LIST`` setting."""
        configured = crawler.settings.getlist('PROXY_LIST')
        # An unset or empty setting falls back to the defaults.
        return cls(configured or None)

    def process_request(self, request, spider):
        """Pick a random proxy for ``request``; returns ``None`` to continue."""
        if self.proxies:
            request.meta['proxy'] = random.choice(self.proxies)
        return None

数据去重与清洗

数据清洗管道

清洗数据并验证必需字段:

# ecommerce_spider/pipelines/cleaning_pipeline.py
from ecommerce_spider.items import ProductItem
from itemloaders import ItemLoader

class CleaningPipeline:
    """Item pipeline that normalises title and price and enforces required fields."""

    def process_item(self, item, spider):
        """Clean ``item`` in place and return it, or drop it when incomplete.

        Args:
            item: The scraped item; read and updated through mapping access.
            spider: The running spider; its logger reports dropped items.

        Returns:
            The same ``item`` with cleaned ``title`` and ``current_price``.

        Raises:
            DropItem: When the title is empty or the price cannot be parsed.
        """
        # Function-scope import keeps this block self-contained.
        from scrapy.exceptions import DropItem

        # The scratch loader exists only to run the field processors declared
        # on ProductItem. Results are written back onto the incoming item so
        # fields that are not cleaned here (product_id, url, ...) survive.
        loader = ItemLoader(item=ProductItem())
        loader.add_value('title', (item.get('title') or '').strip())
        loader.add_value('current_price', item.get('current_price'))

        title = loader.get_output_value('title')
        price = loader.get_output_value('current_price')

        # Compare the price against None explicitly: 0.0 is a valid price.
        if not title or price is None:
            spider.logger.warning(f"Missing required fields for item: {item.get('url')}")
            # Returning None would not drop the item; it would hand None to
            # the next pipeline. Raising DropItem is how processing is stopped.
            raise DropItem(f"Missing required fields for item: {item.get('url')}")

        item['title'] = title
        item['current_price'] = price
        return item

去重管道

基于商品ID和URL去重:

# ecommerce_spider/pipelines/deduplication_pipeline.py
class DeduplicationPipeline:
    """Item pipeline that drops items already seen during this crawl.

    Two items are duplicates when both their ``product_id`` and ``url``
    match. Seen keys are kept in memory on the pipeline instance.
    """

    def __init__(self):
        self.seen = set()

    def process_item(self, item, spider):
        """Return ``item`` on first sight; raise ``DropItem`` for a repeat.

        Raises:
            DropItem: When an item with the same product id and URL has
                already passed through this pipeline.
        """
        identifier = f"{item.get('product_id')}_{item.get('url')}"
        if identifier in self.seen:
            spider.logger.info(f"Duplicate item: {item.get('title')}")
            # Imported only where it is needed.
            from scrapy.exceptions import DropItem
            # Returning None would pass None on to later pipelines instead of
            # discarding the item; DropItem stops processing.
            raise DropItem(f"Duplicate item: {identifier}")
        self.seen.add(identifier)
        return item

性能优化与部署

性能优化配置

调整Scrapy设置以提高性能:

# ecommerce_spider/settings.py
# Upper bound on requests in flight across all domains.
CONCURRENT_REQUESTS = 32
# Upper bound on requests in flight to any single domain.
CONCURRENT_REQUESTS_PER_DOMAIN = 8
# Base delay, in seconds, between requests to the same site.
DOWNLOAD_DELAY = 1
# Boolean switch, not a ratio: when enabled, Scrapy waits a random interval
# between 0.5x and 1.5x DOWNLOAD_DELAY.
RANDOMIZE_DOWNLOAD_DELAY = True
# Let AutoThrottle adapt the delay to the observed latency.
AUTOTHROTTLE_ENABLED = True
# Average number of parallel requests AutoThrottle aims to send per remote site.
AUTOTHROTTLE_TARGET_CONCURRENCY = 4.0

Docker部署

使用Docker简化部署:

# Dockerfile
# Image that installs the project's dependencies and runs the "ecommerce" spider.
FROM python:3.9-slim
# All following paths are relative to /app inside the image.
WORKDIR /app
# Copy the dependency list on its own first, so the install layer below can be
# reused from cache when only source files change.
COPY requirements.txt .
# --no-cache-dir keeps pip's download cache out of the image.
RUN pip install --no-cache-dir -r requirements.txt
# Copy the rest of the build context (the Scrapy project) into /app.
COPY . .
# Default command: start the spider registered under the name "ecommerce".
CMD ["scrapy", "crawl", "ecommerce"]
# docker-compose.yml
# Runs the spider container together with a MongoDB instance.
version: '3.8'
services:
  spider:
    # Built from the Dockerfile in this directory.
    build: .
    # Start MongoDB before the spider. This only orders container start-up;
    # it does not wait for the database to accept connections.
    depends_on:
      - mongodb
    volumes:
      # Files written to /app/output appear in ./output on the host.
      - ./output:/app/output
  mongodb:
    image: mongo:4.4
    ports:
      # NOTE(review): this publishes MongoDB on every host interface; bind to
      # 127.0.0.1 or drop the mapping if only the spider needs access.
      - "27017:27017"
    volumes:
      # Named volume keeps the database files across container restarts.
      - mongodb_data:/data/db
volumes:
  mongodb_data:

通过本项目的实施,你将掌握电商全站爬取的核心技术,构建出高效、稳定的数据采集系统,为后续的数据分析提供有力支撑。