Python Scraper

Complete integration guide for monitoring the Python scraper.

Information

Property     Value
Label        app=python-scraper
Retention    7 days (prod), 30 days (errors)
Log type     Structured JSON recommended

Docker Compose Integration

Project Structure

python-scraper/
├── .github/
│   └── workflows/
│       └── deploy.yml
├── docker-compose.yml
├── docker-compose.staging.yml
├── docker-compose.prod.yml
├── Dockerfile
├── alloy/
│   └── config.alloy
├── .env.example
├── requirements.txt
└── src/
    ├── main.py
    └── logger.py

Dockerfile

FROM python:3.12-slim

WORKDIR /app

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY src/ ./src/

CMD ["python", "-u", "src/main.py"]

Important

The -u (unbuffered) option is required so that Python logs reach Docker in real time (PYTHONUNBUFFERED=1 in docker-compose.yml has the same effect).
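
If neither the container command nor the environment can be changed, the same effect can be forced from inside the application. A minimal sketch, assuming Python 3.7+ (where reconfigure() is available):

import sys

# Force line-buffered stdout so each JSON log line is flushed to Docker as soon as it is written.
# In practice this matches what "python -u" or PYTHONUNBUFFERED=1 achieves for line-oriented logs.
sys.stdout.reconfigure(line_buffering=True)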

docker-compose.yml

# Note: 'version' is obsolete in Docker Compose v2+

services:
  # ═══════════════════════════════════════════════════════════════
  # SCRAPER
  # ═══════════════════════════════════════════════════════════════
  scraper:
    build: .
    container_name: python-scraper
    restart: unless-stopped
    environment:
      - PYTHONUNBUFFERED=1
      - LOG_LEVEL=${LOG_LEVEL:-INFO}
    labels:
      - "app=python-scraper"
    volumes:
      - ./data:/app/data

  # ═══════════════════════════════════════════════════════════════
  # ALLOY - Log Collection
  # ═══════════════════════════════════════════════════════════════
  alloy:
    image: grafana/alloy:latest
    container_name: python-scraper-alloy
    restart: unless-stopped
    volumes:
      - ./alloy/config.alloy:/etc/alloy/config.alloy:ro
      - /var/run/docker.sock:/var/run/docker.sock:ro
      - /var/lib/docker/containers:/var/lib/docker/containers:ro
      - alloy_data:/var/lib/alloy
    command:
      - run
      - --server.http.listen-addr=0.0.0.0:12345
      - --storage.path=/var/lib/alloy
      - /etc/alloy/config.alloy
    environment:
      - LOKI_URL=${LOKI_URL}
      - LOKI_USER=${LOKI_USER}
      - LOKI_PASSWORD=${LOKI_PASSWORD}
      - APP_ENV=${APP_ENV:-prod}
      - HOST_NAME=${HOST_NAME:-scraper-server}
    healthcheck:
      test: ["CMD", "wget", "-q", "--spider", "http://localhost:12345/ready"]
      interval: 30s
      timeout: 10s
      retries: 3

volumes:
  alloy_data:

alloy/config.alloy

// Discovery Docker
discovery.docker "containers" {
  host = "unix:///var/run/docker.sock"
}

// Relabeling
discovery.relabel "docker_labels" {
  targets = discovery.docker.containers.targets

  rule {
    source_labels = ["__meta_docker_container_name"]
    regex         = "/(.*)"
    target_label  = "container"
  }

  rule {
    source_labels = ["__meta_docker_container_label_app"]
    target_label  = "app"
  }

  rule {
    source_labels = ["app"]
    regex         = ".+"
    action        = "keep"
  }
}

// Source
loki.source.docker "docker_logs" {
  host       = "unix:///var/run/docker.sock"
  targets    = discovery.relabel.docker_labels.output
  labels     = {
    "env"  = env("APP_ENV"),
    "host" = env("HOST_NAME"),
  }
  forward_to = [loki.process.pipeline.receiver]
}

// Processing Pipeline
loki.process "pipeline" {
  forward_to = [loki.write.loki_remote.receiver]

  // Parse JSON logs
  stage.json {
    expressions = {
      level     = "level",
      message   = "message",
      timestamp = "timestamp",
      module    = "module",
      action    = "action",
      url       = "url",
      items     = "items",
      error     = "error",
    }
  }

  // Normalize level
  stage.template {
    source   = "level"
    template = "{{ ToLower .Value }}"
  }

  stage.labels {
    values = {
      level  = "",
      action = "",
    }
  }

  // Drop empty
  stage.drop {
    expression = "^\\s*$"
  }
}

// Send to Loki
loki.write "loki_remote" {
  endpoint {
    url = env("LOKI_URL")
    basic_auth {
      username = env("LOKI_USER")
      password = env("LOKI_PASSWORD")
    }
  }
}
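
For reference, stage.json above expects one JSON object per log line, as produced by the logger shown further down. An illustrative line (values are made up):

{"timestamp": "2024-05-01T12:00:00+00:00", "level": "INFO", "message": "Scraped 2 items", "module": "main", "function": "scrape_url", "line": 27, "action": "success", "url": "https://example.com/page1", "items": 2, "duration": 850}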

.env.example

# Application
LOG_LEVEL=INFO

# Monitoring
LOKI_URL=https://loki.monitoring.lyroh.com/loki/api/v1/push
LOKI_USER=loki-push
LOKI_PASSWORD=your_password_here
APP_ENV=prod
HOST_NAME=scraper-server-1
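
For local runs outside Docker, the same variables can be loaded from the .env file. A minimal sketch, assuming the python-dotenv package is added to requirements.txt:

import os
from dotenv import load_dotenv  # assumption: python-dotenv listed in requirements.txt

load_dotenv()  # reads .env from the working directory into os.environ
print(os.getenv("LOG_LEVEL", "INFO"))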

Recommended Python Logger

src/logger.py

import json
import logging
import os
import sys
from datetime import datetime, timezone


class JSONFormatter(logging.Formatter):
    """Format logs as JSON for Loki parsing."""

    def format(self, record: logging.LogRecord) -> str:
        log_obj = {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "level": record.levelname,
            "message": record.getMessage(),
            "module": record.module,
            "function": record.funcName,
            "line": record.lineno,
        }

        # Add extra fields
        if hasattr(record, "action"):
            log_obj["action"] = record.action
        if hasattr(record, "url"):
            log_obj["url"] = record.url
        if hasattr(record, "items"):
            log_obj["items"] = record.items
        if hasattr(record, "duration"):
            log_obj["duration"] = record.duration

        # Add exception info
        if record.exc_info:
            log_obj["exception"] = self.formatException(record.exc_info)
            log_obj["level"] = "ERROR"

        return json.dumps(log_obj, ensure_ascii=False)


def setup_logger(name: str = "scraper", level: str = "INFO") -> logging.Logger:
    """Configure and return a JSON logger."""
    logger = logging.getLogger(name)
    logger.setLevel(getattr(logging, level.upper()))

    # Remove existing handlers
    logger.handlers.clear()

    # Console handler with JSON format
    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(JSONFormatter())
    logger.addHandler(handler)

    return logger


# Global logger instance (level driven by the LOG_LEVEL variable from docker-compose.yml / .env)
logger = setup_logger(level=os.getenv("LOG_LEVEL", "INFO"))

src/main.py (usage example)

import time
from logger import logger


def scrape_url(url: str) -> list:
    """Example scraping function with logging."""
    start = time.time()

    logger.info(
        f"Starting scrape",
        extra={"action": "start", "url": url}
    )

    try:
        # Your scraping logic here
        items = [{"id": 1}, {"id": 2}]
        duration = int((time.time() - start) * 1000)

        logger.info(
            f"Scraped {len(items)} items",
            extra={
                "action": "success",
                "url": url,
                "items": len(items),
                "duration": duration
            }
        )
        return items

    except Exception as e:
        logger.error(
            f"Scrape failed: {e}",
            extra={"action": "error", "url": url},
            exc_info=True
        )
        raise


def main():
    logger.info("Scraper started", extra={"action": "init"})

    urls = [
        "https://example.com/page1",
        "https://example.com/page2",
    ]

    for url in urls:
        try:
            scrape_url(url)
        except Exception:
            pass  # Already logged

    logger.info("Scraper finished", extra={"action": "complete"})


if __name__ == "__main__":
    main()

CI/CD

Branch Strategy

Branch     Environment    APP_ENV
staging    Staging        staging
main       Production     prod

A single monitoring stack

Staging and production send their logs to the same monitoring server.

Complete CI/CD guide

See the detailed guide: CI/CD GitHub Actions

GitHub Actions - .github/workflows/deploy.yml

name: Deploy Python Scraper

on:
  push:
    branches: [main, staging]

jobs:
  deploy-staging:
    if: github.ref_name == 'staging'
    runs-on: ubuntu-latest
    environment: staging

    steps:
      - uses: actions/checkout@v4

      - name: Deploy to Staging
        uses: appleboy/ssh-action@v1.0.3
        with:
          host: ${{ secrets.STAGING_SERVER_HOST }}
          username: ${{ secrets.SERVER_USER }}
          key: ${{ secrets.SERVER_SSH_KEY }}
          script: |
            cd /opt/python-scraper

            git pull origin staging

            cat > .env << 'EOF'
            LOG_LEVEL=DEBUG
            LOKI_URL=${{ secrets.LOKI_URL }}
            LOKI_USER=${{ secrets.LOKI_USER }}
            LOKI_PASSWORD=${{ secrets.LOKI_PASSWORD }}
            APP_ENV=staging
            EOF
            # Appended outside the quoted heredoc so that $(hostname) is actually expanded
            echo "HOST_NAME=$(hostname)" >> .env

            docker-compose -f docker-compose.yml -f docker-compose.staging.yml build --no-cache scraper
            docker-compose -f docker-compose.yml -f docker-compose.staging.yml up -d

  deploy-prod:
    if: github.ref_name == 'main'
    runs-on: ubuntu-latest
    environment: production

    steps:
      - uses: actions/checkout@v4

      - name: Deploy to Production
        uses: appleboy/ssh-action@v1.0.3
        with:
          host: ${{ secrets.PROD_SERVER_HOST }}
          username: ${{ secrets.SERVER_USER }}
          key: ${{ secrets.SERVER_SSH_KEY }}
          script: |
            cd /opt/python-scraper

            git pull origin main

            cat > .env << 'EOF'
            LOG_LEVEL=INFO
            LOKI_URL=${{ secrets.LOKI_URL }}
            LOKI_USER=${{ secrets.LOKI_USER }}
            LOKI_PASSWORD=${{ secrets.LOKI_PASSWORD }}
            APP_ENV=prod
            EOF
            # Appended outside the quoted heredoc so that $(hostname) is actually expanded
            echo "HOST_NAME=$(hostname)" >> .env

            docker-compose -f docker-compose.yml -f docker-compose.prod.yml build --no-cache scraper
            docker-compose -f docker-compose.yml -f docker-compose.prod.yml up -d

docker-compose.staging.yml

# Note: 'version' is obsolete in Docker Compose v2+

services:
  scraper:
    labels:
      - "app=python-scraper"
      - "env=staging"

  alloy:
    environment:
      - APP_ENV=staging

docker-compose.prod.yml

# Note: 'version' is obsolete in Docker Compose v2+

services:
  scraper:
    labels:
      - "app=python-scraper"
      - "env=prod"

  alloy:
    environment:
      - APP_ENV=prod

Useful LogQL Queries

# All logs
{app="python-scraper", env="prod"}

# Successful scrape actions
{app="python-scraper"} | json | action="success"

# Errors only (the level label is lowercased by the Alloy pipeline)
{app="python-scraper", level="error"}

# Failing URLs
{app="python-scraper"} | json | action="error" | line_format "{{.url}}: {{.message}}"

# Items scraped (total over the last hour)
sum_over_time({app="python-scraper"} | json | action="success" | unwrap items | __error__="" [1h])

# Average scrape duration
avg_over_time({app="python-scraper"} | json | action="success" | unwrap duration | __error__="" [1h])
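
The same data can be pulled outside Grafana through Loki's HTTP API. A minimal sketch using the credentials from .env; it assumes the query_range endpoint sits next to the push endpoint configured in LOKI_URL (adjust the URL if Loki is exposed differently), and that requests is added to requirements.txt:

import os
import requests  # assumption: requests added to requirements.txt

# LOKI_URL points at .../loki/api/v1/push; the read API is the sibling query_range endpoint.
query_url = os.environ["LOKI_URL"].replace("/push", "/query_range")

resp = requests.get(
    query_url,
    params={"query": '{app="python-scraper"} | json | action="error"', "limit": 100},
    auth=(os.environ["LOKI_USER"], os.environ["LOKI_PASSWORD"]),
    timeout=10,
)
resp.raise_for_status()

# Each stream carries its label set plus a list of [timestamp, line] pairs.
for stream in resp.json()["data"]["result"]:
    for _ts, line in stream["values"]:
        print(line)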

Recommended Alerts

# In Grafana Alerting
- Silent scraper (no logs for 30 minutes)
- Error rate > 10% over 5 minutes
- Scrape duration > 60 seconds