Python Scraper
Complete integration guide for monitoring the Python scraper.
Overview
| Property | Value |
|---|---|
| app label | python-scraper |
| Retention | 7 days (prod), 30 days (errors) |
| Log type | Structured JSON recommended |
Docker Compose Integration
Project Structure
python-scraper/
├── .github/
│   └── workflows/
│       └── deploy.yml
├── docker-compose.yml
├── docker-compose.staging.yml
├── docker-compose.prod.yml
├── Dockerfile
├── alloy/
│   └── config.alloy
├── .env.example
├── requirements.txt
└── src/
    ├── main.py
    └── logger.py
Dockerfile
FROM python:3.12-slim
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY src/ ./src/
CMD ["python", "-u", "src/main.py"]
Important
The -u (unbuffered) flag is required so that Python log output reaches Docker in real time instead of being held in a buffer.
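If the flag cannot be kept on the command line, the same effect can be achieved from Python itself (the docker-compose.yml below also sets PYTHONUNBUFFERED=1, which covers the same need). A minimal sketch:

import sys

# Alternative to "python -u": force line-buffered stdout at startup (Python 3.7+)
sys.stdout.reconfigure(line_buffering=True)

# Or flush each write explicitly
print("scraper starting", flush=True)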
docker-compose.yml
# Note: 'version' is obsolete in Docker Compose v2+
services:
# ═══════════════════════════════════════════════════════════════
# SCRAPER
# ═══════════════════════════════════════════════════════════════
scraper:
build: .
container_name: python-scraper
restart: unless-stopped
environment:
- PYTHONUNBUFFERED=1
- LOG_LEVEL=${LOG_LEVEL:-INFO}
labels:
- "app=python-scraper"
volumes:
- ./data:/app/data
# ═══════════════════════════════════════════════════════════════
# ALLOY - Log Collection
# ═══════════════════════════════════════════════════════════════
alloy:
image: grafana/alloy:latest
container_name: python-scraper-alloy
restart: unless-stopped
volumes:
- ./alloy/config.alloy:/etc/alloy/config.alloy:ro
- /var/run/docker.sock:/var/run/docker.sock:ro
- /var/lib/docker/containers:/var/lib/docker/containers:ro
- alloy_data:/var/lib/alloy
command:
- run
- --server.http.listen-addr=0.0.0.0:12345
- --storage.path=/var/lib/alloy
- /etc/alloy/config.alloy
environment:
- LOKI_URL=${LOKI_URL}
- LOKI_USER=${LOKI_USER}
- LOKI_PASSWORD=${LOKI_PASSWORD}
- APP_ENV=${APP_ENV:-prod}
- HOST_NAME=${HOST_NAME:-scraper-server}
healthcheck:
test: ["CMD", "wget", "-q", "--spider", "http://localhost:12345/ready"]
interval: 30s
timeout: 10s
retries: 3
volumes:
alloy_data:
alloy/config.alloy
// Discovery Docker
discovery.docker "containers" {
host = "unix:///var/run/docker.sock"
}
// Relabeling
discovery.relabel "docker_labels" {
targets = discovery.docker.containers.targets
rule {
source_labels = ["__meta_docker_container_name"]
regex = "/(.*)"
target_label = "container"
}
rule {
source_labels = ["__meta_docker_container_label_app"]
target_label = "app"
}
rule {
source_labels = ["app"]
regex = ".+"
action = "keep"
}
}
// Source
loki.source.docker "docker_logs" {
host = "unix:///var/run/docker.sock"
targets = discovery.relabel.docker_labels.output
labels = {
"env" = env("APP_ENV"),
"host" = env("HOST_NAME"),
}
forward_to = [loki.process.pipeline.receiver]
}
// Processing Pipeline
loki.process "pipeline" {
forward_to = [loki.write.loki_remote.receiver]
// Parse JSON logs
stage.json {
expressions = {
level = "level",
message = "message",
timestamp = "timestamp",
module = "module",
action = "action",
url = "url",
items = "items",
error = "error",
}
}
// Normalize level
stage.template {
source = "level"
template = "{{ ToLower .Value }}"
}
stage.labels {
values = {
level = "",
action = "",
}
}
// Drop empty
stage.drop {
expression = "^\\s*$"
}
}
// Send to Loki
loki.write "loki_remote" {
endpoint {
url = env("LOKI_URL")
basic_auth {
username = env("LOKI_USER")
password = env("LOKI_PASSWORD")
}
}
}
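To check that Alloy actually started with this configuration, the /ready endpoint used by the docker-compose healthcheck can be probed directly. A minimal sketch, run from the scraper container and assuming both services share the Compose default network (the port is not published to the host in the file above):

import urllib.request

# Readiness probe against Alloy's built-in HTTP server (same endpoint as the healthcheck)
with urllib.request.urlopen("http://alloy:12345/ready", timeout=5) as resp:
    print(resp.status, resp.read().decode().strip())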
.env.example
# Application
LOG_LEVEL=INFO
# Monitoring
LOKI_URL=https://loki.monitoring.lyroh.com/loki/api/v1/push
LOKI_USER=loki-push
LOKI_PASSWORD=your_password_here
APP_ENV=prod
HOST_NAME=scraper-server-1
Recommended Python Logger
src/logger.py
import json
import logging
import os
import sys
from datetime import datetime, timezone
class JSONFormatter(logging.Formatter):
"""Format logs as JSON for Loki parsing."""
def format(self, record: logging.LogRecord) -> str:
log_obj = {
"timestamp": datetime.now(timezone.utc).isoformat(),
"level": record.levelname,
"message": record.getMessage(),
"module": record.module,
"function": record.funcName,
"line": record.lineno,
}
# Add extra fields
if hasattr(record, "action"):
log_obj["action"] = record.action
if hasattr(record, "url"):
log_obj["url"] = record.url
if hasattr(record, "items"):
log_obj["items"] = record.items
if hasattr(record, "duration"):
log_obj["duration"] = record.duration
# Add exception info
if record.exc_info:
log_obj["exception"] = self.formatException(record.exc_info)
log_obj["level"] = "ERROR"
return json.dumps(log_obj, ensure_ascii=False)
def setup_logger(name: str = "scraper", level: str = "INFO") -> logging.Logger:
"""Configure and return a JSON logger."""
logger = logging.getLogger(name)
logger.setLevel(getattr(logging, level.upper()))
# Remove existing handlers
logger.handlers.clear()
# Console handler with JSON format
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(JSONFormatter())
logger.addHandler(handler)
return logger
# Global logger instance (level comes from the LOG_LEVEL env var set in docker-compose.yml)
logger = setup_logger(level=os.getenv("LOG_LEVEL", "INFO"))
src/main.py (usage example)
import time
from logger import logger
def scrape_url(url: str) -> list:
"""Example scraping function with logging."""
start = time.time()
logger.info(
f"Starting scrape",
extra={"action": "start", "url": url}
)
try:
# Your scraping logic here
items = [{"id": 1}, {"id": 2}]
duration = int((time.time() - start) * 1000)
logger.info(
f"Scraped {len(items)} items",
extra={
"action": "success",
"url": url,
"items": len(items),
"duration": duration
}
)
return items
except Exception as e:
logger.error(
f"Scrape failed: {e}",
extra={"action": "error", "url": url},
exc_info=True
)
raise
def main():
logger.info("Scraper started", extra={"action": "init"})
urls = [
"https://example.com/page1",
"https://example.com/page2",
]
for url in urls:
try:
scrape_url(url)
except Exception:
pass # Already logged
logger.info("Scraper finished", extra={"action": "complete"})
if __name__ == "__main__":
main()
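For reference, each event from scrape_url() ends up as a single JSON line on stdout, which is exactly what the stage.json block in config.alloy parses. An illustrative sample (values are made up) with a quick sanity check:

import json

# Illustrative line as emitted by JSONFormatter for a successful scrape (example values)
sample = (
    '{"timestamp": "2024-05-01T09:30:00+00:00", "level": "INFO", '
    '"message": "Scraped 2 items", "module": "main", "function": "scrape_url", '
    '"line": 18, "action": "success", "url": "https://example.com/page1", '
    '"items": 2, "duration": 12}'
)

# The fields extracted by stage.json (level, message, action, url, items) are all present
parsed = json.loads(sample)
assert parsed["action"] == "success" and parsed["items"] == 2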
CI/CD
Branch Strategy
| Branch | Environment | APP_ENV |
|---|---|---|
| staging | Staging | staging |
| main | Production | prod |
Single monitoring stack
Staging and production send their logs to the same monitoring server.
Complete CI/CD guide
See the detailed guide: CI/CD GitHub Actions
GitHub Actions - .github/workflows/deploy.yml
name: Deploy Python Scraper
on:
push:
branches: [main, staging]
jobs:
deploy-staging:
if: github.ref_name == 'staging'
runs-on: ubuntu-latest
environment: staging
steps:
- uses: actions/checkout@v4
- name: Deploy to Staging
uses: appleboy/ssh-action@v1.0.3
with:
host: ${{ secrets.STAGING_SERVER_HOST }}
username: ${{ secrets.SERVER_USER }}
key: ${{ secrets.SERVER_SSH_KEY }}
script: |
cd /opt/python-scraper
git pull origin staging
cat > .env << 'EOF'
LOG_LEVEL=DEBUG
LOKI_URL=${{ secrets.LOKI_URL }}
LOKI_USER=${{ secrets.LOKI_USER }}
LOKI_PASSWORD=${{ secrets.LOKI_PASSWORD }}
APP_ENV=staging
EOF
# Append HOST_NAME outside the quoted heredoc so $(hostname) is expanded on the server
echo "HOST_NAME=$(hostname)" >> .env
docker-compose -f docker-compose.yml -f docker-compose.staging.yml build --no-cache scraper
docker-compose -f docker-compose.yml -f docker-compose.staging.yml up -d
deploy-prod:
if: github.ref_name == 'main'
runs-on: ubuntu-latest
environment: production
steps:
- uses: actions/checkout@v4
- name: Deploy to Production
uses: appleboy/ssh-action@v1.0.3
with:
host: ${{ secrets.PROD_SERVER_HOST }}
username: ${{ secrets.SERVER_USER }}
key: ${{ secrets.SERVER_SSH_KEY }}
script: |
cd /opt/python-scraper
git pull origin main
cat > .env << 'EOF'
LOG_LEVEL=INFO
LOKI_URL=${{ secrets.LOKI_URL }}
LOKI_USER=${{ secrets.LOKI_USER }}
LOKI_PASSWORD=${{ secrets.LOKI_PASSWORD }}
APP_ENV=prod
EOF
# Append HOST_NAME outside the quoted heredoc so $(hostname) is expanded on the server
echo "HOST_NAME=$(hostname)" >> .env
docker-compose -f docker-compose.yml -f docker-compose.prod.yml build --no-cache scraper
docker-compose -f docker-compose.yml -f docker-compose.prod.yml up -d
docker-compose.staging.yml
# Note: 'version' is obsolete in Docker Compose v2+
services:
scraper:
labels:
- "app=python-scraper"
- "env=staging"
alloy:
environment:
- APP_ENV=staging
docker-compose.prod.yml
# Note: 'version' is obsolete in Docker Compose v2+
services:
scraper:
labels:
- "app=python-scraper"
- "env=prod"
alloy:
environment:
- APP_ENV=prod
Useful LogQL Queries
# All logs
{app="python-scraper", env="prod"}
# Successful scrape actions
{app="python-scraper"} | json | action="success"
# Errors only (the pipeline normalizes the level label to lowercase)
{app="python-scraper", level="error"}
# URLs that failed
{app="python-scraper"} | json | action="error" | line_format "{{.url}}: {{.message}}"
# Items scraped over the last hour
sum_over_time({app="python-scraper"} | json | action="success" | unwrap items | __error__="" [1h])
# Average scrape duration (ms) over the last hour
avg_over_time({app="python-scraper"} | json | action="success" | unwrap duration [1h])
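These queries can also be run outside Grafana against Loki's HTTP API. A hedged sketch using the requests library, assuming the query_range endpoint sits next to the push endpoint configured in .env and accepts the same basic-auth credentials:

import os
import time

import requests  # third-party: pip install requests

# Run the "errors only" query over the last hour
query_url = os.environ["LOKI_URL"].replace("/push", "/query_range")
now_ns = int(time.time()) * 10**9

resp = requests.get(
    query_url,
    params={
        "query": '{app="python-scraper", level="error"}',
        "start": now_ns - 3600 * 10**9,
        "end": now_ns,
        "limit": 100,
    },
    auth=(os.environ["LOKI_USER"], os.environ["LOKI_PASSWORD"]),
    timeout=10,
)
resp.raise_for_status()

# Each stream holds (timestamp, line) pairs
for stream in resp.json()["data"]["result"]:
    for _ts, line in stream["values"]:
        print(line)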