Initial commit: Lighthouse scanner service

Sereth1 · 2025-12-08 10:06:56 +07:00 · commit 90ad47a721
38 changed files with 5375 additions and 0 deletions

backend/.env.example (new file, +27 lines)
@@ -0,0 +1,27 @@
# Django Core Settings
DEBUG=True
SECRET_KEY=your-secret-key-change-in-production-abc123xyz789
ALLOWED_HOSTS=localhost,127.0.0.1,web
# Database
DATABASE_URL=postgres://analyzer:analyzer_password@db:5432/website_analyzer
# Redis & Celery
REDIS_URL=redis://redis:6379/0
CELERY_BROKER_URL=redis://redis:6379/0
CELERY_RESULT_BACKEND=redis://redis:6379/1
# OWASP ZAP Configuration
ZAP_API_KEY=zap-api-key-change-me
ZAP_HOST=http://zap:8080
# Lighthouse Configuration
LIGHTHOUSE_CHROME_FLAGS=--headless --no-sandbox --disable-gpu
# Scan Settings
MAX_SCAN_TIME_SECONDS=300
SCAN_RATE_LIMIT_MINUTES=5
MAX_CONCURRENT_SCANS=3
# Security
CORS_ALLOWED_ORIGINS=http://localhost:3000,http://localhost:8000
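
Note: the SECRET_KEY above is a placeholder and must be replaced before any real deployment. A minimal sketch of generating a value with Django's bundled helper (run locally, then paste the output into .env):

# Example only: generate a production-grade SECRET_KEY with Django's own helper.
from django.core.management.utils import get_random_secret_key

print(f"SECRET_KEY={get_random_secret_key()}")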

backend/Dockerfile (new file, +84 lines)
@@ -0,0 +1,84 @@
# Website Analyzer Backend - Dockerfile
# Multi-stage build for efficient image size
FROM python:3.11-slim as builder
# Set environment variables
ENV PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1 \
PIP_NO_CACHE_DIR=1 \
PIP_DISABLE_PIP_VERSION_CHECK=1
WORKDIR /app
# Install system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
libpq-dev \
curl \
&& rm -rf /var/lib/apt/lists/*
# Install Python dependencies
COPY requirements.txt .
RUN pip install --user -r requirements.txt
# Install Playwright and its dependencies
RUN pip install --user playwright && \
python -m playwright install chromium && \
python -m playwright install-deps chromium
# ==========================================================================
# Production Stage
# ==========================================================================
FROM python:3.11-slim
ENV PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1 \
PATH="/root/.local/bin:$PATH"
WORKDIR /app
# Install runtime dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
libpq5 \
curl \
# Playwright/Chromium dependencies
libnss3 \
libnspr4 \
libatk1.0-0 \
libatk-bridge2.0-0 \
libcups2 \
libdrm2 \
libdbus-1-3 \
libxkbcommon0 \
libxcomposite1 \
libxdamage1 \
libxfixes3 \
libxrandr2 \
libgbm1 \
libasound2 \
libpango-1.0-0 \
libcairo2 \
libatspi2.0-0 \
&& rm -rf /var/lib/apt/lists/*
# Copy Python packages from builder
COPY --from=builder /root/.local /root/.local
COPY --from=builder /root/.cache/ms-playwright /root/.cache/ms-playwright
# Copy application code
COPY . .
# Create logs directory
RUN mkdir -p logs staticfiles
# Create non-root user; the pip --user packages and Playwright browsers copied
# above stay under /root, so make /root traversable and point Python/Playwright at them.
RUN useradd -m -u 1000 appuser && \
chmod 755 /root && \
chown -R appuser:appuser /app /root/.local /root/.cache
ENV PYTHONUSERBASE=/root/.local \
PLAYWRIGHT_BROWSERS_PATH=/root/.cache/ms-playwright
USER appuser
# Expose port
EXPOSE 8000
# Default command
CMD ["gunicorn", "core.wsgi:application", "--bind", "0.0.0.0:8000", "--workers", "4"]

backend/api/__init__.py (new file, +5 lines)
@@ -0,0 +1,5 @@
"""
API app initialization.
"""
default_app_config = 'api.apps.ApiConfig'  # no-op on Django >= 4.1; ApiConfig is auto-discovered from apps.py

backend/api/apps.py (new file, +11 lines)
@@ -0,0 +1,11 @@
"""
API app configuration.
"""
from django.apps import AppConfig
class ApiConfig(AppConfig):
default_auto_field = 'django.db.models.BigAutoField'
name = 'api'
verbose_name = 'REST API'

backend/api/exceptions.py (new file, +52 lines)
@@ -0,0 +1,52 @@
"""
Custom exception handler for DRF.
"""
from rest_framework.views import exception_handler
from rest_framework.response import Response
from rest_framework import status
import logging
logger = logging.getLogger(__name__)
def custom_exception_handler(exc, context):
"""
Custom exception handler that provides consistent error responses.
Handles common exceptions and formats them consistently.
"""
# Call REST framework's default exception handler first
response = exception_handler(exc, context)
if response is not None:
# Customize the response data
custom_response_data = {
'error': True,
'status_code': response.status_code,
}
if isinstance(response.data, dict):
if 'detail' in response.data:
custom_response_data['message'] = str(response.data['detail'])
else:
custom_response_data['errors'] = response.data
elif isinstance(response.data, list):
custom_response_data['errors'] = response.data
else:
custom_response_data['message'] = str(response.data)
response.data = custom_response_data
return response
# Handle unexpected exceptions
logger.exception(f"Unhandled exception: {exc}")
return Response(
{
'error': True,
'status_code': 500,
'message': 'An unexpected error occurred',
},
status=status.HTTP_500_INTERNAL_SERVER_ERROR
)
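
For illustration only (assuming DRF's stock detail messages), the handler above renders exceptions roughly like this:

# Illustrative only: approximate JSON bodies produced by custom_exception_handler.
not_found_body = {
    "error": True,
    "status_code": 404,
    "message": "Not found.",  # copied from response.data['detail']
}
validation_error_body = {
    "error": True,
    "status_code": 400,
    "errors": {"url": ["Enter a valid URL."]},  # non-'detail' dicts pass through as 'errors'
}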

backend/api/serializers.py (new file, +243 lines)
@@ -0,0 +1,243 @@
"""
DRF Serializers for the API.
This module defines serializers for converting model instances
to JSON and validating input data.
"""
from rest_framework import serializers
from websites.models import Website, Scan, Issue, Metric, ScanStatus
class IssueSerializer(serializers.ModelSerializer):
"""Serializer for Issue model."""
severity_display = serializers.CharField(source='get_severity_display', read_only=True)
category_display = serializers.CharField(source='get_category_display', read_only=True)
tool_display = serializers.CharField(source='get_tool_display', read_only=True)
class Meta:
model = Issue
fields = [
'id',
'category',
'category_display',
'severity',
'severity_display',
'tool',
'tool_display',
'title',
'description',
'affected_url',
'remediation',
'created_at',
]
read_only_fields = fields
class MetricSerializer(serializers.ModelSerializer):
"""Serializer for Metric model."""
formatted_value = serializers.CharField(source='get_formatted_value', read_only=True)
unit_display = serializers.CharField(source='get_unit_display', read_only=True)
class Meta:
model = Metric
fields = [
'id',
'name',
'display_name',
'value',
'unit',
'unit_display',
'formatted_value',
'source',
'score',
]
read_only_fields = fields
class ScanListSerializer(serializers.ModelSerializer):
"""Serializer for Scan list views (minimal data)."""
status_display = serializers.CharField(source='get_status_display', read_only=True)
website_url = serializers.CharField(source='website.url', read_only=True)
issues_count = serializers.SerializerMethodField()
class Meta:
model = Scan
fields = [
'id',
'website_url',
'status',
'status_display',
'created_at',
'completed_at',
'overall_score',
'performance_score',
'security_score',
'issues_count',
]
read_only_fields = fields
def get_issues_count(self, obj):
return obj.issues.count()
class ScanDetailSerializer(serializers.ModelSerializer):
"""Serializer for Scan detail views (full data)."""
status_display = serializers.CharField(source='get_status_display', read_only=True)
website_url = serializers.CharField(source='website.url', read_only=True)
website_domain = serializers.CharField(source='website.domain', read_only=True)
issues = IssueSerializer(many=True, read_only=True)
metrics = MetricSerializer(many=True, read_only=True)
issues_by_category = serializers.SerializerMethodField()
issues_by_severity = serializers.SerializerMethodField()
class Meta:
model = Scan
fields = [
'id',
'website_url',
'website_domain',
'status',
'status_display',
'created_at',
'started_at',
'completed_at',
'overall_score',
'performance_score',
'accessibility_score',
'seo_score',
'best_practices_score',
'security_score',
'error_message',
'issues',
'metrics',
'issues_by_category',
'issues_by_severity',
]
read_only_fields = fields
def get_issues_by_category(self, obj):
"""Group issues by category."""
from collections import defaultdict
grouped = defaultdict(list)
for issue in obj.issues.all():
grouped[issue.category].append(IssueSerializer(issue).data)
return dict(grouped)
def get_issues_by_severity(self, obj):
"""Count issues by severity."""
from django.db.models import Count
counts = obj.issues.values('severity').annotate(count=Count('id'))
return {item['severity']: item['count'] for item in counts}
class ScanCreateSerializer(serializers.Serializer):
"""Serializer for creating new scans."""
url = serializers.URLField(
required=True,
help_text="The URL to scan (must be http or https)"
)
def validate_url(self, value):
"""Validate and normalize the URL."""
from scanner.utils import validate_url
is_valid, result = validate_url(value)
if not is_valid:
raise serializers.ValidationError(result)
return result # Return normalized URL
def create(self, validated_data):
"""Create Website and Scan records."""
from scanner.tasks import check_rate_limit, check_concurrent_scan_limit, run_scan_task
url = validated_data['url']
# Check rate limit
rate_limit_error = check_rate_limit(url)
if rate_limit_error:
raise serializers.ValidationError({'url': rate_limit_error})
# Check concurrent scan limit
concurrent_error = check_concurrent_scan_limit()
if concurrent_error:
raise serializers.ValidationError({'non_field_errors': concurrent_error})
# Get or create Website
website, created = Website.objects.get_or_create(
url=url,
defaults={'domain': validated_data.get('domain', '')}
)
# Create Scan
scan = Scan.objects.create(
website=website,
status=ScanStatus.PENDING
)
# Trigger Celery task
task = run_scan_task.delay(str(scan.id))
# Update scan with task ID
scan.celery_task_id = task.id
scan.save(update_fields=['celery_task_id'])
return scan
class WebsiteSerializer(serializers.ModelSerializer):
"""Serializer for Website model."""
scans_count = serializers.SerializerMethodField()
latest_scan = serializers.SerializerMethodField()
class Meta:
model = Website
fields = [
'id',
'url',
'domain',
'created_at',
'last_scanned_at',
'scans_count',
'latest_scan',
]
read_only_fields = fields
def get_scans_count(self, obj):
return obj.scans.count()
def get_latest_scan(self, obj):
latest = obj.scans.first()
if latest:
return ScanListSerializer(latest).data
return None
class WebsiteDetailSerializer(WebsiteSerializer):
"""Detailed Website serializer with scan list."""
scans = ScanListSerializer(many=True, read_only=True)
class Meta(WebsiteSerializer.Meta):
fields = WebsiteSerializer.Meta.fields + ['scans']
class HealthCheckSerializer(serializers.Serializer):
"""Serializer for health check response."""
status = serializers.CharField()
database = serializers.CharField()
redis = serializers.CharField()
celery = serializers.CharField()
timestamp = serializers.DateTimeField()
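
A minimal usage sketch for ScanCreateSerializer (e.g. from `manage.py shell`), assuming the Redis broker is reachable so `run_scan_task.delay()` can enqueue:

# Sketch: exercising ScanCreateSerializer directly; save() enqueues the Celery task.
from api.serializers import ScanCreateSerializer, ScanDetailSerializer

serializer = ScanCreateSerializer(data={"url": "https://example.com"})
serializer.is_valid(raise_exception=True)  # runs validate_url plus rate/concurrency checks
scan = serializer.save()                   # creates Website + Scan, dispatches run_scan_task
print(ScanDetailSerializer(scan).data["status"])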

backend/api/urls.py (new file, +18 lines)
@@ -0,0 +1,18 @@
"""
URL routing for the API.
"""
from django.urls import path, include
from rest_framework.routers import DefaultRouter
from . import views
router = DefaultRouter()
router.register(r'scans', views.ScanViewSet, basename='scan')
router.register(r'websites', views.WebsiteViewSet, basename='website')
router.register(r'issues', views.IssueViewSet, basename='issue')
urlpatterns = [
path('', views.api_root, name='api-root'),
path('health/', views.health_check, name='health-check'),
path('', include(router.urls)),
]
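
A quick end-to-end sketch of the resulting routes using DRF's test client; paths assume the `api/` prefix added later in core/urls.py, and scan creation assumes the Celery broker is up:

# Sketch: hitting the routed endpoints with DRF's test client.
from rest_framework.test import APIClient

client = APIClient()
health = client.get("/api/health/")
created = client.post("/api/scans/", {"url": "https://example.com"}, format="json")
scan_id = created.data["id"]
detail = client.get(f"/api/scans/{scan_id}/")
issues = client.get(f"/api/scans/{scan_id}/issues/")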

backend/api/views.py (new file, +336 lines)
@@ -0,0 +1,336 @@
"""
DRF Views for the API.
This module defines API views for scans, websites, and issues.
"""
import logging
from django.db import connection
from django.utils import timezone
from django.core.cache import cache
from rest_framework import viewsets, status, generics
from rest_framework.decorators import api_view, action
from rest_framework.response import Response
from rest_framework.pagination import PageNumberPagination
from rest_framework.throttling import AnonRateThrottle
from websites.models import Website, Scan, Issue, Metric
from .serializers import (
WebsiteSerializer,
WebsiteDetailSerializer,
ScanListSerializer,
ScanDetailSerializer,
ScanCreateSerializer,
IssueSerializer,
MetricSerializer,
HealthCheckSerializer,
)
logger = logging.getLogger(__name__)
class ScanRateThrottle(AnonRateThrottle):
"""Custom throttle for scan creation."""
rate = '10/hour'
class StandardResultsPagination(PageNumberPagination):
"""Standard pagination for list views."""
page_size = 20
page_size_query_param = 'page_size'
max_page_size = 100
class ScanViewSet(viewsets.ModelViewSet):
"""
ViewSet for Scan operations.
Endpoints:
- POST /api/scans/ - Create a new scan
- GET /api/scans/ - List all scans
- GET /api/scans/{id}/ - Get scan details
- DELETE /api/scans/{id}/ - Delete a scan
"""
queryset = Scan.objects.select_related('website').prefetch_related('issues', 'metrics')
pagination_class = StandardResultsPagination
def get_serializer_class(self):
if self.action == 'list':
return ScanListSerializer
elif self.action == 'create':
return ScanCreateSerializer
return ScanDetailSerializer
def get_throttles(self):
if self.action == 'create':
return [ScanRateThrottle()]
return super().get_throttles()
def create(self, request, *args, **kwargs):
"""
Create a new scan.
Request body:
```json
{"url": "https://example.com"}
```
Returns the created scan with pending status.
The scan will be processed asynchronously.
"""
serializer = self.get_serializer(data=request.data)
serializer.is_valid(raise_exception=True)
try:
scan = serializer.save()
# Return the created scan details
response_serializer = ScanDetailSerializer(scan)
return Response(
response_serializer.data,
status=status.HTTP_201_CREATED
)
except Exception as e:
logger.exception("Error creating scan")
return Response(
{'error': str(e)},
status=status.HTTP_500_INTERNAL_SERVER_ERROR
)
@action(detail=True, methods=['get'])
def issues(self, request, pk=None):
"""Get all issues for a scan."""
scan = self.get_object()
issues = scan.issues.all()
# Optional filtering
category = request.query_params.get('category')
severity = request.query_params.get('severity')
tool = request.query_params.get('tool')
if category:
issues = issues.filter(category=category)
if severity:
issues = issues.filter(severity=severity)
if tool:
issues = issues.filter(tool=tool)
serializer = IssueSerializer(issues, many=True)
return Response(serializer.data)
@action(detail=True, methods=['get'])
def metrics(self, request, pk=None):
"""Get all metrics for a scan."""
scan = self.get_object()
metrics = scan.metrics.all()
# Optional filtering by source
source = request.query_params.get('source')
if source:
metrics = metrics.filter(source=source)
serializer = MetricSerializer(metrics, many=True)
return Response(serializer.data)
@action(detail=True, methods=['get'])
def status(self, request, pk=None):
"""Get just the status of a scan (for polling)."""
scan = self.get_object()
return Response({
'id': str(scan.id),
'status': scan.status,
'status_display': scan.get_status_display(),
'progress': self._get_scan_progress(scan),
})
def _get_scan_progress(self, scan):
"""Estimate scan progress based on status and results."""
if scan.status == 'done':
return 100
elif scan.status == 'failed':
return 0
elif scan.status == 'running':
# Estimate based on what data we have
progress = 10 # Started
if scan.raw_headers_data:
progress += 20
if scan.raw_playwright_data:
progress += 25
if scan.raw_lighthouse_data:
progress += 30
if scan.raw_zap_data:
progress += 15
return min(progress, 95)
return 0
class WebsiteViewSet(viewsets.ReadOnlyModelViewSet):
"""
ViewSet for Website operations.
Endpoints:
- GET /api/websites/ - List all websites
- GET /api/websites/{id}/ - Get website details
- GET /api/websites/{id}/scans/ - Get scans for a website
"""
queryset = Website.objects.prefetch_related('scans')
pagination_class = StandardResultsPagination
def get_serializer_class(self):
if self.action == 'retrieve':
return WebsiteDetailSerializer
return WebsiteSerializer
@action(detail=True, methods=['get'])
def scans(self, request, pk=None):
"""Get all scans for a website."""
website = self.get_object()
scans = website.scans.all()
# Apply pagination
page = self.paginate_queryset(scans)
if page is not None:
serializer = ScanListSerializer(page, many=True)
return self.get_paginated_response(serializer.data)
serializer = ScanListSerializer(scans, many=True)
return Response(serializer.data)
class IssueViewSet(viewsets.ReadOnlyModelViewSet):
"""
ViewSet for Issue operations.
Endpoints:
- GET /api/issues/ - List all issues (with filtering)
- GET /api/issues/{id}/ - Get issue details
"""
queryset = Issue.objects.select_related('scan', 'scan__website')
serializer_class = IssueSerializer
pagination_class = StandardResultsPagination
def get_queryset(self):
queryset = super().get_queryset()
# Filter by scan
scan_id = self.request.query_params.get('scan')
if scan_id:
queryset = queryset.filter(scan_id=scan_id)
# Filter by category
category = self.request.query_params.get('category')
if category:
queryset = queryset.filter(category=category)
# Filter by severity
severity = self.request.query_params.get('severity')
if severity:
queryset = queryset.filter(severity=severity)
# Filter by tool
tool = self.request.query_params.get('tool')
if tool:
queryset = queryset.filter(tool=tool)
return queryset
@api_view(['GET'])
def health_check(request):
"""
Health check endpoint.
Checks:
- Database connectivity
- Redis connectivity
- Celery worker status
Returns health status of all components.
"""
health = {
'status': 'healthy',
'database': 'unknown',
'redis': 'unknown',
'celery': 'unknown',
'timestamp': timezone.now(),
}
# Check database
try:
connection.ensure_connection()
health['database'] = 'healthy'
except Exception as e:
health['database'] = f'unhealthy: {e}'
health['status'] = 'unhealthy'
# Check Redis
try:
cache.set('health_check', 'ok', 10)
if cache.get('health_check') == 'ok':
health['redis'] = 'healthy'
else:
health['redis'] = 'unhealthy: cache not working'
health['status'] = 'degraded'
except Exception as e:
health['redis'] = f'unhealthy: {e}'
health['status'] = 'degraded'
# Check Celery (basic check)
try:
from core.celery import app as celery_app
inspect = celery_app.control.inspect()
# Try to get active workers
active = inspect.active()
if active:
health['celery'] = f'healthy ({len(active)} workers)'
else:
health['celery'] = 'degraded: no active workers'
health['status'] = 'degraded'
except Exception as e:
health['celery'] = f'unknown: {e}'
status_code = 200 if health['status'] == 'healthy' else 503
serializer = HealthCheckSerializer(health)
return Response(serializer.data, status=status_code)
@api_view(['GET'])
def api_root(request):
"""
API root endpoint.
Returns available endpoints and basic API information.
"""
return Response({
'message': 'Website Analyzer API',
'version': '1.0.0',
'endpoints': {
'scans': '/api/scans/',
'websites': '/api/websites/',
'issues': '/api/issues/',
'health': '/api/health/',
},
'documentation': {
'create_scan': {
'method': 'POST',
'url': '/api/scans/',
'body': {'url': 'https://example.com'},
'description': 'Create a new website scan'
},
'get_scan': {
'method': 'GET',
'url': '/api/scans/{id}/',
'description': 'Get scan results and details'
},
'list_scans': {
'method': 'GET',
'url': '/api/scans/',
'description': 'List all scans with pagination'
},
}
})
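
Because scans are processed asynchronously, clients poll the `status` action. A minimal polling sketch with httpx (already a dependency), assuming the API is served on localhost:8000:

# Sketch: create a scan and poll its status action until it finishes or fails.
import time

import httpx

BASE = "http://localhost:8000/api"
scan = httpx.post(f"{BASE}/scans/", json={"url": "https://example.com"}).json()
while True:
    info = httpx.get(f"{BASE}/scans/{scan['id']}/status/").json()
    print(f"{info['status']} ({info['progress']}%)")
    if info["status"] in ("done", "failed"):
        break
    time.sleep(5)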

backend/core/__init__.py (new file, +9 lines)
@@ -0,0 +1,9 @@
"""
Core module initialization.
This module loads the Celery app so that shared_task will use this app.
"""
from .celery import app as celery_app
__all__ = ('celery_app',)

backend/core/asgi.py (new file, +11 lines)
@@ -0,0 +1,11 @@
"""
ASGI config for Website Analyzer project.
"""
import os
from django.core.asgi import get_asgi_application
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
application = get_asgi_application()

backend/core/celery.py (new file, +28 lines)
@@ -0,0 +1,28 @@
"""
Celery configuration for Website Analyzer.
This module configures Celery for asynchronous task processing,
specifically for running website scans in the background.
"""
import os
from celery import Celery
# Set the default Django settings module for the 'celery' program.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
app = Celery('website_analyzer')
# Using a string here means the worker doesn't have to serialize
# the configuration object to child processes.
app.config_from_object('django.conf:settings', namespace='CELERY')
# Load task modules from all registered Django apps.
app.autodiscover_tasks()
@app.task(bind=True, ignore_result=True)
def debug_task(self):
"""Debug task for testing Celery connectivity."""
print(f'Request: {self.request!r}')
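
With a worker running (for example `celery -A core worker -l info`), broker connectivity can be verified with the debug task; a minimal sketch:

# Sketch: enqueue the debug task; the worker process prints its request info.
from core.celery import debug_task

debug_task.delay()  # ignore_result=True, so no result-backend lookup is needed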

backend/core/settings.py (new file, +300 lines)
@@ -0,0 +1,300 @@
"""
Django settings for Website Analyzer project.
This module contains all configuration settings for the Django application,
including database, caching, security, and third-party integrations.
"""
import os
from pathlib import Path
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
# Build paths inside the project like this: BASE_DIR / 'subdir'.
BASE_DIR = Path(__file__).resolve().parent.parent
# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = os.getenv('SECRET_KEY', 'django-insecure-change-me-in-production')
# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = os.getenv('DEBUG', 'False').lower() in ('true', '1', 'yes')
ALLOWED_HOSTS = os.getenv('ALLOWED_HOSTS', 'localhost,127.0.0.1').split(',')
# Application definition
INSTALLED_APPS = [
'django.contrib.admin',
'django.contrib.auth',
'django.contrib.contenttypes',
'django.contrib.sessions',
'django.contrib.messages',
'django.contrib.staticfiles',
# Third-party apps
'rest_framework',
'corsheaders',
# Local apps
'websites',
'scanner',
'api',
]
MIDDLEWARE = [
'django.middleware.security.SecurityMiddleware',
'whitenoise.middleware.WhiteNoiseMiddleware',
'corsheaders.middleware.CorsMiddleware',
'django.contrib.sessions.middleware.SessionMiddleware',
'django.middleware.common.CommonMiddleware',
'django.middleware.csrf.CsrfViewMiddleware',
'django.contrib.auth.middleware.AuthenticationMiddleware',
'django.contrib.messages.middleware.MessageMiddleware',
'django.middleware.clickjacking.XFrameOptionsMiddleware',
]
ROOT_URLCONF = 'core.urls'
TEMPLATES = [
{
'BACKEND': 'django.template.backends.django.DjangoTemplates',
'DIRS': [BASE_DIR / 'templates'],
'APP_DIRS': True,
'OPTIONS': {
'context_processors': [
'django.template.context_processors.debug',
'django.template.context_processors.request',
'django.contrib.auth.context_processors.auth',
'django.contrib.messages.context_processors.messages',
],
},
},
]
WSGI_APPLICATION = 'core.wsgi.application'
# Database
# Parse DATABASE_URL or use default PostgreSQL settings
DATABASE_URL = os.getenv('DATABASE_URL', 'postgres://analyzer:analyzer_password@localhost:5432/website_analyzer')
# Parse the DATABASE_URL
import re
db_pattern = r'postgres://(?P<user>[^:]+):(?P<password>[^@]+)@(?P<host>[^:]+):(?P<port>\d+)/(?P<name>.+)'
db_match = re.match(db_pattern, DATABASE_URL)
if db_match:
DATABASES = {
'default': {
'ENGINE': 'django.db.backends.postgresql',
'NAME': db_match.group('name'),
'USER': db_match.group('user'),
'PASSWORD': db_match.group('password'),
'HOST': db_match.group('host'),
'PORT': db_match.group('port'),
}
}
else:
# Fallback for development
DATABASES = {
'default': {
'ENGINE': 'django.db.backends.sqlite3',
'NAME': BASE_DIR / 'db.sqlite3',
}
}
# Password validation
AUTH_PASSWORD_VALIDATORS = [
{
'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
},
]
# Internationalization
LANGUAGE_CODE = 'en-us'
TIME_ZONE = 'UTC'
USE_I18N = True
USE_TZ = True
# Static files (CSS, JavaScript, Images)
STATIC_URL = 'static/'
STATIC_ROOT = BASE_DIR / 'staticfiles'
STATICFILES_DIRS = [BASE_DIR / 'static']
STATICFILES_STORAGE = 'whitenoise.storage.CompressedManifestStaticFilesStorage'  # deprecated since Django 4.2; Django 5.1+ uses the STORAGES setting instead
# Default primary key field type
DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
# =============================================================================
# REST Framework Configuration
# =============================================================================
REST_FRAMEWORK = {
'DEFAULT_RENDERER_CLASSES': [
'rest_framework.renderers.JSONRenderer',
'rest_framework.renderers.BrowsableAPIRenderer',
],
'DEFAULT_PAGINATION_CLASS': 'rest_framework.pagination.PageNumberPagination',
'PAGE_SIZE': 20,
'DEFAULT_THROTTLE_CLASSES': [
'rest_framework.throttling.AnonRateThrottle',
'rest_framework.throttling.UserRateThrottle'
],
'DEFAULT_THROTTLE_RATES': {
'anon': '100/hour',
'user': '1000/hour',
'scan': '10/hour', # Specific rate for scan creation
},
'EXCEPTION_HANDLER': 'api.exceptions.custom_exception_handler',
}
# =============================================================================
# CORS Configuration
# =============================================================================
CORS_ALLOWED_ORIGINS = os.getenv(
'CORS_ALLOWED_ORIGINS',
'http://localhost:3000,http://localhost:8000'
).split(',')
CORS_ALLOW_CREDENTIALS = True
# =============================================================================
# Celery Configuration
# =============================================================================
CELERY_BROKER_URL = os.getenv('CELERY_BROKER_URL', 'redis://localhost:6379/0')
CELERY_RESULT_BACKEND = os.getenv('CELERY_RESULT_BACKEND', 'redis://localhost:6379/1')
CELERY_ACCEPT_CONTENT = ['json']
CELERY_TASK_SERIALIZER = 'json'
CELERY_RESULT_SERIALIZER = 'json'
CELERY_TIMEZONE = TIME_ZONE
CELERY_TASK_TRACK_STARTED = True
CELERY_TASK_TIME_LIMIT = int(os.getenv('MAX_SCAN_TIME_SECONDS', '300'))
CELERY_TASK_SOFT_TIME_LIMIT = CELERY_TASK_TIME_LIMIT - 30
# =============================================================================
# Redis Cache Configuration
# =============================================================================
REDIS_URL = os.getenv('REDIS_URL', 'redis://localhost:6379/0')
CACHES = {
'default': {
'BACKEND': 'django.core.cache.backends.redis.RedisCache',
'LOCATION': REDIS_URL,
}
}
# =============================================================================
# Scanner Configuration
# =============================================================================
SCANNER_CONFIG = {
# OWASP ZAP settings
'ZAP_API_KEY': os.getenv('ZAP_API_KEY', ''),
'ZAP_HOST': os.getenv('ZAP_HOST', 'http://localhost:8080'),
'ZAP_TIMEOUT': 120,
# Lighthouse settings
'LIGHTHOUSE_CHROME_FLAGS': os.getenv(
'LIGHTHOUSE_CHROME_FLAGS',
'--headless --no-sandbox --disable-gpu'
),
'LIGHTHOUSE_TIMEOUT': 60,
# Playwright settings
'PLAYWRIGHT_TIMEOUT': 30000, # milliseconds
'PLAYWRIGHT_VIEWPORT': {'width': 1920, 'height': 1080},
# General scan settings
'MAX_SCAN_TIME_SECONDS': int(os.getenv('MAX_SCAN_TIME_SECONDS', '300')),
'SCAN_RATE_LIMIT_MINUTES': int(os.getenv('SCAN_RATE_LIMIT_MINUTES', '5')),
'MAX_CONCURRENT_SCANS': int(os.getenv('MAX_CONCURRENT_SCANS', '3')),
# Safety settings - blocked IP ranges (RFC1918 private ranges + localhost)
'BLOCKED_IP_RANGES': [
'10.0.0.0/8',
'172.16.0.0/12',
'192.168.0.0/16',
'127.0.0.0/8',
'169.254.0.0/16', # Link-local
'::1/128', # IPv6 localhost
'fc00::/7', # IPv6 private
'fe80::/10', # IPv6 link-local
],
'BLOCKED_HOSTS': ['localhost', 'localhost.localdomain'],
# Large file thresholds
'LARGE_IMAGE_THRESHOLD_BYTES': 1024 * 1024, # 1 MB
'LARGE_JS_BUNDLE_THRESHOLD_BYTES': 500 * 1024, # 500 KB
}
# =============================================================================
# Logging Configuration
# =============================================================================
LOGGING = {
'version': 1,
'disable_existing_loggers': False,
'formatters': {
'verbose': {
'format': '{levelname} {asctime} {module} {process:d} {thread:d} {message}',
'style': '{',
},
'simple': {
'format': '{levelname} {asctime} {module} {message}',
'style': '{',
},
},
'handlers': {
'console': {
'class': 'logging.StreamHandler',
'formatter': 'simple',
},
'file': {
'class': 'logging.FileHandler',
'filename': BASE_DIR / 'logs' / 'django.log',
'formatter': 'verbose',
},
},
'root': {
'handlers': ['console'],
'level': 'INFO',
},
'loggers': {
'django': {
'handlers': ['console'],
'level': os.getenv('DJANGO_LOG_LEVEL', 'INFO'),
'propagate': False,
},
'scanner': {
'handlers': ['console'],
'level': 'DEBUG' if DEBUG else 'INFO',
'propagate': False,
},
'celery': {
'handlers': ['console'],
'level': 'INFO',
'propagate': False,
},
},
}
# Create logs directory if it doesn't exist
(BASE_DIR / 'logs').mkdir(exist_ok=True)
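
The BLOCKED_IP_RANGES and BLOCKED_HOSTS entries are presumably enforced by `scanner.utils.validate_url`, which is not part of this excerpt. A minimal, standard-library-only sketch of that kind of SSRF guard (hypothetical helper, IPv4 resolution only for brevity):

# Hypothetical helper: reject scan targets that resolve into a blocked range.
import ipaddress
import socket
from urllib.parse import urlparse

from django.conf import settings


def is_blocked_target(url: str) -> bool:
    host = (urlparse(url).hostname or "").lower()
    if host in settings.SCANNER_CONFIG["BLOCKED_HOSTS"]:
        return True
    try:
        addr = ipaddress.ip_address(socket.gethostbyname(host))
    except (socket.gaierror, ValueError):
        return True  # treat unresolvable hosts as unsafe
    return any(
        addr in ipaddress.ip_network(net)
        for net in settings.SCANNER_CONFIG["BLOCKED_IP_RANGES"]
    )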

backend/core/urls.py (new file, +20 lines)
@@ -0,0 +1,20 @@
"""
URL configuration for Website Analyzer project.
"""
from django.contrib import admin
from django.urls import path, include
from django.views.generic import TemplateView
urlpatterns = [
# Admin
path('admin/', admin.site.urls),
# API endpoints
path('api/', include('api.urls')),
# Frontend views
path('', TemplateView.as_view(template_name='index.html'), name='home'),
path('scan/<uuid:scan_id>/', TemplateView.as_view(template_name='scan_detail.html'), name='scan_detail'),
]

backend/core/wsgi.py (new file, +11 lines)
@@ -0,0 +1,11 @@
"""
WSGI config for Website Analyzer project.
"""
import os
from django.core.wsgi import get_wsgi_application
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
application = get_wsgi_application()

backend/manage.py (new file, +22 lines)
@@ -0,0 +1,22 @@
#!/usr/bin/env python
"""Django's command-line utility for administrative tasks."""
import os
import sys
def main():
"""Run administrative tasks."""
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
try:
from django.core.management import execute_from_command_line
except ImportError as exc:
raise ImportError(
"Couldn't import Django. Are you sure it's installed and "
"available on your PYTHONPATH environment variable? Did you "
"forget to activate a virtual environment?"
) from exc
execute_from_command_line(sys.argv)
if __name__ == '__main__':
main()

backend/pyproject.toml (new file, +91 lines)
@@ -0,0 +1,91 @@
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"
[project]
name = "website-analyzer"
version = "1.0.0"
description = "A Django-based web application for analyzing website performance, security, and best practices"
readme = "README.md"
license = {text = "MIT"}
requires-python = ">=3.11"
authors = [
{name = "Website Analyzer Team"}
]
classifiers = [
"Development Status :: 4 - Beta",
"Framework :: Django :: 5.0",
"Intended Audience :: Developers",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
]
dependencies = [
"Django>=5.0,<6.0",
"djangorestframework>=3.14.0",
"django-cors-headers>=4.3.0",
"psycopg2-binary>=2.9.9",
"celery[redis]>=5.3.0",
"redis>=5.0.0",
"httpx>=0.26.0",
"playwright>=1.40.0",
"python-dotenv>=1.0.0",
"gunicorn>=21.2.0",
"whitenoise>=6.6.0",
"validators>=0.22.0",
]
[project.optional-dependencies]
dev = [
"pytest>=7.4.0",
"pytest-django>=4.7.0",
"pytest-asyncio>=0.23.0",
"pytest-cov>=4.1.0",
"black>=23.12.0",
"isort>=5.13.0",
"flake8>=7.0.0",
"mypy>=1.8.0",
"django-stubs>=4.2.0",
]
[tool.black]
line-length = 100
target-version = ['py311']
include = '\.pyi?$'
exclude = '''
/(
\.git
| \.hg
| \.mypy_cache
| \.tox
| \.venv
| _build
| buck-out
| build
| dist
| migrations
)/
'''
[tool.isort]
profile = "black"
line_length = 100
skip = ["migrations", ".venv"]
[tool.pytest.ini_options]
DJANGO_SETTINGS_MODULE = "core.settings"
python_files = ["test_*.py", "*_test.py"]
addopts = "-v --tb=short"
[tool.mypy]
python_version = "3.11"
plugins = ["mypy_django_plugin.main"]
ignore_missing_imports = true
strict = false
[tool.django-stubs]
django_settings_module = "core.settings"

backend/requirements.txt (new file, +36 lines)
@@ -0,0 +1,36 @@
# Django & REST Framework
Django>=5.0,<6.0
djangorestframework>=3.14.0
django-cors-headers>=4.3.0
# Database
psycopg2-binary>=2.9.9
# Async Task Queue
celery[redis]>=5.3.0
redis>=5.0.0
# HTTP Client
httpx>=0.26.0
# Browser Automation
playwright>=1.40.0
# Environment & Config
python-dotenv>=1.0.0
# Production Server
gunicorn>=21.2.0
whitenoise>=6.6.0
# Validation & Utilities
validators>=0.22.0
# Development & Testing
pytest>=7.4.0
pytest-django>=4.7.0
pytest-asyncio>=0.23.0
pytest-cov>=4.1.0
black>=23.12.0
isort>=5.13.0
flake8>=7.0.0

@@ -0,0 +1,5 @@
"""
Scanner app initialization.
"""
default_app_config = 'scanner.apps.ScannerConfig'  # no-op on Django >= 4.1; ScannerConfig is auto-discovered from apps.py

backend/scanner/apps.py (new file, +11 lines)
@@ -0,0 +1,11 @@
"""
Scanner app configuration.
"""
from django.apps import AppConfig
class ScannerConfig(AppConfig):
default_auto_field = 'django.db.models.BigAutoField'
name = 'scanner'
verbose_name = 'Scanner Tools'

@@ -0,0 +1,25 @@
"""
Scanner modules initialization.
This package contains the various scanner implementations
that analyze websites for performance, security, and best practices.
"""
from .base import BaseScanner, ScannerResult
from .lighthouse import LighthouseScanner
from .playwright_scanner import PlaywrightScanner
from .zap import ZAPScanner
from .headers import HeaderScanner
from .tls import TLSScanner
from .runner import ScanRunner
__all__ = [
'BaseScanner',
'ScannerResult',
'LighthouseScanner',
'PlaywrightScanner',
'ZAPScanner',
'HeaderScanner',
'TLSScanner',
'ScanRunner',
]
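
The ScanRunner implementation is not included in this excerpt. Purely as an illustration of how the exported scanners compose under the BaseScanner contract, a minimal sketch (the `scanner.scanners` import path is an assumption; the package path is not shown above):

# Sketch only (not the real ScanRunner); the import path is assumed.
from scanner.scanners import HeaderScanner, LighthouseScanner


def run_basic_scan(url, config=None):
    results = []
    for scanner_cls in (HeaderScanner, LighthouseScanner):
        scanner = scanner_cls(config)
        if scanner.is_available():
            results.append(scanner.run(url))
    return results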

@@ -0,0 +1,161 @@
"""
Base scanner interface and result structures.
All scanner implementations should inherit from BaseScanner
and return ScannerResult objects.
"""
import logging
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
from enum import Enum
logger = logging.getLogger(__name__)
class ScannerStatus(str, Enum):
"""Status of a scanner execution."""
SUCCESS = "success"
PARTIAL = "partial"
FAILED = "failed"
SKIPPED = "skipped"
@dataclass
class IssueData:
"""
Represents a single issue found by a scanner.
Attributes:
category: Issue category (security, performance, etc.)
severity: Issue severity (critical, high, medium, low, info)
title: Brief title of the issue
description: Detailed description
tool: The scanner that found this issue
affected_url: Specific URL affected (optional)
remediation: Suggested fix (optional)
raw_data: Original scanner data (optional)
"""
category: str
severity: str
title: str
description: str
tool: str
affected_url: Optional[str] = None
remediation: Optional[str] = None
raw_data: Optional[Dict[str, Any]] = None
@dataclass
class MetricData:
"""
Represents a single metric measured by a scanner.
Attributes:
name: Internal name (e.g., 'first_contentful_paint_ms')
display_name: Human-readable name
value: Numeric value
unit: Unit of measurement
source: The scanner that measured this
score: Normalized score (0-1) if available
"""
name: str
display_name: str
value: float
unit: str
source: str
score: Optional[float] = None
@dataclass
class ScannerResult:
"""
Result of a scanner execution.
Attributes:
scanner_name: Name of the scanner
status: Execution status
issues: List of issues found
metrics: List of metrics measured
scores: Dictionary of category scores
raw_data: Original scanner output
error_message: Error details if failed
"""
scanner_name: str
status: ScannerStatus
issues: List[IssueData] = field(default_factory=list)
metrics: List[MetricData] = field(default_factory=list)
scores: Dict[str, int] = field(default_factory=dict)
raw_data: Optional[Dict[str, Any]] = None
error_message: Optional[str] = None
class BaseScanner(ABC):
"""
Abstract base class for all scanners.
Each scanner implementation must implement the `run` method
which performs the actual scan and returns a ScannerResult.
"""
name: str = "base"
def __init__(self, config: Optional[Dict[str, Any]] = None):
"""
Initialize the scanner with optional configuration.
Args:
config: Scanner-specific configuration dictionary
"""
self.config = config or {}
self.logger = logging.getLogger(f"scanner.{self.name}")
@abstractmethod
def run(self, url: str) -> ScannerResult:
"""
Run the scanner against the given URL.
Args:
url: The URL to scan
Returns:
ScannerResult with findings, metrics, and status
"""
pass
def is_available(self) -> bool:
"""
Check if the scanner service/tool is available.
Returns:
True if the scanner can be used, False otherwise
"""
return True
def _create_error_result(self, error: Exception) -> ScannerResult:
"""
Create a failed result from an exception.
Args:
error: The exception that occurred
Returns:
ScannerResult with failed status
"""
self.logger.error(f"Scanner {self.name} failed: {error}")
return ScannerResult(
scanner_name=self.name,
status=ScannerStatus.FAILED,
error_message=str(error),
issues=[
IssueData(
category="scanner",
severity="info",
title=f"{self.name.title()} scan failed",
description=f"The {self.name} scanner encountered an error: {error}",
tool=self.name,
remediation="Check scanner service configuration and availability."
)
]
)
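
To make the contract concrete, a minimal hypothetical scanner built on this interface might look like the following sketch:

# Sketch: a minimal concrete scanner built on the BaseScanner contract above.
import time

import httpx


class ResponseTimeScanner(BaseScanner):
    name = "response_time"

    def run(self, url: str) -> ScannerResult:
        try:
            start = time.monotonic()
            httpx.get(url, timeout=self.config.get("timeout", 10), follow_redirects=True)
            elapsed_ms = (time.monotonic() - start) * 1000
        except Exception as exc:  # any failure becomes a FAILED result with an info issue
            return self._create_error_result(exc)
        return ScannerResult(
            scanner_name=self.name,
            status=ScannerStatus.SUCCESS,
            metrics=[MetricData(
                name="response_time_ms",
                display_name="Response Time",
                value=elapsed_ms,
                unit="ms",
                source=self.name,
            )],
        )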

@@ -0,0 +1,405 @@
"""
HTTP Header Security Scanner.
This module analyzes HTTP response headers for security
best practices and common misconfigurations.
"""
import logging
from typing import Any, Dict, List, Optional, Tuple
import httpx
from .base import (
BaseScanner,
ScannerResult,
ScannerStatus,
IssueData,
MetricData,
)
logger = logging.getLogger(__name__)
# Security header definitions with expected values and severity
SECURITY_HEADERS = {
'Strict-Transport-Security': {
'severity': 'high',
'description': 'HTTP Strict Transport Security (HSTS) forces browsers to use HTTPS.',
'remediation': (
'Add the header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload'
),
'check_value': lambda v: 'max-age' in v.lower() and int(
v.lower().split('max-age=')[1].split(';')[0].strip()
) >= 31536000 if 'max-age=' in v.lower() else False,
},
'Content-Security-Policy': {
'severity': 'high',
'description': 'Content Security Policy (CSP) helps prevent XSS and data injection attacks.',
'remediation': (
"Implement a Content-Security-Policy header that restricts sources for scripts, "
"styles, and other resources. Start with a report-only policy to identify issues."
),
'check_value': lambda v: "default-src" in v.lower() or "script-src" in v.lower(),
},
'X-Content-Type-Options': {
'severity': 'medium',
'description': 'Prevents browsers from MIME-sniffing responses.',
'remediation': 'Add the header: X-Content-Type-Options: nosniff',
'check_value': lambda v: v.lower() == 'nosniff',
},
'X-Frame-Options': {
'severity': 'medium',
'description': 'Protects against clickjacking by controlling page framing.',
'remediation': 'Add the header: X-Frame-Options: DENY or SAMEORIGIN',
'check_value': lambda v: v.upper() in ['DENY', 'SAMEORIGIN'],
},
'Referrer-Policy': {
'severity': 'low',
'description': 'Controls how much referrer information is sent with requests.',
'remediation': (
'Add the header: Referrer-Policy: strict-origin-when-cross-origin '
'or no-referrer-when-downgrade'
),
'check_value': lambda v: v.lower() in [
'no-referrer', 'no-referrer-when-downgrade',
'strict-origin', 'strict-origin-when-cross-origin',
'same-origin', 'origin', 'origin-when-cross-origin'
],
},
'Permissions-Policy': {
'severity': 'low',
'description': 'Controls which browser features can be used.',
'remediation': (
'Add a Permissions-Policy header to restrict access to sensitive browser APIs '
'like geolocation, camera, and microphone.'
),
'check_value': lambda v: len(v) > 0,
},
'X-XSS-Protection': {
'severity': 'info',
'description': 'Legacy XSS filter (deprecated in modern browsers, CSP is preferred).',
'remediation': 'While deprecated, you can add: X-XSS-Protection: 1; mode=block',
'check_value': lambda v: '1' in v,
},
}
# CORS security checks
CORS_CHECKS = {
'permissive_origin': {
'severity': 'high',
'title': 'Overly permissive CORS (Access-Control-Allow-Origin: *)',
'description': (
'The server allows requests from any origin. This can expose sensitive data '
'to malicious websites if combined with credentials.'
),
'remediation': (
'Restrict Access-Control-Allow-Origin to specific trusted domains instead of using *. '
'Never use * with Access-Control-Allow-Credentials: true.'
),
},
'credentials_with_wildcard': {
'severity': 'critical',
'title': 'CORS allows credentials with wildcard origin',
'description': (
'The server has Access-Control-Allow-Credentials: true with Access-Control-Allow-Origin: *. '
'This is a severe misconfiguration that can allow credential theft.'
),
'remediation': (
'Never combine Access-Control-Allow-Credentials: true with a wildcard origin. '
'Implement a whitelist of allowed origins.'
),
},
}
class HeaderScanner(BaseScanner):
"""
Scanner for HTTP security headers.
Checks for:
- Missing security headers
- Improperly configured headers
- CORS misconfigurations
- Cookie security flags
"""
name = "header_check"
def __init__(self, config: Optional[Dict[str, Any]] = None):
super().__init__(config)
self.timeout = self.config.get('timeout', 30)
def run(self, url: str) -> ScannerResult:
"""
Run header security analysis on the URL.
Args:
url: The URL to analyze
Returns:
ScannerResult with header findings
"""
self.logger.info(f"Starting header scan for {url}")
try:
# Make both GET and HEAD requests
headers_data = self._fetch_headers(url)
issues = []
metrics = []
# Check security headers
header_issues, header_score = self._check_security_headers(
headers_data['headers']
)
issues.extend(header_issues)
# Check CORS configuration
cors_issues = self._check_cors(headers_data['headers'], url)
issues.extend(cors_issues)
# Check cookies
cookie_issues = self._check_cookies(headers_data['headers'], url)
issues.extend(cookie_issues)
# Create metrics
metrics.append(MetricData(
name='security_headers_score',
display_name='Security Headers Score',
value=float(header_score),
unit='percent',
source='header_check'
))
metrics.append(MetricData(
name='headers_missing_count',
display_name='Missing Security Headers',
value=float(len([i for i in header_issues if 'missing' in i.title.lower()])),
unit='count',
source='header_check'
))
self.logger.info(
f"Header scan complete: {len(issues)} issues, score: {header_score}"
)
return ScannerResult(
scanner_name=self.name,
status=ScannerStatus.SUCCESS,
issues=issues,
metrics=metrics,
raw_data=headers_data
)
except httpx.TimeoutException:
return self._create_error_result(Exception("Header check timed out"))
except Exception as e:
return self._create_error_result(e)
def _fetch_headers(self, url: str) -> Dict[str, Any]:
"""Fetch headers from the URL."""
with httpx.Client(
timeout=self.timeout,
follow_redirects=True,
verify=True
) as client:
# GET request
get_response = client.get(url)
# HEAD request
head_response = client.head(url)
return {
'url': str(get_response.url),
'status_code': get_response.status_code,
'headers': dict(get_response.headers),
'head_headers': dict(head_response.headers),
'redirected': str(get_response.url) != url,
'redirect_history': [str(r.url) for r in get_response.history],
}
def _check_security_headers(
self,
headers: Dict[str, str]
) -> Tuple[List[IssueData], int]:
"""
Check for security headers.
Returns:
Tuple of (list of issues, security score 0-100)
"""
issues = []
score = 100
headers_lower = {k.lower(): v for k, v in headers.items()}
for header_name, config in SECURITY_HEADERS.items():
header_key = header_name.lower()
if header_key not in headers_lower:
# Missing header
severity = config['severity']
deduction = {'critical': 20, 'high': 15, 'medium': 10, 'low': 5, 'info': 2}
score -= deduction.get(severity, 5)
issues.append(IssueData(
category='headers',
severity=severity,
title=f'Missing security header: {header_name}',
description=config['description'],
tool='header_check',
remediation=config['remediation'],
raw_data={'header': header_name, 'status': 'missing'}
))
else:
# Header present, check value
value = headers_lower[header_key]
check_func = config.get('check_value')
if check_func and not check_func(value):
issues.append(IssueData(
category='headers',
severity='low',
title=f'Weak configuration: {header_name}',
description=(
f"{config['description']} "
f"Current value may not provide optimal protection: {value}"
),
tool='header_check',
remediation=config['remediation'],
raw_data={'header': header_name, 'value': value, 'status': 'weak'}
))
score -= 3
return issues, max(0, score)
def _check_cors(self, headers: Dict[str, str], url: str) -> List[IssueData]:
"""Check CORS configuration for issues."""
issues = []
headers_lower = {k.lower(): v for k, v in headers.items()}
acao = headers_lower.get('access-control-allow-origin', '')
acac = headers_lower.get('access-control-allow-credentials', '')
if acao == '*':
if acac.lower() == 'true':
# Critical: credentials with wildcard
check = CORS_CHECKS['credentials_with_wildcard']
issues.append(IssueData(
category='cors',
severity=check['severity'],
title=check['title'],
description=check['description'],
tool='header_check',
affected_url=url,
remediation=check['remediation'],
raw_data={
'Access-Control-Allow-Origin': acao,
'Access-Control-Allow-Credentials': acac
}
))
else:
# Warning: permissive origin
check = CORS_CHECKS['permissive_origin']
issues.append(IssueData(
category='cors',
severity='medium', # Lower severity without credentials
title=check['title'],
description=check['description'],
tool='header_check',
affected_url=url,
remediation=check['remediation'],
raw_data={'Access-Control-Allow-Origin': acao}
))
return issues
def _check_cookies(self, headers: Dict[str, str], url: str) -> List[IssueData]:
"""Check Set-Cookie headers for security flags."""
issues = []
headers_lower = {k.lower(): v for k, v in headers.items()}
# Get all Set-Cookie headers
set_cookies = []
for key, value in headers.items():
if key.lower() == 'set-cookie':
set_cookies.append(value)
is_https = url.startswith('https://')
for cookie in set_cookies:
cookie_lower = cookie.lower()
cookie_name = cookie.split('=')[0] if '=' in cookie else 'unknown'
cookie_issues = []
# Check Secure flag on HTTPS
if is_https and 'secure' not in cookie_lower:
cookie_issues.append({
'flag': 'Secure',
'description': (
'Cookie is set without Secure flag on HTTPS site. '
'This allows the cookie to be sent over unencrypted connections.'
),
'severity': 'high'
})
# Check HttpOnly flag (important for session cookies)
if 'httponly' not in cookie_lower:
# Check if it might be a session cookie
if any(term in cookie_name.lower() for term in ['session', 'auth', 'token', 'user']):
cookie_issues.append({
'flag': 'HttpOnly',
'description': (
'Session-like cookie is set without HttpOnly flag. '
'This allows JavaScript access, increasing XSS risk.'
),
'severity': 'high'
})
else:
cookie_issues.append({
'flag': 'HttpOnly',
'description': (
'Cookie is set without HttpOnly flag. '
'Consider adding it unless JavaScript needs access.'
),
'severity': 'low'
})
# Check SameSite attribute
if 'samesite' not in cookie_lower:
cookie_issues.append({
'flag': 'SameSite',
'description': (
'Cookie is set without SameSite attribute. '
'This can enable CSRF attacks in some scenarios.'
),
'severity': 'medium'
})
elif 'samesite=none' in cookie_lower and 'secure' not in cookie_lower:
cookie_issues.append({
'flag': 'SameSite=None without Secure',
'description': (
'Cookie has SameSite=None but no Secure flag. '
'Modern browsers will reject this cookie.'
),
'severity': 'medium'
})
# Create issues for this cookie
for ci in cookie_issues:
issues.append(IssueData(
category='security',
severity=ci['severity'],
title=f"Cookie '{cookie_name}' missing {ci['flag']} flag",
description=ci['description'],
tool='header_check',
affected_url=url,
remediation=(
f"Add the {ci['flag']} flag to the Set-Cookie header. "
f"Example: Set-Cookie: {cookie_name}=value; Secure; HttpOnly; SameSite=Strict"
),
raw_data={'cookie': cookie[:200]} # Truncate for storage
))
return issues
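
Run outside Celery, the header scanner can be exercised directly; a small usage sketch:

# Sketch: invoke HeaderScanner on its own and print its findings.
scanner = HeaderScanner({"timeout": 10})
result = scanner.run("https://example.com")
print(result.status, f"{len(result.issues)} issues")
for issue in result.issues:
    print(f"[{issue.severity}] {issue.title}")
for metric in result.metrics:
    print(f"{metric.display_name}: {metric.value} {metric.unit}")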

@@ -0,0 +1,323 @@
"""
Lighthouse Scanner Integration.
This module integrates with Google Lighthouse to measure
performance, accessibility, SEO, and best practices.
"""
import logging
from typing import Any, Dict, Optional
import httpx
from django.conf import settings
from .base import (
BaseScanner,
ScannerResult,
ScannerStatus,
IssueData,
MetricData,
)
logger = logging.getLogger(__name__)
class LighthouseScanner(BaseScanner):
"""
Scanner that uses Google Lighthouse for performance analysis.
Communicates with the Lighthouse service container via HTTP API.
Collects performance metrics, Core Web Vitals, and various audits.
"""
name = "lighthouse"
def __init__(self, config: Optional[Dict[str, Any]] = None):
super().__init__(config)
self.service_url = self.config.get(
'service_url',
'http://lighthouse:3001'
)
self.timeout = self.config.get('timeout', 120)
def is_available(self) -> bool:
"""Check if Lighthouse service is available."""
try:
with httpx.Client(timeout=5) as client:
response = client.get(f"{self.service_url}/health")
return response.status_code == 200
except Exception as e:
self.logger.warning(f"Lighthouse service not available: {e}")
return False
def run(self, url: str) -> ScannerResult:
"""
Run Lighthouse scan against the URL.
Args:
url: The URL to analyze
Returns:
ScannerResult with performance metrics and issues
"""
self.logger.info(f"Starting Lighthouse scan for {url}")
try:
with httpx.Client(timeout=self.timeout) as client:
response = client.post(
f"{self.service_url}/scan",
json={"url": url}
)
response.raise_for_status()
data = response.json()
return self._parse_results(url, data)
except httpx.TimeoutException:
return self._create_error_result(
Exception("Lighthouse scan timed out")
)
except httpx.HTTPStatusError as e:
return self._create_error_result(
Exception(f"Lighthouse service error: {e.response.status_code}")
)
except Exception as e:
return self._create_error_result(e)
def _parse_results(self, url: str, data: Dict[str, Any]) -> ScannerResult:
"""
Parse Lighthouse results into ScannerResult format.
Args:
url: The scanned URL
data: Raw Lighthouse response data
Returns:
Parsed ScannerResult
"""
issues = []
metrics = []
# Extract scores
scores = data.get('scores', {})
# Extract and create metrics
raw_metrics = data.get('metrics', {})
# Core Web Vitals
metric_mappings = [
('firstContentfulPaint', 'First Contentful Paint', 'ms'),
('largestContentfulPaint', 'Largest Contentful Paint', 'ms'),
('speedIndex', 'Speed Index', 'ms'),
('timeToInteractive', 'Time to Interactive', 'ms'),
('totalBlockingTime', 'Total Blocking Time', 'ms'),
('cumulativeLayoutShift', 'Cumulative Layout Shift', 'score'),
]
for key, display_name, unit in metric_mappings:
metric_data = raw_metrics.get(key, {})
if metric_data and metric_data.get('value') is not None:
metrics.append(MetricData(
name=self._to_snake_case(key),
display_name=display_name,
value=metric_data['value'],
unit=unit,
source='lighthouse',
score=metric_data.get('score')
))
# Resource metrics
resources = data.get('resources', {})
diagnostics = data.get('diagnostics', {})
if resources.get('totalByteWeight'):
metrics.append(MetricData(
name='total_byte_weight',
display_name='Total Page Weight',
value=resources['totalByteWeight'],
unit='bytes',
source='lighthouse'
))
if resources.get('bootupTime'):
metrics.append(MetricData(
name='javascript_bootup_time',
display_name='JavaScript Boot-up Time',
value=resources['bootupTime'],
unit='ms',
source='lighthouse'
))
if diagnostics.get('numRequests'):
metrics.append(MetricData(
name='total_requests',
display_name='Total Network Requests',
value=float(diagnostics['numRequests']),
unit='count',
source='lighthouse'
))
# Extract issues from failed audits
raw_issues = data.get('issues', [])
for issue in raw_issues:
severity = self._score_to_severity(issue.get('score', 0.5))
category = self._map_category(issue.get('category', 'performance'))
issues.append(IssueData(
category=category,
severity=severity,
title=issue.get('title', 'Unknown issue'),
description=issue.get('description', ''),
tool='lighthouse',
affected_url=url,
remediation=self._get_remediation(issue.get('id')),
raw_data=issue
))
# Check for large bundles
large_scripts = resources.get('scriptTreemap', [])
for script in large_scripts[:5]: # Top 5 largest
if script.get('resourceBytes', 0) > settings.SCANNER_CONFIG.get(
'LARGE_JS_BUNDLE_THRESHOLD_BYTES', 500 * 1024
):
issues.append(IssueData(
category='resources',
severity='medium',
title=f"Large JavaScript bundle detected",
description=(
f"The script '{script.get('name', 'Unknown')}' "
f"is {script['resourceBytes'] / 1024:.1f} KB. "
"Large bundles can slow down page load and increase memory usage."
),
tool='lighthouse',
affected_url=url,
remediation=(
"Consider code splitting, tree shaking, or lazy loading "
"to reduce bundle size."
),
raw_data=script
))
# Check for unused JavaScript
unused_js = resources.get('unusedJavascript', [])
if unused_js:
total_wasted = sum(u.get('wastedBytes', 0) for u in unused_js)
if total_wasted > 100 * 1024: # More than 100KB unused
issues.append(IssueData(
category='performance',
severity='medium',
title="Significant unused JavaScript detected",
description=(
f"Found {total_wasted / 1024:.1f} KB of unused JavaScript "
f"across {len(unused_js)} resources. This increases page "
"load time and memory usage."
),
tool='lighthouse',
affected_url=url,
remediation=(
"Remove unused code or use code splitting to load "
"JavaScript only when needed."
),
raw_data={'unused_resources': unused_js}
))
# Check for render-blocking resources
blocking = resources.get('renderBlockingResources', [])
if blocking:
total_wasted_ms = sum(r.get('wastedMs', 0) for r in blocking)
if total_wasted_ms > 500:
issues.append(IssueData(
category='performance',
severity='medium',
title="Render-blocking resources detected",
description=(
f"Found {len(blocking)} render-blocking resources "
f"adding approximately {total_wasted_ms:.0f}ms to page load. "
"These resources delay first paint."
),
tool='lighthouse',
affected_url=url,
remediation=(
"Consider inlining critical CSS, deferring non-critical JS, "
"or using async/defer attributes."
),
raw_data={'blocking_resources': blocking}
))
self.logger.info(
f"Lighthouse scan complete: {len(issues)} issues, {len(metrics)} metrics"
)
return ScannerResult(
scanner_name=self.name,
status=ScannerStatus.SUCCESS,
issues=issues,
metrics=metrics,
scores={
'performance': scores.get('performance', 0),
'accessibility': scores.get('accessibility', 0),
'best_practices': scores.get('bestPractices', 0),
'seo': scores.get('seo', 0),
},
raw_data=data
)
def _to_snake_case(self, name: str) -> str:
"""Convert camelCase to snake_case."""
import re
s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
def _score_to_severity(self, score: float) -> str:
"""Convert Lighthouse score to severity level."""
if score is None:
return 'info'
elif score < 0.25:
return 'high'
elif score < 0.5:
return 'medium'
elif score < 0.75:
return 'low'
else:
return 'info'
def _map_category(self, lighthouse_category: str) -> str:
"""Map Lighthouse category to our category."""
mapping = {
'performance': 'performance',
'accessibility': 'accessibility',
'best-practices': 'best_practices',
'seo': 'seo',
}
return mapping.get(lighthouse_category, 'performance')
def _get_remediation(self, audit_id: str) -> str:
"""Get remediation text for known audit IDs."""
remediations = {
'first-contentful-paint': (
"Reduce server response time, eliminate render-blocking resources, "
"and optimize critical rendering path."
),
'largest-contentful-paint': (
"Optimize images, preload critical resources, and reduce server "
"response time."
),
'total-blocking-time': (
"Reduce JavaScript execution time by breaking up long tasks, "
"removing unused code, and minimizing main thread work."
),
'cumulative-layout-shift': (
"Always include size attributes on images and videos, reserve space "
"for ad slots, and avoid inserting content above existing content."
),
'speed-index': (
"Minimize main thread work, reduce JavaScript execution time, "
"and ensure text remains visible during webfont load."
),
'interactive': (
"Reduce JavaScript payload, defer non-critical scripts, and "
"minimize main thread work."
),
}
return remediations.get(audit_id, "Review and optimize based on the audit details.")
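
The Lighthouse sidecar's response contract lives in the service container and is not shown here; inferred from `_parse_results`, a payload it can consume looks roughly like:

# Illustrative payload only, shaped to match what _parse_results reads.
example_response = {
    "scores": {"performance": 92, "accessibility": 88, "bestPractices": 95, "seo": 90},
    "metrics": {
        "firstContentfulPaint": {"value": 1240.0, "score": 0.93},
        "largestContentfulPaint": {"value": 2480.0, "score": 0.81},
        "cumulativeLayoutShift": {"value": 0.02, "score": 0.99},
    },
    "resources": {"totalByteWeight": 1843200, "bootupTime": 310.5},
    "diagnostics": {"numRequests": 54},
    "issues": [
        {
            "id": "render-blocking-resources",
            "title": "Eliminate render-blocking resources",
            "description": "Resources are blocking the first paint of the page.",
            "score": 0.4,
            "category": "performance",
        },
    ],
}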

@@ -0,0 +1,397 @@
"""
Playwright Scanner Integration.
This module uses Playwright to perform browser-based analysis,
capturing console errors, network requests, and resource metrics.
"""
import asyncio
import logging
import time
from typing import Any, Dict, List, Optional
from django.conf import settings
from .base import (
BaseScanner,
ScannerResult,
ScannerStatus,
IssueData,
MetricData,
)
logger = logging.getLogger(__name__)
class PlaywrightScanner(BaseScanner):
"""
Scanner using Playwright for browser-based analysis.
Captures:
- Console errors and warnings
- Network request details
- Page load timing
- Large resources (images, scripts)
- Memory usage indicators
"""
name = "playwright"
def __init__(self, config: Optional[Dict[str, Any]] = None):
super().__init__(config)
self.timeout = self.config.get('timeout', 30000) # 30 seconds
self.viewport = self.config.get('viewport', {'width': 1920, 'height': 1080})
def run(self, url: str) -> ScannerResult:
"""
Run Playwright analysis on the URL.
Args:
url: The URL to analyze
Returns:
ScannerResult with browser analysis data
"""
self.logger.info(f"Starting Playwright scan for {url}")
try:
# Run async scan in sync context
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
result = loop.run_until_complete(self._async_scan(url))
finally:
loop.close()
return result
except Exception as e:
return self._create_error_result(e)
async def _async_scan(self, url: str) -> ScannerResult:
"""
Async implementation of the scan.
Args:
url: The URL to analyze
Returns:
ScannerResult with findings
"""
from playwright.async_api import async_playwright
issues = []
metrics = []
raw_data = {
'console_messages': [],
'network_requests': [],
'failed_requests': [],
'large_resources': [],
}
async with async_playwright() as p:
browser = await p.chromium.launch(
headless=True,
args=[
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-gpu',
]
)
context = await browser.new_context(
viewport=self.viewport,
user_agent=(
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
'(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
)
)
page = await context.new_page()
# Collect data
console_messages = []
network_requests = []
failed_requests = []
# Set up event listeners
page.on("console", lambda msg: console_messages.append({
'type': msg.type,
'text': msg.text,
'location': str(msg.location) if msg.location else None,
}))
page.on("request", lambda req: network_requests.append({
'url': req.url,
'method': req.method,
'resource_type': req.resource_type,
'timestamp': time.time(),
}))
page.on("requestfailed", lambda req: failed_requests.append({
'url': req.url,
'failure': req.failure,
'resource_type': req.resource_type,
}))
# Navigate and measure
start_time = time.time()
try:
response = await page.goto(
url,
wait_until='networkidle',
timeout=self.timeout
)
load_time = (time.time() - start_time) * 1000 # Convert to ms
# Get response status
status_code = response.status if response else 0
# Wait a bit more for any delayed scripts
await page.wait_for_timeout(2000)
# Get performance timing
perf_timing = await page.evaluate('''() => {
const timing = performance.timing;
const navigation = performance.getEntriesByType("navigation")[0];
const firstPaint = performance.getEntriesByType("paint")
.find(e => e.name === "first-paint");
return {
domContentLoaded: timing.domContentLoadedEventEnd - timing.navigationStart,
domComplete: timing.domComplete - timing.navigationStart,
loadEvent: timing.loadEventEnd - timing.navigationStart,
firstPaint: firstPaint ? firstPaint.startTime : null,
transferSize: navigation ? navigation.transferSize : null,
};
}''')
# Get memory info (if available)
memory_info = await page.evaluate('''() => {
if (performance.memory) {
return {
usedJSHeapSize: performance.memory.usedJSHeapSize,
totalJSHeapSize: performance.memory.totalJSHeapSize,
jsHeapSizeLimit: performance.memory.jsHeapSizeLimit,
};
}
return null;
}''')
# Get resource sizes
resources = await page.evaluate('''() => {
const entries = performance.getEntriesByType("resource");
return entries.map(e => ({
name: e.name,
type: e.initiatorType,
transferSize: e.transferSize,
duration: e.duration,
}));
}''')
except Exception as e:
self.logger.warning(f"Page navigation error: {e}")
load_time = self.timeout
status_code = 0
perf_timing = {}
memory_info = None
resources = []
await browser.close()
# Process collected data
raw_data['console_messages'] = console_messages
raw_data['network_requests'] = network_requests[:100] # Limit stored
raw_data['failed_requests'] = failed_requests
# perf_timing, memory_info and status_code are set in both the success and failure paths above
raw_data['performance_timing'] = perf_timing
raw_data['memory_info'] = memory_info
raw_data['status_code'] = status_code
# Create metrics
metrics.append(MetricData(
name='page_load_time',
display_name='Page Load Time',
value=load_time,
unit='ms',
source='playwright'
))
metrics.append(MetricData(
name='total_network_requests',
display_name='Total Network Requests',
value=float(len(network_requests)),
unit='count',
source='playwright'
))
# Calculate total transfer size
total_transfer = sum(r.get('transferSize', 0) for r in resources if r.get('transferSize'))
if total_transfer > 0:
metrics.append(MetricData(
name='total_transfer_size',
display_name='Total Transfer Size',
value=float(total_transfer),
unit='bytes',
source='playwright'
))
if perf_timing.get('domContentLoaded'):
metrics.append(MetricData(
name='dom_content_loaded',
display_name='DOM Content Loaded',
value=float(perf_timing['domContentLoaded']),
unit='ms',
source='playwright'
))
# Memory metrics
if memory_info:
metrics.append(MetricData(
name='js_heap_used',
display_name='JS Heap Used',
value=float(memory_info.get('usedJSHeapSize', 0)),
unit='bytes',
source='playwright'
))
# Check for high memory usage
heap_used = memory_info.get('usedJSHeapSize', 0)
heap_limit = memory_info.get('jsHeapSizeLimit', 1)
heap_percent = (heap_used / heap_limit) * 100 if heap_limit > 0 else 0
if heap_percent > 50:
issues.append(IssueData(
category='resources',
severity='medium',
title='High JavaScript memory usage',
description=(
f'JavaScript is using {heap_used / (1024*1024):.1f} MB '
f'({heap_percent:.1f}% of available heap). '
'This may indicate memory-heavy operations or potential leaks.'
),
tool='playwright',
affected_url=url,
remediation=(
'Review JavaScript for memory leaks, optimize data structures, '
'and ensure proper cleanup of event listeners and timers.'
),
raw_data=memory_info
))
# Analyze console messages for errors
errors = [m for m in console_messages if m['type'] == 'error']
warnings = [m for m in console_messages if m['type'] == 'warning']
metrics.append(MetricData(
name='console_errors_count',
display_name='Console Errors',
value=float(len(errors)),
unit='count',
source='playwright'
))
metrics.append(MetricData(
name='console_warnings_count',
display_name='Console Warnings',
value=float(len(warnings)),
unit='count',
source='playwright'
))
# Create issues for console errors
if errors:
# Group similar errors
error_texts = set(e['text'][:200] for e in errors)
for error_text in list(error_texts)[:10]: # Limit to 10 unique errors
issues.append(IssueData(
category='content',
severity='medium',
title='JavaScript console error',
description=f'JavaScript error logged to console: {error_text}',
tool='playwright',
affected_url=url,
remediation='Review and fix the JavaScript error in your code.',
raw_data={'error': error_text}
))
# Check for failed network requests
if failed_requests:
for req in failed_requests[:5]: # Limit reported
issues.append(IssueData(
category='content',
severity='low',
title='Failed network request',
description=(
f"Request to {req['url'][:100]} failed: {req.get('failure', 'Unknown error')}"
),
tool='playwright',
affected_url=req['url'],
remediation='Ensure the resource is available and CORS is configured correctly.',
raw_data=req
))
# Find large resources
large_threshold = settings.SCANNER_CONFIG.get('LARGE_IMAGE_THRESHOLD_BYTES', 1024 * 1024)
large_resources = [
r for r in resources
if r.get('transferSize', 0) > large_threshold
]
for resource in large_resources[:5]: # Limit reported
size_mb = resource['transferSize'] / (1024 * 1024)
issues.append(IssueData(
category='resources',
severity='medium' if size_mb > 2 else 'low',
title=f"Large resource detected ({size_mb:.1f} MB)",
description=(
f"The resource '{resource['name'][-80:]}' is {size_mb:.2f} MB. "
"Large resources increase page load time and bandwidth usage."
),
tool='playwright',
affected_url=resource['name'],
remediation=(
'Optimize images using compression, use appropriate formats (WebP, AVIF), '
'implement lazy loading, or consider a CDN.'
),
raw_data=resource
))
raw_data['large_resources'] = large_resources
# Count resources by type
resource_counts = {}
for req in network_requests:
rtype = req.get('resource_type', 'other')
resource_counts[rtype] = resource_counts.get(rtype, 0) + 1
raw_data['resource_counts'] = resource_counts
# Check for excessive requests
if len(network_requests) > 100:
issues.append(IssueData(
category='performance',
severity='medium',
title='High number of network requests',
description=(
f'Page made {len(network_requests)} network requests. '
'Excessive requests increase page load time and server load.'
),
tool='playwright',
affected_url=url,
remediation=(
'Consolidate resources, use HTTP/2 multiplexing, implement '
'resource bundling, and lazy load non-critical resources.'
),
raw_data=resource_counts
))
self.logger.info(
f"Playwright scan complete: {len(issues)} issues, {len(metrics)} metrics"
)
return ScannerResult(
scanner_name=self.name,
status=ScannerStatus.SUCCESS,
issues=issues,
metrics=metrics,
raw_data=raw_data
)
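A minimal sketch of calling this scanner directly, assuming the same 'timeout' and 'viewport' config keys read in __init__; run() blocks because it drives its own event loop.

# Illustrative sketch only, not part of the committed module.
from scanner.scanners.playwright_scanner import PlaywrightScanner  # path assumed from the runner imports

scanner = PlaywrightScanner(config={
    'timeout': 15000,                           # ms, passed to page.goto()
    'viewport': {'width': 1366, 'height': 768},
})
result = scanner.run('https://example.com')     # blocking; spins up its own asyncio loop
for metric in result.metrics:
    print(metric.name, metric.value, metric.unit)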

View File

@ -0,0 +1,314 @@
"""
Scan Runner - Orchestrates multiple scanners.
This module coordinates running all enabled scanners against a URL
and aggregates their results into a unified report.
"""
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Any, Dict, List, Optional, Type
from django.conf import settings
from .base import BaseScanner, ScannerResult, ScannerStatus
from .lighthouse import LighthouseScanner
from .playwright_scanner import PlaywrightScanner
from .zap import ZAPScanner
from .headers import HeaderScanner
from .tls import TLSScanner
logger = logging.getLogger(__name__)
# Default scanner classes to run
DEFAULT_SCANNERS: List[Type[BaseScanner]] = [
LighthouseScanner,
PlaywrightScanner,
ZAPScanner,
HeaderScanner,
TLSScanner,
]
class ScanRunner:
"""
Orchestrates running multiple scanners and aggregating results.
This class manages:
- Running enabled scanners in parallel or sequence
- Aggregating results from all scanners
- Error handling and partial result compilation
- Timeout management
"""
def __init__(
self,
scanner_classes: Optional[List[Type[BaseScanner]]] = None,
config: Optional[Dict[str, Any]] = None,
max_workers: int = 3
):
"""
Initialize the scan runner.
Args:
scanner_classes: List of scanner classes to use (defaults to all)
config: Configuration dict passed to each scanner
max_workers: Maximum concurrent scanner threads
"""
self.scanner_classes = scanner_classes or DEFAULT_SCANNERS
self.config = config or {}
self.max_workers = max_workers
self.logger = logging.getLogger(__name__)
def run(self, url: str, parallel: bool = True) -> Dict[str, Any]:
"""
Run all scanners against the URL.
Args:
url: The URL to scan
parallel: Whether to run scanners in parallel
Returns:
Aggregated results dictionary containing:
- status: Overall scan status
- scores: Aggregated scores
- issues: All issues from all scanners
- metrics: All metrics from all scanners
- scanner_results: Individual scanner results
- errors: Any scanner errors
"""
self.logger.info(f"Starting scan runner for {url} with {len(self.scanner_classes)} scanners")
# Initialize scanners
scanners = self._initialize_scanners()
# Run scanners
if parallel:
results = self._run_parallel(scanners, url)
else:
results = self._run_sequential(scanners, url)
# Aggregate results
aggregated = self._aggregate_results(results)
self.logger.info(
f"Scan complete: {len(aggregated['issues'])} issues, "
f"{len(aggregated['metrics'])} metrics, "
f"status: {aggregated['status']}"
)
return aggregated
def _initialize_scanners(self) -> List[BaseScanner]:
"""Initialize scanner instances."""
scanners = []
scanner_config = settings.SCANNER_CONFIG
for scanner_class in self.scanner_classes:
try:
# Merge default config with scanner-specific config
config = {**self.config}
# Add scanner-specific config
if scanner_class == LighthouseScanner:
config['service_url'] = 'http://lighthouse:3001'
config['timeout'] = scanner_config.get('LIGHTHOUSE_TIMEOUT', 60)
elif scanner_class == ZAPScanner:
config['zap_host'] = scanner_config.get('ZAP_HOST')
config['api_key'] = scanner_config.get('ZAP_API_KEY')
config['timeout'] = scanner_config.get('ZAP_TIMEOUT', 120)
elif scanner_class == PlaywrightScanner:
config['timeout'] = scanner_config.get('PLAYWRIGHT_TIMEOUT', 30000)
config['viewport'] = scanner_config.get('PLAYWRIGHT_VIEWPORT', {'width': 1920, 'height': 1080})
scanner = scanner_class(config=config)
scanners.append(scanner)
except Exception as e:
self.logger.error(f"Failed to initialize {scanner_class.__name__}: {e}")
return scanners
def _run_parallel(
self,
scanners: List[BaseScanner],
url: str
) -> Dict[str, ScannerResult]:
"""Run scanners in parallel using thread pool."""
results = {}
with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
# Submit all scanner tasks
future_to_scanner = {
executor.submit(self._run_scanner, scanner, url): scanner
for scanner in scanners
}
# Collect results as they complete
for future in as_completed(future_to_scanner):
scanner = future_to_scanner[future]
try:
result = future.result()
results[scanner.name] = result
except Exception as e:
self.logger.error(f"Scanner {scanner.name} raised exception: {e}")
results[scanner.name] = ScannerResult(
scanner_name=scanner.name,
status=ScannerStatus.FAILED,
error_message=str(e)
)
return results
def _run_sequential(
self,
scanners: List[BaseScanner],
url: str
) -> Dict[str, ScannerResult]:
"""Run scanners sequentially."""
results = {}
for scanner in scanners:
result = self._run_scanner(scanner, url)
results[scanner.name] = result
return results
def _run_scanner(self, scanner: BaseScanner, url: str) -> ScannerResult:
"""Run a single scanner with error handling."""
self.logger.info(f"Running scanner: {scanner.name}")
try:
# Check availability first
if not scanner.is_available():
self.logger.warning(f"Scanner {scanner.name} is not available")
return ScannerResult(
scanner_name=scanner.name,
status=ScannerStatus.SKIPPED,
error_message=f"{scanner.name} service is not available"
)
# Run the scanner
result = scanner.run(url)
self.logger.info(
f"Scanner {scanner.name} completed with status: {result.status}"
)
return result
except Exception as e:
self.logger.error(f"Scanner {scanner.name} failed: {e}")
return ScannerResult(
scanner_name=scanner.name,
status=ScannerStatus.FAILED,
error_message=str(e)
)
def _aggregate_results(
self,
results: Dict[str, ScannerResult]
) -> Dict[str, Any]:
"""Aggregate results from all scanners."""
all_issues = []
all_metrics = []
all_scores = {}
raw_data = {}
errors = []
successful_scanners = 0
failed_scanners = 0
for scanner_name, result in results.items():
# Track scanner status
if result.status == ScannerStatus.SUCCESS:
successful_scanners += 1
elif result.status == ScannerStatus.FAILED:
failed_scanners += 1
if result.error_message:
errors.append({
'scanner': scanner_name,
'error': result.error_message
})
elif result.status == ScannerStatus.PARTIAL:
successful_scanners += 1
# Collect issues
for issue in result.issues:
all_issues.append({
'category': issue.category,
'severity': issue.severity,
'title': issue.title,
'description': issue.description,
'tool': issue.tool,
'affected_url': issue.affected_url,
'remediation': issue.remediation,
'raw_data': issue.raw_data,
})
# Collect metrics
for metric in result.metrics:
all_metrics.append({
'name': metric.name,
'display_name': metric.display_name,
'value': metric.value,
'unit': metric.unit,
'source': metric.source,
'score': metric.score,
})
# Collect scores
if result.scores:
all_scores[scanner_name] = result.scores
# Store raw data
if result.raw_data:
raw_data[scanner_name] = result.raw_data
# Determine overall status
if failed_scanners == len(results):
overall_status = 'failed'
elif failed_scanners > 0:
overall_status = 'partial'
else:
overall_status = 'done'
# Calculate aggregated scores
aggregated_scores = self._calculate_aggregated_scores(all_scores)
return {
'status': overall_status,
'scores': aggregated_scores,
'issues': all_issues,
'metrics': all_metrics,
'scanner_results': {
name: {
'status': result.status.value,
'error': result.error_message,
}
for name, result in results.items()
},
'raw_data': raw_data,
'errors': errors,
'summary': {
'total_scanners': len(results),
'successful': successful_scanners,
'failed': failed_scanners,
'total_issues': len(all_issues),
'total_metrics': len(all_metrics),
}
}
def _calculate_aggregated_scores(
self,
scanner_scores: Dict[str, Dict[str, int]]
) -> Dict[str, Optional[int]]:
"""Calculate aggregated scores from all scanners."""
# Lighthouse provides the main scores
lighthouse_scores = scanner_scores.get('lighthouse', {})
return {
'performance': lighthouse_scores.get('performance'),
'accessibility': lighthouse_scores.get('accessibility'),
'best_practices': lighthouse_scores.get('best_practices'),
'seo': lighthouse_scores.get('seo'),
}
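A minimal sketch of driving the runner outside Celery (for example from manage.py shell), showing the shape of the aggregated dictionary documented in run():

# Illustrative sketch only.
from scanner.scanners import ScanRunner

runner = ScanRunner(max_workers=3)
report = runner.run('https://example.com', parallel=True)

print(report['status'])                    # 'done', 'partial' or 'failed'
print(report['scores'])                    # {'performance': ..., 'accessibility': ..., ...}
print(report['summary']['total_issues'])   # aggregate issue count across scanners
for err in report['errors']:
    print(err['scanner'], err['error'])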

View File

@ -0,0 +1,380 @@
"""
TLS/SSL Security Scanner.
This module checks TLS/SSL configuration and certificate validity.
"""
import logging
import socket
import ssl
from datetime import datetime, timezone
from typing import Any, Dict, Optional
from urllib.parse import urlparse
from .base import (
BaseScanner,
ScannerResult,
ScannerStatus,
IssueData,
MetricData,
)
logger = logging.getLogger(__name__)
class TLSScanner(BaseScanner):
"""
Scanner for TLS/SSL certificate and configuration.
Checks:
- Certificate validity
- Certificate expiration
- HTTPS availability
- HTTP to HTTPS redirect
"""
name = "tls_check"
def __init__(self, config: Optional[Dict[str, Any]] = None):
super().__init__(config)
self.timeout = self.config.get('timeout', 10)
def run(self, url: str) -> ScannerResult:
"""
Run TLS/SSL analysis on the URL.
Args:
url: The URL to analyze
Returns:
ScannerResult with TLS findings
"""
self.logger.info(f"Starting TLS scan for {url}")
try:
parsed = urlparse(url)
hostname = parsed.netloc.split(':')[0]
port = parsed.port or (443 if parsed.scheme == 'https' else 80)
issues = []
metrics = []
raw_data = {}
# Check if site is HTTPS
if parsed.scheme == 'http':
# Check if HTTPS is available
https_available, https_result = self._check_https_available(hostname)
raw_data['https_available'] = https_available
raw_data['https_check'] = https_result
if https_available:
issues.append(IssueData(
category='tls',
severity='high',
title='Site accessed over HTTP but HTTPS is available',
description=(
'The site was accessed over unencrypted HTTP, but HTTPS '
'appears to be available. All traffic should use HTTPS.'
),
tool='tls_check',
affected_url=url,
remediation=(
'Redirect all HTTP traffic to HTTPS using a 301 redirect. '
'Implement HSTS to prevent future HTTP access.'
)
))
else:
issues.append(IssueData(
category='tls',
severity='critical',
title='Site does not support HTTPS',
description=(
'The site does not appear to have HTTPS configured. '
'All data transmitted is unencrypted and vulnerable to interception.'
),
tool='tls_check',
affected_url=url,
remediation=(
'Configure TLS/SSL for your server. Obtain a certificate from '
"Let's Encrypt (free) or a commercial CA."
)
))
metrics.append(MetricData(
name='tls_enabled',
display_name='TLS Enabled',
value=0.0,
unit='score',
source='tls_check'
))
return ScannerResult(
scanner_name=self.name,
status=ScannerStatus.SUCCESS,
issues=issues,
metrics=metrics,
raw_data=raw_data
)
# For HTTPS URLs, check certificate
cert_info = self._get_certificate_info(hostname, port)
raw_data['certificate'] = cert_info
if cert_info.get('error'):
issues.append(IssueData(
category='tls',
severity='critical',
title='Certificate validation failed',
description=f"SSL certificate error: {cert_info['error']}",
tool='tls_check',
affected_url=url,
remediation=(
'Ensure your SSL certificate is valid, not expired, '
'and properly configured for your domain.'
)
))
metrics.append(MetricData(
name='certificate_valid',
display_name='Certificate Valid',
value=0.0,
unit='score',
source='tls_check'
))
else:
# Certificate is valid
metrics.append(MetricData(
name='certificate_valid',
display_name='Certificate Valid',
value=1.0,
unit='score',
source='tls_check'
))
metrics.append(MetricData(
name='tls_enabled',
display_name='TLS Enabled',
value=1.0,
unit='score',
source='tls_check'
))
# Check expiration
if cert_info.get('expires'):
try:
expires = datetime.strptime(
cert_info['expires'],
'%b %d %H:%M:%S %Y %Z'
)
expires = expires.replace(tzinfo=timezone.utc)
now = datetime.now(timezone.utc)
days_until_expiry = (expires - now).days
metrics.append(MetricData(
name='certificate_days_until_expiry',
display_name='Days Until Certificate Expiry',
value=float(days_until_expiry),
unit='count',
source='tls_check'
))
if days_until_expiry <= 0:
issues.append(IssueData(
category='tls',
severity='critical',
title='SSL certificate has expired',
description=(
f"The SSL certificate expired on {cert_info['expires']}. "
"Users will see security warnings."
),
tool='tls_check',
affected_url=url,
remediation='Renew your SSL certificate immediately.'
))
elif days_until_expiry <= 7:
issues.append(IssueData(
category='tls',
severity='high',
title='SSL certificate expiring very soon',
description=(
f"The SSL certificate will expire in {days_until_expiry} days "
f"(on {cert_info['expires']}). Renew immediately."
),
tool='tls_check',
affected_url=url,
remediation='Renew your SSL certificate before it expires.'
))
elif days_until_expiry <= 30:
issues.append(IssueData(
category='tls',
severity='medium',
title='SSL certificate expiring soon',
description=(
f"The SSL certificate will expire in {days_until_expiry} days "
f"(on {cert_info['expires']}). Plan for renewal."
),
tool='tls_check',
affected_url=url,
remediation=(
'Renew your SSL certificate before expiration. '
"Consider using auto-renewal with Let's Encrypt."
)
))
except Exception as e:
self.logger.warning(f"Could not parse certificate expiry: {e}")
# Check certificate subject matches hostname
if cert_info.get('subject'):
subject_cn = dict(x[0] for x in cert_info['subject']).get('commonName', '')
san = cert_info.get('subjectAltName', [])
san_names = [name for type_, name in san if type_ == 'DNS']
hostname_matched = self._hostname_matches_cert(
hostname, subject_cn, san_names
)
if not hostname_matched:
issues.append(IssueData(
category='tls',
severity='high',
title='Certificate hostname mismatch',
description=(
f"The SSL certificate is for '{subject_cn}' but "
f"the site is accessed as '{hostname}'."
),
tool='tls_check',
affected_url=url,
remediation=(
'Obtain a certificate that includes your domain name, '
'or add it to the Subject Alternative Names (SAN).'
)
))
# Check for HTTP to HTTPS redirect
if parsed.scheme == 'https':
redirect_info = self._check_http_redirect(hostname)
raw_data['http_redirect'] = redirect_info
if not redirect_info.get('redirects_to_https'):
issues.append(IssueData(
category='tls',
severity='medium',
title='No HTTP to HTTPS redirect',
description=(
'The site does not redirect HTTP requests to HTTPS. '
'Users accessing via HTTP will use an insecure connection.'
),
tool='tls_check',
affected_url=f"http://{hostname}",
remediation=(
'Configure your server to redirect all HTTP (port 80) '
'requests to HTTPS (port 443) with a 301 redirect.'
)
))
self.logger.info(f"TLS scan complete: {len(issues)} issues")
return ScannerResult(
scanner_name=self.name,
status=ScannerStatus.SUCCESS,
issues=issues,
metrics=metrics,
raw_data=raw_data
)
except Exception as e:
return self._create_error_result(e)
def _check_https_available(self, hostname: str) -> tuple:
"""Check if HTTPS is available for the hostname."""
try:
context = ssl.create_default_context()
with socket.create_connection((hostname, 443), timeout=self.timeout) as sock:
with context.wrap_socket(sock, server_hostname=hostname) as ssock:
return True, {'available': True, 'protocol': ssock.version()}
except ssl.SSLError as e:
return True, {'available': True, 'error': str(e)}
except Exception as e:
return False, {'available': False, 'error': str(e)}
def _get_certificate_info(self, hostname: str, port: int = 443) -> Dict:
"""Get SSL certificate information."""
try:
context = ssl.create_default_context()
with socket.create_connection((hostname, port), timeout=self.timeout) as sock:
with context.wrap_socket(sock, server_hostname=hostname) as ssock:
cert = ssock.getpeercert()
return {
'subject': cert.get('subject'),
'issuer': cert.get('issuer'),
'version': cert.get('version'),
'serialNumber': cert.get('serialNumber'),
'notBefore': cert.get('notBefore'),
'expires': cert.get('notAfter'),
'subjectAltName': cert.get('subjectAltName', []),
'protocol': ssock.version(),
'cipher': ssock.cipher(),
}
except ssl.SSLCertVerificationError as e:
return {'error': f"Certificate verification failed: {e.verify_message}"}
except ssl.SSLError as e:
return {'error': f"SSL error: {str(e)}"}
except socket.timeout:
return {'error': "Connection timed out"}
except Exception as e:
return {'error': str(e)}
def _hostname_matches_cert(
self,
hostname: str,
cn: str,
san_names: list
) -> bool:
"""Check if hostname matches certificate CN or SAN."""
all_names = [cn] + san_names
for name in all_names:
if name == hostname:
return True
# Handle wildcard certificates (require a label boundary so that
# 'badexample.com' does not match '*.example.com')
if name.startswith('*.'):
domain = name[2:]
if hostname.endswith('.' + domain):
# Ensure the wildcard only matches a single label
prefix = hostname[:-len(domain) - 1]
if '.' not in prefix:
return True
return False
def _check_http_redirect(self, hostname: str) -> Dict:
"""Check if HTTP redirects to HTTPS."""
import httpx
try:
with httpx.Client(
timeout=self.timeout,
follow_redirects=False
) as client:
response = client.get(f"http://{hostname}")
if response.status_code in (301, 302, 303, 307, 308):
location = response.headers.get('location', '')
redirects_to_https = location.startswith('https://')
return {
'redirects_to_https': redirects_to_https,
'status_code': response.status_code,
'location': location,
}
else:
return {
'redirects_to_https': False,
'status_code': response.status_code,
}
except Exception as e:
return {
'redirects_to_https': False,
'error': str(e),
}
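A quick sanity sketch of the wildcard-matching helper above; the hostnames are hypothetical.

# Illustrative sketch only.
scanner = TLSScanner()
assert scanner._hostname_matches_cert('www.example.com', '*.example.com', [])
assert scanner._hostname_matches_cert('example.com', 'other.example.com', ['example.com'])
# A wildcard covers exactly one label and must sit on a label boundary:
assert not scanner._hostname_matches_cert('a.b.example.com', '*.example.com', [])
assert not scanner._hostname_matches_cert('badexample.com', '*.example.com', [])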

View File

@ -0,0 +1,307 @@
"""
OWASP ZAP Scanner Integration.
This module integrates with OWASP ZAP for security scanning,
detecting vulnerabilities like XSS, injection flaws, and
misconfigurations.
"""
import logging
import time
from typing import Any, Dict, List, Optional
import httpx
from django.conf import settings
from .base import (
BaseScanner,
ScannerResult,
ScannerStatus,
IssueData,
MetricData,
)
logger = logging.getLogger(__name__)
class ZAPScanner(BaseScanner):
"""
Scanner using OWASP ZAP for security vulnerability detection.
Runs a limited spider crawl followed by an active scan to identify common security issues:
- XSS vulnerabilities
- SQL injection patterns
- Insecure cookies
- Missing security headers
- SSL/TLS issues
- And more...
"""
name = "owasp_zap"
def __init__(self, config: Optional[Dict[str, Any]] = None):
super().__init__(config)
scanner_config = settings.SCANNER_CONFIG
self.zap_host = self.config.get('zap_host', scanner_config.get('ZAP_HOST', 'http://zap:8080'))
self.api_key = self.config.get('api_key', scanner_config.get('ZAP_API_KEY', ''))
self.timeout = self.config.get('timeout', scanner_config.get('ZAP_TIMEOUT', 120))
def is_available(self) -> bool:
"""Check if ZAP service is available."""
try:
with httpx.Client(timeout=10) as client:
response = client.get(
f"{self.zap_host}/JSON/core/view/version/",
params={'apikey': self.api_key}
)
return response.status_code == 200
except Exception as e:
self.logger.warning(f"ZAP service not available: {e}")
return False
def run(self, url: str) -> ScannerResult:
"""
Run ZAP security scan against the URL.
Args:
url: The URL to scan
Returns:
ScannerResult with security findings
"""
self.logger.info(f"Starting ZAP scan for {url}")
try:
# Access the target to populate ZAP's site tree
self._access_url(url)
# Spider the site (limited crawl)
self._spider_url(url)
# Run active scan
self._active_scan(url)
# Get alerts
alerts = self._get_alerts(url)
return self._parse_results(url, alerts)
except httpx.TimeoutException:
return self._create_error_result(
Exception("ZAP scan timed out")
)
except httpx.HTTPStatusError as e:
return self._create_error_result(
Exception(f"ZAP service error: {e.response.status_code}")
)
except Exception as e:
return self._create_error_result(e)
def _zap_request(self, endpoint: str, params: Optional[Dict] = None) -> Dict:
"""Make a request to the ZAP API."""
if params is None:
params = {}
params['apikey'] = self.api_key
with httpx.Client(timeout=self.timeout) as client:
response = client.get(
f"{self.zap_host}{endpoint}",
params=params
)
response.raise_for_status()
return response.json()
def _access_url(self, url: str) -> None:
"""Access the URL to add it to ZAP's site tree."""
self.logger.debug(f"Accessing URL in ZAP: {url}")
self._zap_request(
'/JSON/core/action/accessUrl/',
{'url': url, 'followRedirects': 'true'}
)
time.sleep(2) # Wait for ZAP to process
def _spider_url(self, url: str) -> None:
"""Spider the URL to discover pages."""
self.logger.debug(f"Spidering URL: {url}")
# Start spider
result = self._zap_request(
'/JSON/spider/action/scan/',
{
'url': url,
'maxChildren': '5', # Limited crawl
'recurse': 'true',
'subtreeOnly': 'true'
}
)
scan_id = result.get('scan')
if not scan_id:
return
# Wait for spider to complete (with timeout)
start_time = time.time()
while time.time() - start_time < 60: # 60 second spider timeout
status = self._zap_request(
'/JSON/spider/view/status/',
{'scanId': scan_id}
)
if int(status.get('status', '100')) >= 100:
break
time.sleep(2)
def _active_scan(self, url: str) -> None:
"""Run active scan against the URL."""
self.logger.debug(f"Starting active scan: {url}")
# Start active scan
result = self._zap_request(
'/JSON/ascan/action/scan/',
{
'url': url,
'recurse': 'true',
'inScopeOnly': 'true'
}
)
scan_id = result.get('scan')
if not scan_id:
return
# Wait for scan to complete (with timeout)
start_time = time.time()
while time.time() - start_time < self.timeout:
status = self._zap_request(
'/JSON/ascan/view/status/',
{'scanId': scan_id}
)
if int(status.get('status', '100')) >= 100:
break
time.sleep(5)
def _get_alerts(self, url: str) -> List[Dict]:
"""Get alerts for the scanned URL."""
self.logger.debug(f"Fetching alerts for: {url}")
result = self._zap_request(
'/JSON/core/view/alerts/',
{
'baseurl': url,
'start': '0',
'count': '100' # Limit alerts
}
)
return result.get('alerts', [])
def _parse_results(self, url: str, alerts: List[Dict]) -> ScannerResult:
"""
Parse ZAP alerts into ScannerResult format.
Args:
url: The scanned URL
alerts: List of ZAP alerts
Returns:
Parsed ScannerResult
"""
issues = []
metrics = []
# Count alerts by risk level
risk_counts = {
'High': 0,
'Medium': 0,
'Low': 0,
'Informational': 0
}
for alert in alerts:
risk = alert.get('risk', 'Informational')
risk_counts[risk] = risk_counts.get(risk, 0) + 1
severity = self._map_risk_to_severity(risk)
issues.append(IssueData(
category='security',
severity=severity,
title=alert.get('name', 'Unknown vulnerability'),
description=self._format_description(alert),
tool='owasp_zap',
affected_url=alert.get('url', url),
remediation=alert.get('solution', 'Review and fix the vulnerability.'),
raw_data={
'alert_ref': alert.get('alertRef'),
'cweid': alert.get('cweid'),
'wascid': alert.get('wascid'),
'confidence': alert.get('confidence'),
'evidence': alert.get('evidence', '')[:500], # Truncate evidence
}
))
# Create metrics for vulnerability counts
for risk_level, count in risk_counts.items():
if count > 0:
metrics.append(MetricData(
name=f'zap_{risk_level.lower()}_alerts',
display_name=f'{risk_level} Risk Alerts',
value=float(count),
unit='count',
source='owasp_zap'
))
metrics.append(MetricData(
name='total_security_alerts',
display_name='Total Security Alerts',
value=float(len(alerts)),
unit='count',
source='owasp_zap'
))
self.logger.info(
f"ZAP scan complete: {len(alerts)} alerts "
f"(High: {risk_counts['High']}, Medium: {risk_counts['Medium']}, "
f"Low: {risk_counts['Low']})"
)
return ScannerResult(
scanner_name=self.name,
status=ScannerStatus.SUCCESS,
issues=issues,
metrics=metrics,
raw_data={
'total_alerts': len(alerts),
'risk_counts': risk_counts,
'alerts': alerts[:50] # Store limited raw alerts
}
)
def _map_risk_to_severity(self, risk: str) -> str:
"""Map ZAP risk level to our severity."""
mapping = {
'High': 'high',
'Medium': 'medium',
'Low': 'low',
'Informational': 'info',
}
return mapping.get(risk, 'info')
def _format_description(self, alert: Dict) -> str:
"""Format ZAP alert into readable description."""
parts = []
if alert.get('description'):
parts.append(alert['description'])
if alert.get('attack'):
parts.append(f"\nAttack: {alert['attack']}")
if alert.get('evidence'):
evidence = alert['evidence'][:200]
parts.append(f"\nEvidence: {evidence}")
if alert.get('reference'):
parts.append(f"\nReference: {alert['reference']}")
return '\n'.join(parts)
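For reference, a minimal sketch of the availability probe this scanner relies on, using the same /JSON/core/view/version/ endpoint and the ZAP_HOST/ZAP_API_KEY settings keys read in __init__:

# Illustrative sketch only.
import httpx
from django.conf import settings

cfg = settings.SCANNER_CONFIG
with httpx.Client(timeout=10) as client:
    resp = client.get(
        f"{cfg.get('ZAP_HOST', 'http://zap:8080')}/JSON/core/view/version/",
        params={'apikey': cfg.get('ZAP_API_KEY', '')},
    )
    print(resp.status_code, resp.json())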

306
backend/scanner/tasks.py Normal file
View File

@ -0,0 +1,306 @@
"""
Celery tasks for background scanning.
This module defines the Celery tasks that orchestrate website scans
in the background.
"""
import logging
from datetime import timedelta
from typing import Optional
from celery import shared_task
from celery.exceptions import SoftTimeLimitExceeded
from django.conf import settings
from django.utils import timezone
from websites.models import Website, Scan, ScanStatus, Issue, Metric
from scanner.scanners import ScanRunner
from scanner.utils import validate_url, get_domain_from_url
logger = logging.getLogger(__name__)
@shared_task(
bind=True,
max_retries=2,
default_retry_delay=60,
soft_time_limit=300,
time_limit=330,
)
def run_scan_task(self, scan_id: str) -> dict:
"""
Main Celery task for running a website scan.
This task:
1. Updates scan status to running
2. Orchestrates all scanners
3. Saves results to database
4. Handles errors and partial results
Args:
scan_id: UUID of the Scan record
Returns:
Dict with scan results summary
"""
logger.info(f"Starting scan task for scan_id: {scan_id}")
try:
# Get the scan record
scan = Scan.objects.select_related('website').get(id=scan_id)
except Scan.DoesNotExist:
logger.error(f"Scan {scan_id} not found")
return {'error': f'Scan {scan_id} not found'}
# Update status to running
scan.status = ScanStatus.RUNNING
scan.started_at = timezone.now()
scan.celery_task_id = self.request.id
scan.save(update_fields=['status', 'started_at', 'celery_task_id'])
url = scan.website.url
try:
# Run the scan pipeline
runner = ScanRunner()
results = runner.run(url)
# Save results to database
_save_scan_results(scan, results)
# Update website last_scanned_at
scan.website.last_scanned_at = timezone.now()
scan.website.save(update_fields=['last_scanned_at'])
logger.info(f"Scan {scan_id} completed successfully")
return {
'scan_id': str(scan_id),
'status': scan.status,
'overall_score': scan.overall_score,
'issues_count': scan.issues.count(),
'metrics_count': scan.metrics.count(),
}
except SoftTimeLimitExceeded:
logger.warning(f"Scan {scan_id} timed out")
scan.status = ScanStatus.PARTIAL
scan.error_message = "Scan timed out before completing all checks"
scan.completed_at = timezone.now()
scan.save(update_fields=['status', 'error_message', 'completed_at'])
return {
'scan_id': str(scan_id),
'status': 'partial',
'error': 'Scan timed out'
}
except Exception as e:
logger.exception(f"Scan {scan_id} failed with error: {e}")
scan.status = ScanStatus.FAILED
scan.error_message = str(e)
scan.completed_at = timezone.now()
scan.save(update_fields=['status', 'error_message', 'completed_at'])
# Retry on certain errors
if self.request.retries < self.max_retries:
raise self.retry(exc=e)
return {
'scan_id': str(scan_id),
'status': 'failed',
'error': str(e)
}
def _save_scan_results(scan: Scan, results: dict) -> None:
"""
Save scan results to the database.
Args:
scan: The Scan model instance
results: Aggregated results from ScanRunner
"""
# Update scan status
status_map = {
'done': ScanStatus.DONE,
'partial': ScanStatus.PARTIAL,
'failed': ScanStatus.FAILED,
}
scan.status = status_map.get(results['status'], ScanStatus.DONE)
scan.completed_at = timezone.now()
# Save scores
scores = results.get('scores', {})
scan.performance_score = scores.get('performance')
scan.accessibility_score = scores.get('accessibility')
scan.seo_score = scores.get('seo')
scan.best_practices_score = scores.get('best_practices')
# Save raw data
raw_data = results.get('raw_data', {})
scan.raw_lighthouse_data = raw_data.get('lighthouse')
scan.raw_zap_data = raw_data.get('owasp_zap')
scan.raw_playwright_data = raw_data.get('playwright')
scan.raw_headers_data = raw_data.get('header_check')
# Save errors if any
if results.get('errors'):
scan.error_message = '\n'.join(
f"{e['scanner']}: {e['error']}"
for e in results['errors']
)
scan.save()
# Create Issue records
issues_to_create = []
for issue_data in results.get('issues', []):
issues_to_create.append(Issue(
scan=scan,
category=issue_data['category'],
severity=issue_data['severity'],
title=issue_data['title'][:500], # Truncate if too long
description=issue_data['description'],
tool=issue_data['tool'],
affected_url=issue_data.get('affected_url'),
remediation=issue_data.get('remediation'),
raw_data=issue_data.get('raw_data'),
))
if issues_to_create:
Issue.objects.bulk_create(issues_to_create)
# Create Metric records
metrics_to_create = []
seen_metrics = set() # Track unique metrics
for metric_data in results.get('metrics', []):
metric_key = metric_data['name']
if metric_key in seen_metrics:
continue # Skip duplicates
seen_metrics.add(metric_key)
# Map unit strings to model choices
unit_map = {
'ms': 'ms',
'milliseconds': 'ms',
's': 's',
'seconds': 's',
'bytes': 'bytes',
'kb': 'kb',
'kilobytes': 'kb',
'mb': 'mb',
'megabytes': 'mb',
'score': 'score',
'percent': 'percent',
'count': 'count',
}
unit = unit_map.get(metric_data['unit'].lower(), 'count')
metrics_to_create.append(Metric(
scan=scan,
name=metric_data['name'],
display_name=metric_data['display_name'][:200],
value=metric_data['value'],
unit=unit,
source=metric_data['source'],
score=metric_data.get('score'),
))
if metrics_to_create:
Metric.objects.bulk_create(metrics_to_create)
# Calculate security score based on issues
scan.calculate_security_score()
# Calculate overall score
scan.calculate_overall_score()
scan.save(update_fields=['security_score', 'overall_score'])
logger.info(
f"Saved scan results: {len(issues_to_create)} issues, "
f"{len(metrics_to_create)} metrics"
)
@shared_task
def cleanup_old_scans(days: int = 30) -> dict:
"""
Clean up old scan data to prevent database growth.
Args:
days: Number of days to keep scans
Returns:
Dict with cleanup statistics
"""
cutoff_date = timezone.now() - timedelta(days=days)
# Delete old scans (cascades to issues and metrics)
deleted_count, _ = Scan.objects.filter(
created_at__lt=cutoff_date
).delete()
logger.info(f"Cleaned up {deleted_count} old scans")
return {
'deleted_scans': deleted_count,
'cutoff_date': cutoff_date.isoformat(),
}
def check_rate_limit(url: str) -> Optional[str]:
"""
Check if URL scanning is rate limited.
Args:
url: The URL to check
Returns:
Error message if rate limited, None otherwise
"""
from django.core.cache import cache
scanner_config = settings.SCANNER_CONFIG
rate_limit_minutes = scanner_config.get('SCAN_RATE_LIMIT_MINUTES', 5)
# Create a cache key based on the URL
domain = get_domain_from_url(url)
cache_key = f"scan_rate_limit:{domain}"
# Check if already scanned recently
last_scan_time = cache.get(cache_key)
if last_scan_time:
return (
f"This URL was scanned recently. "
f"Please wait {rate_limit_minutes} minutes between scans."
)
# Set the rate limit
cache.set(cache_key, timezone.now().isoformat(), timeout=rate_limit_minutes * 60)
return None
def check_concurrent_scan_limit() -> Optional[str]:
"""
Check if maximum concurrent scans limit is reached.
Returns:
Error message if limit reached, None otherwise
"""
scanner_config = settings.SCANNER_CONFIG
max_concurrent = scanner_config.get('MAX_CONCURRENT_SCANS', 3)
running_count = Scan.objects.filter(status=ScanStatus.RUNNING).count()
if running_count >= max_concurrent:
return (
f"Maximum concurrent scans ({max_concurrent}) reached. "
"Please wait for current scans to complete."
)
return None
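A minimal sketch of how a view might combine the two guard helpers above before enqueueing the task; the start_scan helper itself is hypothetical and not part of this commit.

# Illustrative sketch only.
from scanner.tasks import run_scan_task, check_rate_limit, check_concurrent_scan_limit
from websites.models import Website, Scan

def start_scan(url: str):
    error = check_rate_limit(url) or check_concurrent_scan_limit()
    if error:
        return None, error
    website, _ = Website.objects.get_or_create(url=url)
    scan = Scan.objects.create(website=website)   # status defaults to 'pending'
    run_scan_task.delay(str(scan.id))
    return scan, None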

185
backend/scanner/utils.py Normal file
View File

@ -0,0 +1,185 @@
"""
URL validation and safety utilities.
This module provides functions for validating and normalizing URLs,
including safety checks to prevent SSRF attacks.
"""
import ipaddress
import logging
import socket
from typing import Tuple
from urllib.parse import urlparse, urlunparse
import validators
from django.conf import settings
logger = logging.getLogger(__name__)
def validate_url(url: str) -> Tuple[bool, str]:
"""
Validate and normalize a URL for scanning.
Args:
url: The URL to validate
Returns:
Tuple of (is_valid, normalized_url_or_error_message)
"""
if not url:
return False, "URL is required"
# Basic URL validation
if not validators.url(url):
return False, "Invalid URL format"
# Parse the URL
try:
parsed = urlparse(url)
except Exception as e:
return False, f"Could not parse URL: {e}"
# Check scheme
if parsed.scheme not in ('http', 'https'):
return False, "URL must use http or https scheme"
# Check hostname
hostname = parsed.netloc.split(':')[0].lower()
if not hostname:
return False, "URL must have a valid hostname"
# Safety check: block localhost and private IPs
is_safe, safety_error = check_url_safety(hostname)
if not is_safe:
return False, safety_error
# Normalize URL
normalized = normalize_url(url)
return True, normalized
def normalize_url(url: str) -> str:
"""
Normalize a URL to a canonical form.
- Lowercase hostname
- Remove default ports
- Remove trailing slashes from path (except for the root path)
- Strip the fragment
Args:
url: The URL to normalize
Returns:
Normalized URL string
"""
parsed = urlparse(url)
# Lowercase hostname
hostname = parsed.netloc.lower()
# Remove default ports (match only a trailing ':80'/':443' so ports like
# 8080 or 4430 are left untouched)
if parsed.scheme == 'http' and hostname.endswith(':80'):
hostname = hostname[:-len(':80')]
elif parsed.scheme == 'https' and hostname.endswith(':443'):
hostname = hostname[:-len(':443')]
# Normalize path (remove trailing slash except for root)
path = parsed.path
if path != '/' and path.endswith('/'):
path = path.rstrip('/')
if not path:
path = '/'
# Reconstruct URL
normalized = urlunparse((
parsed.scheme,
hostname,
path,
parsed.params,
parsed.query,
'' # Remove fragment
))
return normalized
def check_url_safety(hostname: str) -> Tuple[bool, str]:
"""
Check if a hostname is safe to scan (not localhost/private IP).
Args:
hostname: The hostname to check
Returns:
Tuple of (is_safe, error_message_if_not_safe)
"""
scanner_config = settings.SCANNER_CONFIG
blocked_hosts = scanner_config.get('BLOCKED_HOSTS', [])
blocked_ranges = scanner_config.get('BLOCKED_IP_RANGES', [])
# Check blocked hostnames
if hostname in blocked_hosts:
return False, f"Scanning {hostname} is not allowed"
# Try to resolve hostname to IP
try:
ip_addresses = socket.getaddrinfo(
hostname, None, socket.AF_UNSPEC, socket.SOCK_STREAM
)
except socket.gaierror:
# Could not resolve - might be okay for some hostnames
logger.warning(f"Could not resolve hostname: {hostname}")
return True, ""
for family, type_, proto, canonname, sockaddr in ip_addresses:
ip_str = sockaddr[0]
try:
ip = ipaddress.ip_address(ip_str)
# Check if IP is in any blocked range
for blocked_range in blocked_ranges:
try:
network = ipaddress.ip_network(blocked_range, strict=False)
if ip in network:
return False, f"Scanning private/local IP addresses is not allowed ({ip_str})"
except ValueError:
continue
# Additional checks
if ip.is_private:
return False, f"Scanning private IP addresses is not allowed ({ip_str})"
if ip.is_loopback:
return False, f"Scanning localhost/loopback addresses is not allowed ({ip_str})"
if ip.is_link_local:
return False, f"Scanning link-local addresses is not allowed ({ip_str})"
if ip.is_reserved:
return False, f"Scanning reserved IP addresses is not allowed ({ip_str})"
except ValueError:
# Not a valid IP address format
continue
return True, ""
def get_domain_from_url(url: str) -> str:
"""
Extract the domain from a URL.
Args:
url: The URL to extract domain from
Returns:
The domain/hostname
"""
parsed = urlparse(url)
return parsed.netloc.split(':')[0].lower()
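A few illustrative calls, assuming the usual blocked ranges in SCANNER_CONFIG; note that validate_url performs a live DNS lookup as part of the safety check, so exact results depend on resolution.

# Illustrative sketch only.
ok, result = validate_url('https://Example.com:443/docs/')
# ok is True, result == 'https://example.com/docs'

ok, result = validate_url('http://127.0.0.1/admin')
# ok is False, result explains that loopback/private addresses are blocked

print(normalize_url('http://Example.com:80/a/b/#section'))
# -> 'http://example.com/a/b'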

View File

@ -0,0 +1,89 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{% block title %}Website Analyzer{% endblock %}</title>
<!-- Tailwind CSS -->
<script src="https://cdn.tailwindcss.com"></script>
<!-- Alpine.js for interactivity -->
<script defer src="https://cdn.jsdelivr.net/npm/alpinejs@3.x.x/dist/cdn.min.js"></script>
<!-- Chart.js for visualizations -->
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
<style>
[x-cloak] { display: none !important; }
/* Custom animations */
@keyframes pulse-slow {
0%, 100% { opacity: 1; }
50% { opacity: 0.5; }
}
.animate-pulse-slow {
animation: pulse-slow 2s cubic-bezier(0.4, 0, 0.6, 1) infinite;
}
/* Score circle gradient */
.score-circle {
background: conic-gradient(
var(--score-color) calc(var(--score) * 3.6deg),
#e5e7eb calc(var(--score) * 3.6deg)
);
}
</style>
{% block extra_head %}{% endblock %}
</head>
<body class="bg-gray-50 min-h-screen">
<!-- Navigation -->
<nav class="bg-white shadow-sm border-b border-gray-200">
<div class="max-w-7xl mx-auto px-4 sm:px-6 lg:px-8">
<div class="flex justify-between h-16">
<div class="flex items-center">
<a href="/" class="flex items-center space-x-2">
<svg class="w-8 h-8 text-blue-600" fill="none" stroke="currentColor" viewBox="0 0 24 24">
<path stroke-linecap="round" stroke-linejoin="round" stroke-width="2"
d="M9 12l2 2 4-4m5.618-4.016A11.955 11.955 0 0112 2.944a11.955 11.955 0 01-8.618 3.04A12.02 12.02 0 003 9c0 5.591 3.824 10.29 9 11.622 5.176-1.332 9-6.03 9-11.622 0-1.042-.133-2.052-.382-3.016z"/>
</svg>
<span class="font-bold text-xl text-gray-900">Website Analyzer</span>
</a>
</div>
<div class="flex items-center space-x-4">
<a href="/" class="text-gray-600 hover:text-gray-900 px-3 py-2 rounded-md text-sm font-medium">
New Scan
</a>
<a href="/api/" class="text-gray-600 hover:text-gray-900 px-3 py-2 rounded-md text-sm font-medium">
API
</a>
</div>
</div>
</div>
</nav>
<!-- Main Content -->
<main class="max-w-7xl mx-auto px-4 sm:px-6 lg:px-8 py-8">
{% block content %}{% endblock %}
</main>
<!-- Footer -->
<footer class="bg-white border-t border-gray-200 mt-auto">
<div class="max-w-7xl mx-auto px-4 sm:px-6 lg:px-8 py-6">
<div class="flex justify-between items-center">
<p class="text-gray-500 text-sm">
Website Analyzer - Security & Performance Scanner
</p>
<div class="flex space-x-4">
<a href="/api/health/" class="text-gray-400 hover:text-gray-600 text-sm">
Health Check
</a>
</div>
</div>
</div>
</footer>
{% block extra_js %}{% endblock %}
</body>
</html>

View File

@ -0,0 +1,5 @@
"""
Websites app initialization.
"""
default_app_config = 'websites.apps.WebsitesConfig'

93
backend/websites/admin.py Normal file
View File

@ -0,0 +1,93 @@
"""
Django admin configuration for Website Analyzer models.
"""
from django.contrib import admin
from .models import Website, Scan, Issue, Metric
@admin.register(Website)
class WebsiteAdmin(admin.ModelAdmin):
list_display = ('url', 'domain', 'created_at', 'last_scanned_at')
list_filter = ('created_at', 'last_scanned_at')
search_fields = ('url', 'domain')
readonly_fields = ('id', 'created_at', 'domain')
ordering = ('-created_at',)
class IssueInline(admin.TabularInline):
model = Issue
extra = 0
readonly_fields = ('id', 'category', 'severity', 'tool', 'title', 'created_at')
can_delete = False
show_change_link = True
max_num = 10
class MetricInline(admin.TabularInline):
model = Metric
extra = 0
readonly_fields = ('id', 'name', 'display_name', 'value', 'unit', 'source', 'score')
can_delete = False
max_num = 15
@admin.register(Scan)
class ScanAdmin(admin.ModelAdmin):
list_display = (
'id', 'website', 'status', 'overall_score',
'performance_score', 'security_score', 'created_at'
)
list_filter = ('status', 'created_at')
search_fields = ('website__url', 'website__domain')
readonly_fields = (
'id', 'created_at', 'started_at', 'completed_at',
'celery_task_id', 'raw_lighthouse_data', 'raw_zap_data',
'raw_playwright_data', 'raw_headers_data'
)
inlines = [IssueInline, MetricInline]
ordering = ('-created_at',)
fieldsets = (
('Basic Info', {
'fields': ('id', 'website', 'status', 'celery_task_id')
}),
('Timestamps', {
'fields': ('created_at', 'started_at', 'completed_at')
}),
('Scores', {
'fields': (
'overall_score', 'performance_score', 'accessibility_score',
'seo_score', 'best_practices_score', 'security_score'
)
}),
('Errors', {
'fields': ('error_message',),
'classes': ('collapse',)
}),
('Raw Data', {
'fields': (
'raw_lighthouse_data', 'raw_zap_data',
'raw_playwright_data', 'raw_headers_data'
),
'classes': ('collapse',)
}),
)
@admin.register(Issue)
class IssueAdmin(admin.ModelAdmin):
list_display = ('title', 'scan', 'category', 'severity', 'tool', 'created_at')
list_filter = ('category', 'severity', 'tool', 'created_at')
search_fields = ('title', 'description', 'scan__website__url')
readonly_fields = ('id', 'created_at', 'raw_data')
ordering = ('severity', '-created_at')
@admin.register(Metric)
class MetricAdmin(admin.ModelAdmin):
list_display = ('display_name', 'scan', 'value', 'unit', 'source', 'score')
list_filter = ('source', 'unit')
search_fields = ('name', 'display_name', 'scan__website__url')
readonly_fields = ('id', 'created_at')
ordering = ('name',)

11
backend/websites/apps.py Normal file
View File

@ -0,0 +1,11 @@
"""
Websites app configuration.
"""
from django.apps import AppConfig
class WebsitesConfig(AppConfig):
default_auto_field = 'django.db.models.BigAutoField'
name = 'websites'
verbose_name = 'Website Scanner'

493
backend/websites/models.py Normal file
View File

@ -0,0 +1,493 @@
"""
Database models for Website Analyzer.
This module defines the core data models for storing websites, scans,
issues, and metrics from various scanning tools.
"""
import uuid
from django.db import models
from django.utils import timezone
from django.core.validators import URLValidator
class Website(models.Model):
"""
Represents a website that has been scanned.
Each unique URL gets one Website record, which can have multiple
Scan records associated with it.
"""
id = models.UUIDField(
primary_key=True,
default=uuid.uuid4,
editable=False,
help_text="Unique identifier for the website"
)
url = models.URLField(
max_length=2048,
unique=True,
validators=[URLValidator(schemes=['http', 'https'])],
help_text="The normalized URL of the website"
)
domain = models.CharField(
max_length=255,
db_index=True,
help_text="The domain extracted from the URL"
)
created_at = models.DateTimeField(
auto_now_add=True,
help_text="When the website was first added"
)
last_scanned_at = models.DateTimeField(
null=True,
blank=True,
help_text="When the website was last scanned"
)
class Meta:
db_table = 'websites'
ordering = ['-created_at']
indexes = [
models.Index(fields=['domain']),
models.Index(fields=['-last_scanned_at']),
]
def __str__(self):
return self.url
def save(self, *args, **kwargs):
"""Extract domain from URL before saving."""
if self.url:
from urllib.parse import urlparse
parsed = urlparse(self.url)
self.domain = parsed.netloc.lower()
super().save(*args, **kwargs)
class ScanStatus(models.TextChoices):
"""Enumeration of possible scan statuses."""
PENDING = 'pending', 'Pending'
RUNNING = 'running', 'Running'
DONE = 'done', 'Completed'
FAILED = 'failed', 'Failed'
PARTIAL = 'partial', 'Partially Completed'
class Scan(models.Model):
"""
Represents a single scan of a website.
Contains aggregated scores from various scanning tools and
links to detailed issues and metrics.
"""
id = models.UUIDField(
primary_key=True,
default=uuid.uuid4,
editable=False,
help_text="Unique identifier for the scan"
)
website = models.ForeignKey(
Website,
on_delete=models.CASCADE,
related_name='scans',
help_text="The website that was scanned"
)
status = models.CharField(
max_length=20,
choices=ScanStatus.choices,
default=ScanStatus.PENDING,
db_index=True,
help_text="Current status of the scan"
)
# Celery task tracking
celery_task_id = models.CharField(
max_length=255,
null=True,
blank=True,
help_text="Celery task ID for tracking"
)
# Timestamps
created_at = models.DateTimeField(
auto_now_add=True,
help_text="When the scan was created"
)
started_at = models.DateTimeField(
null=True,
blank=True,
help_text="When the scan started running"
)
completed_at = models.DateTimeField(
null=True,
blank=True,
help_text="When the scan completed"
)
# Aggregated scores (0-100)
performance_score = models.IntegerField(
null=True,
blank=True,
help_text="Lighthouse performance score (0-100)"
)
accessibility_score = models.IntegerField(
null=True,
blank=True,
help_text="Lighthouse accessibility score (0-100)"
)
seo_score = models.IntegerField(
null=True,
blank=True,
help_text="Lighthouse SEO score (0-100)"
)
best_practices_score = models.IntegerField(
null=True,
blank=True,
help_text="Lighthouse best practices score (0-100)"
)
security_score = models.IntegerField(
null=True,
blank=True,
help_text="Computed security score based on issues (0-100)"
)
# Overall health score (computed average)
overall_score = models.IntegerField(
null=True,
blank=True,
help_text="Overall health score (0-100)"
)
# Error tracking
error_message = models.TextField(
null=True,
blank=True,
help_text="Error message if scan failed"
)
# Raw data from scanners
raw_lighthouse_data = models.JSONField(
null=True,
blank=True,
help_text="Raw Lighthouse report data"
)
raw_zap_data = models.JSONField(
null=True,
blank=True,
help_text="Raw OWASP ZAP report data"
)
raw_playwright_data = models.JSONField(
null=True,
blank=True,
help_text="Raw Playwright analysis data"
)
raw_headers_data = models.JSONField(
null=True,
blank=True,
help_text="Raw HTTP headers analysis data"
)
class Meta:
db_table = 'scans'
ordering = ['-created_at']
indexes = [
models.Index(fields=['status']),
models.Index(fields=['-created_at']),
models.Index(fields=['website', '-created_at']),
]
def __str__(self):
return f"Scan {self.id} - {self.website.url} ({self.status})"
def calculate_overall_score(self):
"""
Calculate overall health score as weighted average of all scores.
Weights:
- Performance: 25%
- Security: 30%
- Accessibility: 15%
- SEO: 15%
- Best Practices: 15%
"""
scores = [
(self.performance_score, 0.25),
(self.security_score, 0.30),
(self.accessibility_score, 0.15),
(self.seo_score, 0.15),
(self.best_practices_score, 0.15),
]
total_weight = 0
weighted_sum = 0
for score, weight in scores:
if score is not None:
weighted_sum += score * weight
total_weight += weight
if total_weight > 0:
self.overall_score = round(weighted_sum / total_weight)
else:
self.overall_score = None
return self.overall_score
def calculate_security_score(self):
"""
Calculate security score based on security issues found.
Starts at 100 and deducts points based on issue severity:
- Critical: -25 points each
- High: -15 points each
- Medium: -8 points each
- Low: -3 points each
- Info: -1 point each
"""
deductions = {
'critical': 25,
'high': 15,
'medium': 8,
'low': 3,
'info': 1,
}
score = 100
security_issues = self.issues.filter(
category__in=['security', 'headers', 'tls', 'cors']
)
for issue in security_issues:
score -= deductions.get(issue.severity, 0)
self.security_score = max(0, score)
return self.security_score
class IssueCategory(models.TextChoices):
"""Categories of issues that can be detected."""
PERFORMANCE = 'performance', 'Performance'
SECURITY = 'security', 'Security'
HEADERS = 'headers', 'HTTP Headers'
TLS = 'tls', 'TLS/SSL'
CORS = 'cors', 'CORS'
ACCESSIBILITY = 'accessibility', 'Accessibility'
SEO = 'seo', 'SEO'
BEST_PRACTICES = 'best_practices', 'Best Practices'
CONTENT = 'content', 'Content'
RESOURCES = 'resources', 'Resources'
class IssueSeverity(models.TextChoices):
"""Severity levels for issues."""
CRITICAL = 'critical', 'Critical'
HIGH = 'high', 'High'
MEDIUM = 'medium', 'Medium'
LOW = 'low', 'Low'
INFO = 'info', 'Informational'
class ScannerTool(models.TextChoices):
"""Scanner tools that can detect issues."""
LIGHTHOUSE = 'lighthouse', 'Google Lighthouse'
ZAP = 'owasp_zap', 'OWASP ZAP'
PLAYWRIGHT = 'playwright', 'Playwright'
HEADER_CHECK = 'header_check', 'HTTP Header Check'
TLS_CHECK = 'tls_check', 'TLS/SSL Check'
class Issue(models.Model):
"""
Represents a specific issue found during a scan.
Issues are categorized by type, severity, and the tool that detected them.
Each issue includes a description and suggested remediation.
"""
id = models.UUIDField(
primary_key=True,
default=uuid.uuid4,
editable=False
)
scan = models.ForeignKey(
Scan,
on_delete=models.CASCADE,
related_name='issues',
help_text="The scan that found this issue"
)
# Classification
category = models.CharField(
max_length=30,
choices=IssueCategory.choices,
db_index=True,
help_text="Category of the issue"
)
severity = models.CharField(
max_length=20,
choices=IssueSeverity.choices,
db_index=True,
help_text="Severity level of the issue"
)
tool = models.CharField(
max_length=30,
choices=ScannerTool.choices,
help_text="Tool that detected this issue"
)
# Issue details
title = models.CharField(
max_length=500,
help_text="Brief title of the issue"
)
description = models.TextField(
help_text="Detailed description of the issue"
)
affected_url = models.URLField(
max_length=2048,
null=True,
blank=True,
help_text="Specific URL affected by this issue"
)
remediation = models.TextField(
null=True,
blank=True,
help_text="Suggested fix or remediation"
)
# Additional data from scanner
raw_data = models.JSONField(
null=True,
blank=True,
help_text="Raw data from the scanner for this issue"
)
# Timestamps
created_at = models.DateTimeField(
auto_now_add=True
)
class Meta:
db_table = 'issues'
ordering = ['severity', '-created_at']
indexes = [
models.Index(fields=['scan', 'category']),
models.Index(fields=['scan', 'severity']),
models.Index(fields=['tool']),
]
def __str__(self):
return f"[{self.severity}] {self.title}"
class MetricUnit(models.TextChoices):
"""Units of measurement for metrics."""
MILLISECONDS = 'ms', 'Milliseconds'
SECONDS = 's', 'Seconds'
BYTES = 'bytes', 'Bytes'
KILOBYTES = 'kb', 'Kilobytes'
MEGABYTES = 'mb', 'Megabytes'
SCORE = 'score', 'Score (0-1)'
PERCENT = 'percent', 'Percentage'
COUNT = 'count', 'Count'
class Metric(models.Model):
"""
Represents a specific metric measured during a scan.
Metrics are numerical values with units, such as page load time,
total byte weight, number of requests, etc.
"""
id = models.UUIDField(
primary_key=True,
default=uuid.uuid4,
editable=False
)
scan = models.ForeignKey(
Scan,
on_delete=models.CASCADE,
related_name='metrics',
help_text="The scan that measured this metric"
)
# Metric identification
name = models.CharField(
max_length=100,
db_index=True,
help_text="Name of the metric (e.g., 'first_contentful_paint_ms')"
)
display_name = models.CharField(
max_length=200,
help_text="Human-readable name for display"
)
# Value
value = models.FloatField(
help_text="Numeric value of the metric"
)
unit = models.CharField(
max_length=20,
choices=MetricUnit.choices,
help_text="Unit of measurement"
)
# Source
source = models.CharField(
max_length=30,
choices=ScannerTool.choices,
help_text="Tool that provided this metric"
)
# Score (if applicable)
score = models.FloatField(
null=True,
blank=True,
help_text="Lighthouse score for this metric (0-1)"
)
# Timestamp
created_at = models.DateTimeField(
auto_now_add=True
)
class Meta:
db_table = 'metrics'
ordering = ['name']
indexes = [
models.Index(fields=['scan', 'name']),
models.Index(fields=['source']),
]
# Ensure unique metric names per scan
constraints = [
models.UniqueConstraint(
fields=['scan', 'name'],
name='unique_metric_per_scan'
)
]
def __str__(self):
return f"{self.display_name}: {self.value} {self.unit}"
def get_formatted_value(self):
"""Return a formatted string representation of the value."""
if self.unit == MetricUnit.MILLISECONDS:
if self.value >= 1000:
return f"{self.value / 1000:.2f}s"
return f"{self.value:.0f}ms"
elif self.unit == MetricUnit.BYTES:
if self.value >= 1024 * 1024:
return f"{self.value / (1024 * 1024):.2f} MB"
elif self.value >= 1024:
return f"{self.value / 1024:.1f} KB"
return f"{self.value:.0f} bytes"
elif self.unit == MetricUnit.PERCENT:
return f"{self.value:.1f}%"
elif self.unit == MetricUnit.SCORE:
return f"{self.value:.3f}"
else:
return f"{self.value:.2f} {self.get_unit_display()}"

160
docker-compose.yml Normal file
View File

@ -0,0 +1,160 @@
# Website Analyzer - Docker Compose Configuration
# This file orchestrates all services required for the application
version: '3.9'
services:
# ==========================================================================
# PostgreSQL Database
# ==========================================================================
db:
image: postgres:16-alpine
container_name: analyzer_db
restart: unless-stopped
environment:
POSTGRES_USER: analyzer
POSTGRES_PASSWORD: analyzer_password
POSTGRES_DB: website_analyzer
volumes:
- postgres_data:/var/lib/postgresql/data
ports:
- "5432:5432"
healthcheck:
test: ["CMD-SHELL", "pg_isready -U analyzer -d website_analyzer"]
interval: 10s
timeout: 5s
retries: 5
# ==========================================================================
# Redis - Message Broker & Cache
# ==========================================================================
redis:
image: redis:7-alpine
container_name: analyzer_redis
restart: unless-stopped
ports:
- "6379:6379"
volumes:
- redis_data:/data
healthcheck:
test: ["CMD", "redis-cli", "ping"]
interval: 10s
timeout: 5s
retries: 5
# ==========================================================================
# Django Web Application
# ==========================================================================
web:
build:
context: ./backend
dockerfile: Dockerfile
container_name: analyzer_web
restart: unless-stopped
command: >
sh -c "python manage.py migrate &&
python manage.py collectstatic --noinput &&
gunicorn core.wsgi:application --bind 0.0.0.0:8000 --workers 4 --threads 2"
volumes:
- ./backend:/app
- static_volume:/app/staticfiles
ports:
- "8000:8000"
env_file:
- ./backend/.env
depends_on:
db:
condition: service_healthy
redis:
condition: service_healthy
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8000/api/health/"]
interval: 30s
timeout: 10s
retries: 3
# ==========================================================================
# Celery Worker - Background Task Processing
# ==========================================================================
celery_worker:
build:
context: ./backend
dockerfile: Dockerfile
container_name: analyzer_celery_worker
restart: unless-stopped
command: celery -A core worker -l INFO --concurrency=2
volumes:
- ./backend:/app
env_file:
- ./backend/.env
depends_on:
- db
- redis
- web
# ==========================================================================
# Celery Beat - Scheduled Tasks (Optional)
# ==========================================================================
celery_beat:
build:
context: ./backend
dockerfile: Dockerfile
container_name: analyzer_celery_beat
restart: unless-stopped
command: celery -A core beat -l INFO
volumes:
- ./backend:/app
env_file:
- ./backend/.env
depends_on:
- db
- redis
- celery_worker
# ==========================================================================
# OWASP ZAP - Security Scanner
# ==========================================================================
zap:
image: ghcr.io/zaproxy/zaproxy:stable
container_name: analyzer_zap
restart: unless-stopped
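    # The api.key passed below must match ZAP_API_KEY in backend/.env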
command: zap.sh -daemon -host 0.0.0.0 -port 8080 -config api.key=zap-api-key-change-me -config api.addrs.addr.name=.* -config api.addrs.addr.regex=true
ports:
- "8081:8080"
volumes:
- zap_data:/home/zap/.ZAP
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8080/JSON/core/view/version/?apikey=zap-api-key-change-me"]
interval: 30s
timeout: 10s
retries: 5
# ==========================================================================
# Lighthouse Scanner Service (Node.js)
# ==========================================================================
lighthouse:
build:
context: ./lighthouse
dockerfile: Dockerfile
container_name: analyzer_lighthouse
restart: unless-stopped
ports:
- "3001:3001"
volumes:
- lighthouse_reports:/app/reports
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:3001/health"]
interval: 30s
timeout: 10s
retries: 3
volumes:
postgres_data:
redis_data:
static_volume:
zap_data:
lighthouse_reports:
networks:
default:
name: analyzer_network

54
lighthouse/Dockerfile Normal file
View File

@ -0,0 +1,54 @@
# Lighthouse Scanner Service - Dockerfile
# Node.js service that runs Lighthouse CLI and provides HTTP API
FROM node:20-slim
# Install Chrome dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
chromium \
fonts-liberation \
    libayatana-appindicator3-1 \
libasound2 \
libatk-bridge2.0-0 \
libatk1.0-0 \
libcups2 \
libdbus-1-3 \
libdrm2 \
libgbm1 \
libgtk-3-0 \
libnspr4 \
libnss3 \
libxcomposite1 \
libxdamage1 \
libxfixes3 \
libxkbcommon0 \
libxrandr2 \
xdg-utils \
curl \
&& rm -rf /var/lib/apt/lists/*
# Set Chrome path for Lighthouse
ENV CHROME_PATH=/usr/bin/chromium
WORKDIR /app
# Copy package files
COPY package*.json ./
# Install dependencies
RUN npm ci --omit=dev
# Copy application code
COPY . .
# Create reports directory
RUN mkdir -p reports
# Create non-root user
RUN useradd -m -u 1000 lighthouse && \
chown -R lighthouse:lighthouse /app
USER lighthouse
EXPOSE 3001
CMD ["node", "server.js"]

19
lighthouse/package.json Normal file
View File

@ -0,0 +1,19 @@
{
"name": "lighthouse-scanner",
"version": "1.0.0",
"description": "Lighthouse scanner service for Website Analyzer",
"main": "server.js",
"scripts": {
"start": "node server.js",
"dev": "node --watch server.js"
},
"dependencies": {
"express": "^4.18.2",
"lighthouse": "^11.4.0",
"chrome-launcher": "^1.1.0",
"uuid": "^9.0.0"
},
"engines": {
"node": ">=18.0.0"
}
}

328
lighthouse/server.js Normal file
View File

@ -0,0 +1,328 @@
/**
* Lighthouse Scanner Service
*
* This service provides an HTTP API for running Lighthouse audits.
* It's designed to be called from the Django backend via Celery tasks.
*/
// lighthouse (v10+) and chrome-launcher (v1+) are published as ES modules,
// so this service uses ESM imports ("type": "module" is set in package.json)
import express from 'express';
import lighthouse from 'lighthouse';
import * as chromeLauncher from 'chrome-launcher';
import { v4 as uuidv4 } from 'uuid';
import { promises as fs } from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';
// __dirname is not defined in ES modules; derive it from import.meta.url
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const app = express();
app.use(express.json());
const PORT = process.env.PORT || 3001;
const REPORTS_DIR = path.join(__dirname, 'reports');
// Ensure reports directory exists
fs.mkdir(REPORTS_DIR, { recursive: true }).catch(console.error);
/**
* Health check endpoint
*/
app.get('/health', (req, res) => {
res.json({ status: 'healthy', service: 'lighthouse-scanner' });
});
/**
* Run Lighthouse audit for a given URL
*
* POST /scan
* Body: { "url": "https://example.com" }
*
* Returns: Lighthouse audit results as JSON
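 *
 * Example request (hypothetical host/port; the Celery task posts the same
 * JSON body from the Django backend):
 *   curl -X POST http://localhost:3001/scan \
 *     -H "Content-Type: application/json" \
 *     -d '{"url": "https://example.com"}'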
*/
app.post('/scan', async (req, res) => {
const { url } = req.body;
if (!url) {
return res.status(400).json({ error: 'URL is required' });
}
// Validate URL format
try {
new URL(url);
} catch (e) {
return res.status(400).json({ error: 'Invalid URL format' });
}
const scanId = uuidv4();
console.log(`[${scanId}] Starting Lighthouse scan for: ${url}`);
let chrome = null;
try {
// Launch Chrome
chrome = await chromeLauncher.launch({
chromeFlags: [
'--headless',
'--disable-gpu',
'--no-sandbox',
'--disable-dev-shm-usage',
'--disable-extensions',
'--disable-background-networking',
'--disable-sync',
'--disable-translate',
'--metrics-recording-only',
'--mute-audio',
'--no-first-run',
'--safebrowsing-disable-auto-update'
]
});
console.log(`[${scanId}] Chrome launched on port ${chrome.port}`);
// Lighthouse configuration
const options = {
logLevel: 'error',
output: 'json',
port: chrome.port,
onlyCategories: ['performance', 'accessibility', 'best-practices', 'seo'],
// Throttling settings for more realistic results
throttling: {
cpuSlowdownMultiplier: 4,
downloadThroughputKbps: 1638.4,
uploadThroughputKbps: 675,
rttMs: 150
},
screenEmulation: {
mobile: false,
width: 1920,
height: 1080,
deviceScaleFactor: 1,
disabled: false
},
formFactor: 'desktop'
};
// Run Lighthouse
const runnerResult = await lighthouse(url, options);
// Extract the report
const report = runnerResult.lhr;
// Process and extract key metrics
const result = {
scanId,
      // Lighthouse 10+ reports finalDisplayedUrl; fall back for older LHR shapes
      url: report.finalDisplayedUrl || report.finalUrl || url,
fetchTime: report.fetchTime,
// Category scores (0-100)
scores: {
performance: Math.round((report.categories.performance?.score || 0) * 100),
accessibility: Math.round((report.categories.accessibility?.score || 0) * 100),
bestPractices: Math.round((report.categories['best-practices']?.score || 0) * 100),
seo: Math.round((report.categories.seo?.score || 0) * 100)
},
// Core Web Vitals and key metrics
metrics: {
firstContentfulPaint: {
value: report.audits['first-contentful-paint']?.numericValue || null,
unit: 'ms',
score: report.audits['first-contentful-paint']?.score || null
},
largestContentfulPaint: {
value: report.audits['largest-contentful-paint']?.numericValue || null,
unit: 'ms',
score: report.audits['largest-contentful-paint']?.score || null
},
speedIndex: {
value: report.audits['speed-index']?.numericValue || null,
unit: 'ms',
score: report.audits['speed-index']?.score || null
},
timeToInteractive: {
value: report.audits['interactive']?.numericValue || null,
unit: 'ms',
score: report.audits['interactive']?.score || null
},
totalBlockingTime: {
value: report.audits['total-blocking-time']?.numericValue || null,
unit: 'ms',
score: report.audits['total-blocking-time']?.score || null
},
cumulativeLayoutShift: {
value: report.audits['cumulative-layout-shift']?.numericValue || null,
unit: 'score',
score: report.audits['cumulative-layout-shift']?.score || null
}
},
// JavaScript and resource audits
resources: {
totalByteWeight: report.audits['total-byte-weight']?.numericValue || null,
bootupTime: report.audits['bootup-time']?.numericValue || null,
mainThreadWork: report.audits['mainthread-work-breakdown']?.numericValue || null,
// Unused resources
unusedJavascript: extractUnusedResources(report.audits['unused-javascript']),
unusedCss: extractUnusedResources(report.audits['unused-css-rules']),
// Render blocking resources
renderBlockingResources: extractRenderBlockingResources(report.audits['render-blocking-resources']),
// Large bundles
scriptTreemap: extractLargeScripts(report.audits['script-treemap-data']),
// Third party usage
thirdPartySummary: extractThirdPartySummary(report.audits['third-party-summary'])
},
// Diagnostics
diagnostics: {
numRequests: report.audits['network-requests']?.details?.items?.length || 0,
numScripts: countResourcesByType(report.audits['network-requests'], 'Script'),
numStylesheets: countResourcesByType(report.audits['network-requests'], 'Stylesheet'),
numImages: countResourcesByType(report.audits['network-requests'], 'Image'),
numFonts: countResourcesByType(report.audits['network-requests'], 'Font'),
totalTransferSize: report.audits['total-byte-weight']?.numericValue || 0
},
// Failed audits (potential issues)
issues: extractFailedAudits(report)
};
// Save full report to file for debugging
const reportPath = path.join(REPORTS_DIR, `${scanId}.json`);
await fs.writeFile(reportPath, JSON.stringify(report, null, 2));
console.log(`[${scanId}] Scan completed successfully`);
res.json(result);
} catch (error) {
console.error(`[${scanId}] Scan failed:`, error);
res.status(500).json({
error: 'Lighthouse scan failed',
message: error.message,
scanId
});
} finally {
if (chrome) {
await chrome.kill();
}
}
});
/**
* Get a saved report by ID
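 *
 * Example (scanId comes from a previous POST /scan response):
 *   GET /report/<scanId> -> returns the full Lighthouse LHR JSON saved on disk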
*/
app.get('/report/:scanId', async (req, res) => {
const { scanId } = req.params;
const reportPath = path.join(REPORTS_DIR, `${scanId}.json`);
try {
const report = await fs.readFile(reportPath, 'utf8');
res.json(JSON.parse(report));
} catch (error) {
res.status(404).json({ error: 'Report not found' });
}
});
// =============================================================================
// Helper Functions
// =============================================================================
function extractUnusedResources(audit) {
if (!audit?.details?.items) return [];
return audit.details.items.slice(0, 10).map(item => ({
url: item.url,
totalBytes: item.totalBytes,
wastedBytes: item.wastedBytes,
wastedPercent: item.wastedPercent
}));
}
function extractRenderBlockingResources(audit) {
if (!audit?.details?.items) return [];
return audit.details.items.map(item => ({
url: item.url,
wastedMs: item.wastedMs,
totalBytes: item.totalBytes
}));
}
function extractLargeScripts(audit) {
if (!audit?.details?.nodes) return [];
// Get scripts larger than 100KB
const largeScripts = [];
const processNode = (node, path = '') => {
const currentPath = path ? `${path}/${node.name}` : node.name;
if (node.resourceBytes > 100 * 1024) {
largeScripts.push({
name: currentPath,
resourceBytes: node.resourceBytes,
unusedBytes: node.unusedBytes || 0
});
}
if (node.children) {
node.children.forEach(child => processNode(child, currentPath));
}
};
audit.details.nodes.forEach(node => processNode(node));
return largeScripts.slice(0, 20);
}
function extractThirdPartySummary(audit) {
if (!audit?.details?.items) return [];
return audit.details.items.slice(0, 10).map(item => ({
entity: item.entity,
transferSize: item.transferSize,
blockingTime: item.blockingTime,
mainThreadTime: item.mainThreadTime
}));
}
function countResourcesByType(audit, type) {
if (!audit?.details?.items) return 0;
return audit.details.items.filter(item => item.resourceType === type).length;
}
function extractFailedAudits(report) {
const issues = [];
const categoriesToCheck = ['performance', 'accessibility', 'best-practices', 'seo'];
categoriesToCheck.forEach(categoryId => {
const category = report.categories[categoryId];
if (!category?.auditRefs) return;
category.auditRefs.forEach(ref => {
const audit = report.audits[ref.id];
// Include audits with score < 0.5 (50%)
if (audit && audit.score !== null && audit.score < 0.5) {
issues.push({
id: audit.id,
category: categoryId,
title: audit.title,
description: audit.description,
score: audit.score,
displayValue: audit.displayValue,
impact: ref.weight || 0
});
}
});
});
// Sort by impact (weight) descending
issues.sort((a, b) => b.impact - a.impact);
return issues.slice(0, 30);
}
// Start the server
app.listen(PORT, '0.0.0.0', () => {
console.log(`Lighthouse Scanner Service running on port ${PORT}`);
});