From 90ad47a721bf58630b2e19a96aafa3feca4e6efc Mon Sep 17 00:00:00 2001 From: Sereth1 Date: Mon, 8 Dec 2025 10:06:56 +0700 Subject: [PATCH] Initial commit: Lighthouse scanner service --- backend/.env.example | 27 + backend/Dockerfile | 84 +++ backend/api/__init__.py | 5 + backend/api/apps.py | 11 + backend/api/exceptions.py | 52 ++ backend/api/serializers.py | 243 +++++++++ backend/api/urls.py | 18 + backend/api/views.py | 336 ++++++++++++ backend/core/__init__.py | 9 + backend/core/asgi.py | 11 + backend/core/celery.py | 28 + backend/core/settings.py | 300 +++++++++++ backend/core/urls.py | 20 + backend/core/wsgi.py | 11 + backend/manage.py | 22 + backend/pyproject.toml | 91 ++++ backend/requirements.txt | 36 ++ backend/scanner/__init__.py | 5 + backend/scanner/apps.py | 11 + backend/scanner/scanners/__init__.py | 25 + backend/scanner/scanners/base.py | 161 ++++++ backend/scanner/scanners/headers.py | 405 ++++++++++++++ backend/scanner/scanners/lighthouse.py | 323 ++++++++++++ .../scanner/scanners/playwright_scanner.py | 397 ++++++++++++++ backend/scanner/scanners/runner.py | 314 +++++++++++ backend/scanner/scanners/tls.py | 380 ++++++++++++++ backend/scanner/scanners/zap.py | 307 +++++++++++ backend/scanner/tasks.py | 306 +++++++++++ backend/scanner/utils.py | 185 +++++++ backend/templates/base.html | 89 ++++ backend/websites/__init__.py | 5 + backend/websites/admin.py | 93 ++++ backend/websites/apps.py | 11 + backend/websites/models.py | 493 ++++++++++++++++++ docker-compose.yml | 160 ++++++ lighthouse/Dockerfile | 54 ++ lighthouse/package.json | 19 + lighthouse/server.js | 328 ++++++++++++ 38 files changed, 5375 insertions(+) create mode 100644 backend/.env.example create mode 100644 backend/Dockerfile create mode 100644 backend/api/__init__.py create mode 100644 backend/api/apps.py create mode 100644 backend/api/exceptions.py create mode 100644 backend/api/serializers.py create mode 100644 backend/api/urls.py create mode 100644 backend/api/views.py create mode 100644 backend/core/__init__.py create mode 100644 backend/core/asgi.py create mode 100644 backend/core/celery.py create mode 100644 backend/core/settings.py create mode 100644 backend/core/urls.py create mode 100644 backend/core/wsgi.py create mode 100644 backend/manage.py create mode 100644 backend/pyproject.toml create mode 100644 backend/requirements.txt create mode 100644 backend/scanner/__init__.py create mode 100644 backend/scanner/apps.py create mode 100644 backend/scanner/scanners/__init__.py create mode 100644 backend/scanner/scanners/base.py create mode 100644 backend/scanner/scanners/headers.py create mode 100644 backend/scanner/scanners/lighthouse.py create mode 100644 backend/scanner/scanners/playwright_scanner.py create mode 100644 backend/scanner/scanners/runner.py create mode 100644 backend/scanner/scanners/tls.py create mode 100644 backend/scanner/scanners/zap.py create mode 100644 backend/scanner/tasks.py create mode 100644 backend/scanner/utils.py create mode 100644 backend/templates/base.html create mode 100644 backend/websites/__init__.py create mode 100644 backend/websites/admin.py create mode 100644 backend/websites/apps.py create mode 100644 backend/websites/models.py create mode 100644 docker-compose.yml create mode 100644 lighthouse/Dockerfile create mode 100644 lighthouse/package.json create mode 100644 lighthouse/server.js diff --git a/backend/.env.example b/backend/.env.example new file mode 100644 index 0000000..913933e --- /dev/null +++ b/backend/.env.example @@ -0,0 +1,27 @@ +# Django Core 
Settings +DEBUG=True +SECRET_KEY=your-secret-key-change-in-production-abc123xyz789 +ALLOWED_HOSTS=localhost,127.0.0.1,web + +# Database +DATABASE_URL=postgres://analyzer:analyzer_password@db:5432/website_analyzer + +# Redis & Celery +REDIS_URL=redis://redis:6379/0 +CELERY_BROKER_URL=redis://redis:6379/0 +CELERY_RESULT_BACKEND=redis://redis:6379/1 + +# OWASP ZAP Configuration +ZAP_API_KEY=zap-api-key-change-me +ZAP_HOST=http://zap:8080 + +# Lighthouse Configuration +LIGHTHOUSE_CHROME_FLAGS=--headless --no-sandbox --disable-gpu + +# Scan Settings +MAX_SCAN_TIME_SECONDS=300 +SCAN_RATE_LIMIT_MINUTES=5 +MAX_CONCURRENT_SCANS=3 + +# Security +CORS_ALLOWED_ORIGINS=http://localhost:3000,http://localhost:8000 diff --git a/backend/Dockerfile b/backend/Dockerfile new file mode 100644 index 0000000..343619c --- /dev/null +++ b/backend/Dockerfile @@ -0,0 +1,84 @@ +# Website Analyzer Backend - Dockerfile +# Multi-stage build for efficient image size + +FROM python:3.11-slim as builder + +# Set environment variables +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PIP_NO_CACHE_DIR=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 + +WORKDIR /app + +# Install system dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + libpq-dev \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# Install Python dependencies +COPY requirements.txt . +RUN pip install --user -r requirements.txt + +# Install Playwright and its dependencies +RUN pip install --user playwright && \ + python -m playwright install chromium && \ + python -m playwright install-deps chromium + +# ========================================================================== +# Production Stage +# ========================================================================== +FROM python:3.11-slim + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PATH="/root/.local/bin:$PATH" + +WORKDIR /app + +# Install runtime dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + libpq5 \ + curl \ + # Playwright/Chromium dependencies + libnss3 \ + libnspr4 \ + libatk1.0-0 \ + libatk-bridge2.0-0 \ + libcups2 \ + libdrm2 \ + libdbus-1-3 \ + libxkbcommon0 \ + libxcomposite1 \ + libxdamage1 \ + libxfixes3 \ + libxrandr2 \ + libgbm1 \ + libasound2 \ + libpango-1.0-0 \ + libcairo2 \ + libatspi2.0-0 \ + && rm -rf /var/lib/apt/lists/* + +# Copy Python packages from builder +COPY --from=builder /root/.local /root/.local +COPY --from=builder /root/.cache/ms-playwright /root/.cache/ms-playwright + +# Copy application code +COPY . . + +# Create logs directory +RUN mkdir -p logs staticfiles + +# Create non-root user for security +RUN useradd -m -u 1000 appuser && \ + chown -R appuser:appuser /app /root/.local /root/.cache +USER appuser + +# Expose port +EXPOSE 8000 + +# Default command +CMD ["gunicorn", "core.wsgi:application", "--bind", "0.0.0.0:8000", "--workers", "4"] diff --git a/backend/api/__init__.py b/backend/api/__init__.py new file mode 100644 index 0000000..4883678 --- /dev/null +++ b/backend/api/__init__.py @@ -0,0 +1,5 @@ +""" +API app initialization. +""" + +default_app_config = 'api.apps.ApiConfig' diff --git a/backend/api/apps.py b/backend/api/apps.py new file mode 100644 index 0000000..ac8ea5d --- /dev/null +++ b/backend/api/apps.py @@ -0,0 +1,11 @@ +""" +API app configuration. 
+""" + +from django.apps import AppConfig + + +class ApiConfig(AppConfig): + default_auto_field = 'django.db.models.BigAutoField' + name = 'api' + verbose_name = 'REST API' diff --git a/backend/api/exceptions.py b/backend/api/exceptions.py new file mode 100644 index 0000000..4a8234a --- /dev/null +++ b/backend/api/exceptions.py @@ -0,0 +1,52 @@ +""" +Custom exception handler for DRF. +""" + +from rest_framework.views import exception_handler +from rest_framework.response import Response +from rest_framework import status +import logging + +logger = logging.getLogger(__name__) + + +def custom_exception_handler(exc, context): + """ + Custom exception handler that provides consistent error responses. + + Handles common exceptions and formats them consistently. + """ + # Call REST framework's default exception handler first + response = exception_handler(exc, context) + + if response is not None: + # Customize the response data + custom_response_data = { + 'error': True, + 'status_code': response.status_code, + } + + if isinstance(response.data, dict): + if 'detail' in response.data: + custom_response_data['message'] = str(response.data['detail']) + else: + custom_response_data['errors'] = response.data + elif isinstance(response.data, list): + custom_response_data['errors'] = response.data + else: + custom_response_data['message'] = str(response.data) + + response.data = custom_response_data + return response + + # Handle unexpected exceptions + logger.exception(f"Unhandled exception: {exc}") + + return Response( + { + 'error': True, + 'status_code': 500, + 'message': 'An unexpected error occurred', + }, + status=status.HTTP_500_INTERNAL_SERVER_ERROR + ) diff --git a/backend/api/serializers.py b/backend/api/serializers.py new file mode 100644 index 0000000..fed1333 --- /dev/null +++ b/backend/api/serializers.py @@ -0,0 +1,243 @@ +""" +DRF Serializers for the API. + +This module defines serializers for converting model instances +to JSON and validating input data. 
+""" + +from rest_framework import serializers +from websites.models import Website, Scan, Issue, Metric, ScanStatus + + +class IssueSerializer(serializers.ModelSerializer): + """Serializer for Issue model.""" + + severity_display = serializers.CharField(source='get_severity_display', read_only=True) + category_display = serializers.CharField(source='get_category_display', read_only=True) + tool_display = serializers.CharField(source='get_tool_display', read_only=True) + + class Meta: + model = Issue + fields = [ + 'id', + 'category', + 'category_display', + 'severity', + 'severity_display', + 'tool', + 'tool_display', + 'title', + 'description', + 'affected_url', + 'remediation', + 'created_at', + ] + read_only_fields = fields + + +class MetricSerializer(serializers.ModelSerializer): + """Serializer for Metric model.""" + + formatted_value = serializers.CharField(source='get_formatted_value', read_only=True) + unit_display = serializers.CharField(source='get_unit_display', read_only=True) + + class Meta: + model = Metric + fields = [ + 'id', + 'name', + 'display_name', + 'value', + 'unit', + 'unit_display', + 'formatted_value', + 'source', + 'score', + ] + read_only_fields = fields + + +class ScanListSerializer(serializers.ModelSerializer): + """Serializer for Scan list views (minimal data).""" + + status_display = serializers.CharField(source='get_status_display', read_only=True) + website_url = serializers.CharField(source='website.url', read_only=True) + issues_count = serializers.SerializerMethodField() + + class Meta: + model = Scan + fields = [ + 'id', + 'website_url', + 'status', + 'status_display', + 'created_at', + 'completed_at', + 'overall_score', + 'performance_score', + 'security_score', + 'issues_count', + ] + read_only_fields = fields + + def get_issues_count(self, obj): + return obj.issues.count() + + +class ScanDetailSerializer(serializers.ModelSerializer): + """Serializer for Scan detail views (full data).""" + + status_display = serializers.CharField(source='get_status_display', read_only=True) + website_url = serializers.CharField(source='website.url', read_only=True) + website_domain = serializers.CharField(source='website.domain', read_only=True) + issues = IssueSerializer(many=True, read_only=True) + metrics = MetricSerializer(many=True, read_only=True) + issues_by_category = serializers.SerializerMethodField() + issues_by_severity = serializers.SerializerMethodField() + + class Meta: + model = Scan + fields = [ + 'id', + 'website_url', + 'website_domain', + 'status', + 'status_display', + 'created_at', + 'started_at', + 'completed_at', + 'overall_score', + 'performance_score', + 'accessibility_score', + 'seo_score', + 'best_practices_score', + 'security_score', + 'error_message', + 'issues', + 'metrics', + 'issues_by_category', + 'issues_by_severity', + ] + read_only_fields = fields + + def get_issues_by_category(self, obj): + """Group issues by category.""" + from collections import defaultdict + grouped = defaultdict(list) + + for issue in obj.issues.all(): + grouped[issue.category].append(IssueSerializer(issue).data) + + return dict(grouped) + + def get_issues_by_severity(self, obj): + """Count issues by severity.""" + from django.db.models import Count + + counts = obj.issues.values('severity').annotate(count=Count('id')) + return {item['severity']: item['count'] for item in counts} + + +class ScanCreateSerializer(serializers.Serializer): + """Serializer for creating new scans.""" + + url = serializers.URLField( + required=True, + help_text="The URL to scan 
(must be http or https)" + ) + + def validate_url(self, value): + """Validate and normalize the URL.""" + from scanner.utils import validate_url + + is_valid, result = validate_url(value) + + if not is_valid: + raise serializers.ValidationError(result) + + return result # Return normalized URL + + def create(self, validated_data): + """Create Website and Scan records.""" + from scanner.tasks import check_rate_limit, check_concurrent_scan_limit, run_scan_task + + url = validated_data['url'] + + # Check rate limit + rate_limit_error = check_rate_limit(url) + if rate_limit_error: + raise serializers.ValidationError({'url': rate_limit_error}) + + # Check concurrent scan limit + concurrent_error = check_concurrent_scan_limit() + if concurrent_error: + raise serializers.ValidationError({'non_field_errors': concurrent_error}) + + # Get or create Website + website, created = Website.objects.get_or_create( + url=url, + defaults={'domain': validated_data.get('domain', '')} + ) + + # Create Scan + scan = Scan.objects.create( + website=website, + status=ScanStatus.PENDING + ) + + # Trigger Celery task + task = run_scan_task.delay(str(scan.id)) + + # Update scan with task ID + scan.celery_task_id = task.id + scan.save(update_fields=['celery_task_id']) + + return scan + + +class WebsiteSerializer(serializers.ModelSerializer): + """Serializer for Website model.""" + + scans_count = serializers.SerializerMethodField() + latest_scan = serializers.SerializerMethodField() + + class Meta: + model = Website + fields = [ + 'id', + 'url', + 'domain', + 'created_at', + 'last_scanned_at', + 'scans_count', + 'latest_scan', + ] + read_only_fields = fields + + def get_scans_count(self, obj): + return obj.scans.count() + + def get_latest_scan(self, obj): + latest = obj.scans.first() + if latest: + return ScanListSerializer(latest).data + return None + + +class WebsiteDetailSerializer(WebsiteSerializer): + """Detailed Website serializer with scan list.""" + + scans = ScanListSerializer(many=True, read_only=True) + + class Meta(WebsiteSerializer.Meta): + fields = WebsiteSerializer.Meta.fields + ['scans'] + + +class HealthCheckSerializer(serializers.Serializer): + """Serializer for health check response.""" + + status = serializers.CharField() + database = serializers.CharField() + redis = serializers.CharField() + celery = serializers.CharField() + timestamp = serializers.DateTimeField() diff --git a/backend/api/urls.py b/backend/api/urls.py new file mode 100644 index 0000000..1f8af24 --- /dev/null +++ b/backend/api/urls.py @@ -0,0 +1,18 @@ +""" +URL routing for the API. +""" + +from django.urls import path, include +from rest_framework.routers import DefaultRouter +from . import views + +router = DefaultRouter() +router.register(r'scans', views.ScanViewSet, basename='scan') +router.register(r'websites', views.WebsiteViewSet, basename='website') +router.register(r'issues', views.IssueViewSet, basename='issue') + +urlpatterns = [ + path('', views.api_root, name='api-root'), + path('health/', views.health_check, name='health-check'), + path('', include(router.urls)), +] diff --git a/backend/api/views.py b/backend/api/views.py new file mode 100644 index 0000000..8a0f95b --- /dev/null +++ b/backend/api/views.py @@ -0,0 +1,336 @@ +""" +DRF Views for the API. + +This module defines API views for scans, websites, and issues. 
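+
+Typical request flow (illustrative; paths follow the router registrations in
+api/urls.py and the 'api/' prefix in core/urls.py):
+
+    POST /api/scans/              body {"url": "https://example.com"}  -> 201, pending scan
+    GET  /api/scans/{id}/status/  poll lightweight progress
+    GET  /api/scans/{id}/         full issues and metrics once the scan is done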
+""" + +import logging +from django.db import connection +from django.utils import timezone +from django.core.cache import cache +from rest_framework import viewsets, status, generics +from rest_framework.decorators import api_view, action +from rest_framework.response import Response +from rest_framework.pagination import PageNumberPagination +from rest_framework.throttling import AnonRateThrottle + +from websites.models import Website, Scan, Issue, Metric +from .serializers import ( + WebsiteSerializer, + WebsiteDetailSerializer, + ScanListSerializer, + ScanDetailSerializer, + ScanCreateSerializer, + IssueSerializer, + MetricSerializer, + HealthCheckSerializer, +) + +logger = logging.getLogger(__name__) + + +class ScanRateThrottle(AnonRateThrottle): + """Custom throttle for scan creation.""" + rate = '10/hour' + + +class StandardResultsPagination(PageNumberPagination): + """Standard pagination for list views.""" + page_size = 20 + page_size_query_param = 'page_size' + max_page_size = 100 + + +class ScanViewSet(viewsets.ModelViewSet): + """ + ViewSet for Scan operations. + + Endpoints: + - POST /api/scans/ - Create a new scan + - GET /api/scans/ - List all scans + - GET /api/scans/{id}/ - Get scan details + - DELETE /api/scans/{id}/ - Delete a scan + """ + + queryset = Scan.objects.select_related('website').prefetch_related('issues', 'metrics') + pagination_class = StandardResultsPagination + + def get_serializer_class(self): + if self.action == 'list': + return ScanListSerializer + elif self.action == 'create': + return ScanCreateSerializer + return ScanDetailSerializer + + def get_throttles(self): + if self.action == 'create': + return [ScanRateThrottle()] + return super().get_throttles() + + def create(self, request, *args, **kwargs): + """ + Create a new scan. + + Request body: + ```json + {"url": "https://example.com"} + ``` + + Returns the created scan with pending status. + The scan will be processed asynchronously. 
+ """ + serializer = self.get_serializer(data=request.data) + serializer.is_valid(raise_exception=True) + + try: + scan = serializer.save() + + # Return the created scan details + response_serializer = ScanDetailSerializer(scan) + return Response( + response_serializer.data, + status=status.HTTP_201_CREATED + ) + except Exception as e: + logger.exception("Error creating scan") + return Response( + {'error': str(e)}, + status=status.HTTP_500_INTERNAL_SERVER_ERROR + ) + + @action(detail=True, methods=['get']) + def issues(self, request, pk=None): + """Get all issues for a scan.""" + scan = self.get_object() + issues = scan.issues.all() + + # Optional filtering + category = request.query_params.get('category') + severity = request.query_params.get('severity') + tool = request.query_params.get('tool') + + if category: + issues = issues.filter(category=category) + if severity: + issues = issues.filter(severity=severity) + if tool: + issues = issues.filter(tool=tool) + + serializer = IssueSerializer(issues, many=True) + return Response(serializer.data) + + @action(detail=True, methods=['get']) + def metrics(self, request, pk=None): + """Get all metrics for a scan.""" + scan = self.get_object() + metrics = scan.metrics.all() + + # Optional filtering by source + source = request.query_params.get('source') + if source: + metrics = metrics.filter(source=source) + + serializer = MetricSerializer(metrics, many=True) + return Response(serializer.data) + + @action(detail=True, methods=['get']) + def status(self, request, pk=None): + """Get just the status of a scan (for polling).""" + scan = self.get_object() + return Response({ + 'id': str(scan.id), + 'status': scan.status, + 'status_display': scan.get_status_display(), + 'progress': self._get_scan_progress(scan), + }) + + def _get_scan_progress(self, scan): + """Estimate scan progress based on status and results.""" + if scan.status == 'done': + return 100 + elif scan.status == 'failed': + return 0 + elif scan.status == 'running': + # Estimate based on what data we have + progress = 10 # Started + if scan.raw_headers_data: + progress += 20 + if scan.raw_playwright_data: + progress += 25 + if scan.raw_lighthouse_data: + progress += 30 + if scan.raw_zap_data: + progress += 15 + return min(progress, 95) + return 0 + + +class WebsiteViewSet(viewsets.ReadOnlyModelViewSet): + """ + ViewSet for Website operations. + + Endpoints: + - GET /api/websites/ - List all websites + - GET /api/websites/{id}/ - Get website details + - GET /api/websites/{id}/scans/ - Get scans for a website + """ + + queryset = Website.objects.prefetch_related('scans') + pagination_class = StandardResultsPagination + + def get_serializer_class(self): + if self.action == 'retrieve': + return WebsiteDetailSerializer + return WebsiteSerializer + + @action(detail=True, methods=['get']) + def scans(self, request, pk=None): + """Get all scans for a website.""" + website = self.get_object() + scans = website.scans.all() + + # Apply pagination + page = self.paginate_queryset(scans) + if page is not None: + serializer = ScanListSerializer(page, many=True) + return self.get_paginated_response(serializer.data) + + serializer = ScanListSerializer(scans, many=True) + return Response(serializer.data) + + +class IssueViewSet(viewsets.ReadOnlyModelViewSet): + """ + ViewSet for Issue operations. 
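+
+    Supports query-parameter filtering via get_queryset(), e.g. (illustrative):
+
+        GET /api/issues/?scan=<scan-id>&severity=high&category=headers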
+ + Endpoints: + - GET /api/issues/ - List all issues (with filtering) + - GET /api/issues/{id}/ - Get issue details + """ + + queryset = Issue.objects.select_related('scan', 'scan__website') + serializer_class = IssueSerializer + pagination_class = StandardResultsPagination + + def get_queryset(self): + queryset = super().get_queryset() + + # Filter by scan + scan_id = self.request.query_params.get('scan') + if scan_id: + queryset = queryset.filter(scan_id=scan_id) + + # Filter by category + category = self.request.query_params.get('category') + if category: + queryset = queryset.filter(category=category) + + # Filter by severity + severity = self.request.query_params.get('severity') + if severity: + queryset = queryset.filter(severity=severity) + + # Filter by tool + tool = self.request.query_params.get('tool') + if tool: + queryset = queryset.filter(tool=tool) + + return queryset + + +@api_view(['GET']) +def health_check(request): + """ + Health check endpoint. + + Checks: + - Database connectivity + - Redis connectivity + - Celery worker status + + Returns health status of all components. + """ + health = { + 'status': 'healthy', + 'database': 'unknown', + 'redis': 'unknown', + 'celery': 'unknown', + 'timestamp': timezone.now(), + } + + # Check database + try: + connection.ensure_connection() + health['database'] = 'healthy' + except Exception as e: + health['database'] = f'unhealthy: {e}' + health['status'] = 'unhealthy' + + # Check Redis + try: + cache.set('health_check', 'ok', 10) + if cache.get('health_check') == 'ok': + health['redis'] = 'healthy' + else: + health['redis'] = 'unhealthy: cache not working' + health['status'] = 'degraded' + except Exception as e: + health['redis'] = f'unhealthy: {e}' + health['status'] = 'degraded' + + # Check Celery (basic check) + try: + from core.celery import app as celery_app + inspect = celery_app.control.inspect() + + # Try to get active workers + active = inspect.active() + if active: + health['celery'] = f'healthy ({len(active)} workers)' + else: + health['celery'] = 'degraded: no active workers' + health['status'] = 'degraded' + except Exception as e: + health['celery'] = f'unknown: {e}' + + status_code = 200 if health['status'] == 'healthy' else 503 + + serializer = HealthCheckSerializer(health) + return Response(serializer.data, status=status_code) + + +@api_view(['GET']) +def api_root(request): + """ + API root endpoint. + + Returns available endpoints and basic API information. + """ + return Response({ + 'message': 'Website Analyzer API', + 'version': '1.0.0', + 'endpoints': { + 'scans': '/api/scans/', + 'websites': '/api/websites/', + 'issues': '/api/issues/', + 'health': '/api/health/', + }, + 'documentation': { + 'create_scan': { + 'method': 'POST', + 'url': '/api/scans/', + 'body': {'url': 'https://example.com'}, + 'description': 'Create a new website scan' + }, + 'get_scan': { + 'method': 'GET', + 'url': '/api/scans/{id}/', + 'description': 'Get scan results and details' + }, + 'list_scans': { + 'method': 'GET', + 'url': '/api/scans/', + 'description': 'List all scans with pagination' + }, + } + }) diff --git a/backend/core/__init__.py b/backend/core/__init__.py new file mode 100644 index 0000000..406470b --- /dev/null +++ b/backend/core/__init__.py @@ -0,0 +1,9 @@ +""" +Core module initialization. + +This module loads the Celery app so that shared_task will use this app. 
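+
+For illustration, a task declared anywhere in an installed app as
+
+    from celery import shared_task
+
+    @shared_task
+    def ping():
+        return "pong"
+
+is picked up through this app once autodiscover_tasks() runs (see core/celery.py).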
+""" + +from .celery import app as celery_app + +__all__ = ('celery_app',) diff --git a/backend/core/asgi.py b/backend/core/asgi.py new file mode 100644 index 0000000..807ba6f --- /dev/null +++ b/backend/core/asgi.py @@ -0,0 +1,11 @@ +""" +ASGI config for Website Analyzer project. +""" + +import os + +from django.core.asgi import get_asgi_application + +os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings') + +application = get_asgi_application() diff --git a/backend/core/celery.py b/backend/core/celery.py new file mode 100644 index 0000000..4612894 --- /dev/null +++ b/backend/core/celery.py @@ -0,0 +1,28 @@ +""" +Celery configuration for Website Analyzer. + +This module configures Celery for asynchronous task processing, +specifically for running website scans in the background. +""" + +import os + +from celery import Celery + +# Set the default Django settings module for the 'celery' program. +os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings') + +app = Celery('website_analyzer') + +# Using a string here means the worker doesn't have to serialize +# the configuration object to child processes. +app.config_from_object('django.conf:settings', namespace='CELERY') + +# Load task modules from all registered Django apps. +app.autodiscover_tasks() + + +@app.task(bind=True, ignore_result=True) +def debug_task(self): + """Debug task for testing Celery connectivity.""" + print(f'Request: {self.request!r}') diff --git a/backend/core/settings.py b/backend/core/settings.py new file mode 100644 index 0000000..209c956 --- /dev/null +++ b/backend/core/settings.py @@ -0,0 +1,300 @@ +""" +Django settings for Website Analyzer project. + +This module contains all configuration settings for the Django application, +including database, caching, security, and third-party integrations. +""" + +import os +from pathlib import Path + +from dotenv import load_dotenv + +# Load environment variables from .env file +load_dotenv() + +# Build paths inside the project like this: BASE_DIR / 'subdir'. +BASE_DIR = Path(__file__).resolve().parent.parent + +# SECURITY WARNING: keep the secret key used in production secret! +SECRET_KEY = os.getenv('SECRET_KEY', 'django-insecure-change-me-in-production') + +# SECURITY WARNING: don't run with debug turned on in production! 
+DEBUG = os.getenv('DEBUG', 'False').lower() in ('true', '1', 'yes') + +ALLOWED_HOSTS = os.getenv('ALLOWED_HOSTS', 'localhost,127.0.0.1').split(',') + + +# Application definition + +INSTALLED_APPS = [ + 'django.contrib.admin', + 'django.contrib.auth', + 'django.contrib.contenttypes', + 'django.contrib.sessions', + 'django.contrib.messages', + 'django.contrib.staticfiles', + + # Third-party apps + 'rest_framework', + 'corsheaders', + + # Local apps + 'websites', + 'scanner', + 'api', +] + +MIDDLEWARE = [ + 'django.middleware.security.SecurityMiddleware', + 'whitenoise.middleware.WhiteNoiseMiddleware', + 'corsheaders.middleware.CorsMiddleware', + 'django.contrib.sessions.middleware.SessionMiddleware', + 'django.middleware.common.CommonMiddleware', + 'django.middleware.csrf.CsrfViewMiddleware', + 'django.contrib.auth.middleware.AuthenticationMiddleware', + 'django.contrib.messages.middleware.MessageMiddleware', + 'django.middleware.clickjacking.XFrameOptionsMiddleware', +] + +ROOT_URLCONF = 'core.urls' + +TEMPLATES = [ + { + 'BACKEND': 'django.template.backends.django.DjangoTemplates', + 'DIRS': [BASE_DIR / 'templates'], + 'APP_DIRS': True, + 'OPTIONS': { + 'context_processors': [ + 'django.template.context_processors.debug', + 'django.template.context_processors.request', + 'django.contrib.auth.context_processors.auth', + 'django.contrib.messages.context_processors.messages', + ], + }, + }, +] + +WSGI_APPLICATION = 'core.wsgi.application' + + +# Database +# Parse DATABASE_URL or use default PostgreSQL settings + +DATABASE_URL = os.getenv('DATABASE_URL', 'postgres://analyzer:analyzer_password@localhost:5432/website_analyzer') + +# Parse the DATABASE_URL +import re +db_pattern = r'postgres://(?P[^:]+):(?P[^@]+)@(?P[^:]+):(?P\d+)/(?P.+)' +db_match = re.match(db_pattern, DATABASE_URL) + +if db_match: + DATABASES = { + 'default': { + 'ENGINE': 'django.db.backends.postgresql', + 'NAME': db_match.group('name'), + 'USER': db_match.group('user'), + 'PASSWORD': db_match.group('password'), + 'HOST': db_match.group('host'), + 'PORT': db_match.group('port'), + } + } +else: + # Fallback for development + DATABASES = { + 'default': { + 'ENGINE': 'django.db.backends.sqlite3', + 'NAME': BASE_DIR / 'db.sqlite3', + } + } + + +# Password validation +AUTH_PASSWORD_VALIDATORS = [ + { + 'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator', + }, + { + 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator', + }, + { + 'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator', + }, + { + 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator', + }, +] + + +# Internationalization +LANGUAGE_CODE = 'en-us' +TIME_ZONE = 'UTC' +USE_I18N = True +USE_TZ = True + + +# Static files (CSS, JavaScript, Images) +STATIC_URL = 'static/' +STATIC_ROOT = BASE_DIR / 'staticfiles' +STATICFILES_DIRS = [BASE_DIR / 'static'] +STATICFILES_STORAGE = 'whitenoise.storage.CompressedManifestStaticFilesStorage' + +# Default primary key field type +DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField' + + +# ============================================================================= +# REST Framework Configuration +# ============================================================================= +REST_FRAMEWORK = { + 'DEFAULT_RENDERER_CLASSES': [ + 'rest_framework.renderers.JSONRenderer', + 'rest_framework.renderers.BrowsableAPIRenderer', + ], + 'DEFAULT_PAGINATION_CLASS': 'rest_framework.pagination.PageNumberPagination', + 'PAGE_SIZE': 20, + 
'DEFAULT_THROTTLE_CLASSES': [ + 'rest_framework.throttling.AnonRateThrottle', + 'rest_framework.throttling.UserRateThrottle' + ], + 'DEFAULT_THROTTLE_RATES': { + 'anon': '100/hour', + 'user': '1000/hour', + 'scan': '10/hour', # Specific rate for scan creation + }, + 'EXCEPTION_HANDLER': 'api.exceptions.custom_exception_handler', +} + + +# ============================================================================= +# CORS Configuration +# ============================================================================= +CORS_ALLOWED_ORIGINS = os.getenv( + 'CORS_ALLOWED_ORIGINS', + 'http://localhost:3000,http://localhost:8000' +).split(',') +CORS_ALLOW_CREDENTIALS = True + + +# ============================================================================= +# Celery Configuration +# ============================================================================= +CELERY_BROKER_URL = os.getenv('CELERY_BROKER_URL', 'redis://localhost:6379/0') +CELERY_RESULT_BACKEND = os.getenv('CELERY_RESULT_BACKEND', 'redis://localhost:6379/1') +CELERY_ACCEPT_CONTENT = ['json'] +CELERY_TASK_SERIALIZER = 'json' +CELERY_RESULT_SERIALIZER = 'json' +CELERY_TIMEZONE = TIME_ZONE +CELERY_TASK_TRACK_STARTED = True +CELERY_TASK_TIME_LIMIT = int(os.getenv('MAX_SCAN_TIME_SECONDS', '300')) +CELERY_TASK_SOFT_TIME_LIMIT = CELERY_TASK_TIME_LIMIT - 30 + + +# ============================================================================= +# Redis Cache Configuration +# ============================================================================= +REDIS_URL = os.getenv('REDIS_URL', 'redis://localhost:6379/0') +CACHES = { + 'default': { + 'BACKEND': 'django.core.cache.backends.redis.RedisCache', + 'LOCATION': REDIS_URL, + } +} + + +# ============================================================================= +# Scanner Configuration +# ============================================================================= +SCANNER_CONFIG = { + # OWASP ZAP settings + 'ZAP_API_KEY': os.getenv('ZAP_API_KEY', ''), + 'ZAP_HOST': os.getenv('ZAP_HOST', 'http://localhost:8080'), + 'ZAP_TIMEOUT': 120, + + # Lighthouse settings + 'LIGHTHOUSE_CHROME_FLAGS': os.getenv( + 'LIGHTHOUSE_CHROME_FLAGS', + '--headless --no-sandbox --disable-gpu' + ), + 'LIGHTHOUSE_TIMEOUT': 60, + + # Playwright settings + 'PLAYWRIGHT_TIMEOUT': 30000, # milliseconds + 'PLAYWRIGHT_VIEWPORT': {'width': 1920, 'height': 1080}, + + # General scan settings + 'MAX_SCAN_TIME_SECONDS': int(os.getenv('MAX_SCAN_TIME_SECONDS', '300')), + 'SCAN_RATE_LIMIT_MINUTES': int(os.getenv('SCAN_RATE_LIMIT_MINUTES', '5')), + 'MAX_CONCURRENT_SCANS': int(os.getenv('MAX_CONCURRENT_SCANS', '3')), + + # Safety settings - blocked IP ranges (RFC1918 private ranges + localhost) + 'BLOCKED_IP_RANGES': [ + '10.0.0.0/8', + '172.16.0.0/12', + '192.168.0.0/16', + '127.0.0.0/8', + '169.254.0.0/16', # Link-local + '::1/128', # IPv6 localhost + 'fc00::/7', # IPv6 private + 'fe80::/10', # IPv6 link-local + ], + 'BLOCKED_HOSTS': ['localhost', 'localhost.localdomain'], + + # Large file thresholds + 'LARGE_IMAGE_THRESHOLD_BYTES': 1024 * 1024, # 1 MB + 'LARGE_JS_BUNDLE_THRESHOLD_BYTES': 500 * 1024, # 500 KB +} + + +# ============================================================================= +# Logging Configuration +# ============================================================================= +LOGGING = { + 'version': 1, + 'disable_existing_loggers': False, + 'formatters': { + 'verbose': { + 'format': '{levelname} {asctime} {module} {process:d} {thread:d} {message}', + 'style': '{', + }, + 'simple': { + 'format': 
'{levelname} {asctime} {module} {message}', + 'style': '{', + }, + }, + 'handlers': { + 'console': { + 'class': 'logging.StreamHandler', + 'formatter': 'simple', + }, + 'file': { + 'class': 'logging.FileHandler', + 'filename': BASE_DIR / 'logs' / 'django.log', + 'formatter': 'verbose', + }, + }, + 'root': { + 'handlers': ['console'], + 'level': 'INFO', + }, + 'loggers': { + 'django': { + 'handlers': ['console'], + 'level': os.getenv('DJANGO_LOG_LEVEL', 'INFO'), + 'propagate': False, + }, + 'scanner': { + 'handlers': ['console'], + 'level': 'DEBUG' if DEBUG else 'INFO', + 'propagate': False, + }, + 'celery': { + 'handlers': ['console'], + 'level': 'INFO', + 'propagate': False, + }, + }, +} + +# Create logs directory if it doesn't exist +(BASE_DIR / 'logs').mkdir(exist_ok=True) diff --git a/backend/core/urls.py b/backend/core/urls.py new file mode 100644 index 0000000..445efb3 --- /dev/null +++ b/backend/core/urls.py @@ -0,0 +1,20 @@ +""" +URL configuration for Website Analyzer project. +""" + +from django.contrib import admin +from django.urls import path, include +from django.views.generic import TemplateView + + +urlpatterns = [ + # Admin + path('admin/', admin.site.urls), + + # API endpoints + path('api/', include('api.urls')), + + # Frontend views + path('', TemplateView.as_view(template_name='index.html'), name='home'), + path('scan//', TemplateView.as_view(template_name='scan_detail.html'), name='scan_detail'), +] diff --git a/backend/core/wsgi.py b/backend/core/wsgi.py new file mode 100644 index 0000000..31c1c26 --- /dev/null +++ b/backend/core/wsgi.py @@ -0,0 +1,11 @@ +""" +WSGI config for Website Analyzer project. +""" + +import os + +from django.core.wsgi import get_wsgi_application + +os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings') + +application = get_wsgi_application() diff --git a/backend/manage.py b/backend/manage.py new file mode 100644 index 0000000..f2a662c --- /dev/null +++ b/backend/manage.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python +"""Django's command-line utility for administrative tasks.""" +import os +import sys + + +def main(): + """Run administrative tasks.""" + os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings') + try: + from django.core.management import execute_from_command_line + except ImportError as exc: + raise ImportError( + "Couldn't import Django. Are you sure it's installed and " + "available on your PYTHONPATH environment variable? Did you " + "forget to activate a virtual environment?" 
+ ) from exc + execute_from_command_line(sys.argv) + + +if __name__ == '__main__': + main() diff --git a/backend/pyproject.toml b/backend/pyproject.toml new file mode 100644 index 0000000..998685b --- /dev/null +++ b/backend/pyproject.toml @@ -0,0 +1,91 @@ +[build-system] +requires = ["setuptools>=61.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "website-analyzer" +version = "1.0.0" +description = "A Django-based web application for analyzing website performance, security, and best practices" +readme = "README.md" +license = {text = "MIT"} +requires-python = ">=3.11" +authors = [ + {name = "Website Analyzer Team"} +] +classifiers = [ + "Development Status :: 4 - Beta", + "Framework :: Django :: 5.0", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] + +dependencies = [ + "Django>=5.0,<6.0", + "djangorestframework>=3.14.0", + "django-cors-headers>=4.3.0", + "psycopg2-binary>=2.9.9", + "celery[redis]>=5.3.0", + "redis>=5.0.0", + "httpx>=0.26.0", + "playwright>=1.40.0", + "python-dotenv>=1.0.0", + "gunicorn>=21.2.0", + "whitenoise>=6.6.0", + "validators>=0.22.0", + "ipaddress>=1.0.23", +] + +[project.optional-dependencies] +dev = [ + "pytest>=7.4.0", + "pytest-django>=4.7.0", + "pytest-asyncio>=0.23.0", + "pytest-cov>=4.1.0", + "black>=23.12.0", + "isort>=5.13.0", + "flake8>=7.0.0", + "mypy>=1.8.0", + "django-stubs>=4.2.0", +] + +[tool.black] +line-length = 100 +target-version = ['py311'] +include = '\.pyi?$' +exclude = ''' +/( + \.git + | \.hg + | \.mypy_cache + | \.tox + | \.venv + | _build + | buck-out + | build + | dist + | migrations +)/ +''' + +[tool.isort] +profile = "black" +line_length = 100 +skip = ["migrations", ".venv"] + +[tool.pytest.ini_options] +DJANGO_SETTINGS_MODULE = "core.settings" +python_files = ["test_*.py", "*_test.py"] +addopts = "-v --tb=short" + +[tool.mypy] +python_version = "3.11" +plugins = ["mypy_django_plugin.main"] +ignore_missing_imports = true +strict = false + +[tool.django-stubs] +django_settings_module = "core.settings" diff --git a/backend/requirements.txt b/backend/requirements.txt new file mode 100644 index 0000000..bad6194 --- /dev/null +++ b/backend/requirements.txt @@ -0,0 +1,36 @@ +# Django & REST Framework +Django>=5.0,<6.0 +djangorestframework>=3.14.0 +django-cors-headers>=4.3.0 + +# Database +psycopg2-binary>=2.9.9 + +# Async Task Queue +celery[redis]>=5.3.0 +redis>=5.0.0 + +# HTTP Client +httpx>=0.26.0 + +# Browser Automation +playwright>=1.40.0 + +# Environment & Config +python-dotenv>=1.0.0 + +# Production Server +gunicorn>=21.2.0 +whitenoise>=6.6.0 + +# Validation & Utilities +validators>=0.22.0 + +# Development & Testing +pytest>=7.4.0 +pytest-django>=4.7.0 +pytest-asyncio>=0.23.0 +pytest-cov>=4.1.0 +black>=23.12.0 +isort>=5.13.0 +flake8>=7.0.0 diff --git a/backend/scanner/__init__.py b/backend/scanner/__init__.py new file mode 100644 index 0000000..b004128 --- /dev/null +++ b/backend/scanner/__init__.py @@ -0,0 +1,5 @@ +""" +Scanner app initialization. +""" + +default_app_config = 'scanner.apps.ScannerConfig' diff --git a/backend/scanner/apps.py b/backend/scanner/apps.py new file mode 100644 index 0000000..2c162fd --- /dev/null +++ b/backend/scanner/apps.py @@ -0,0 +1,11 @@ +""" +Scanner app configuration. 
+""" + +from django.apps import AppConfig + + +class ScannerConfig(AppConfig): + default_auto_field = 'django.db.models.BigAutoField' + name = 'scanner' + verbose_name = 'Scanner Tools' diff --git a/backend/scanner/scanners/__init__.py b/backend/scanner/scanners/__init__.py new file mode 100644 index 0000000..cb55500 --- /dev/null +++ b/backend/scanner/scanners/__init__.py @@ -0,0 +1,25 @@ +""" +Scanner modules initialization. + +This package contains the various scanner implementations +that analyze websites for performance, security, and best practices. +""" + +from .base import BaseScanner, ScannerResult +from .lighthouse import LighthouseScanner +from .playwright_scanner import PlaywrightScanner +from .zap import ZAPScanner +from .headers import HeaderScanner +from .tls import TLSScanner +from .runner import ScanRunner + +__all__ = [ + 'BaseScanner', + 'ScannerResult', + 'LighthouseScanner', + 'PlaywrightScanner', + 'ZAPScanner', + 'HeaderScanner', + 'TLSScanner', + 'ScanRunner', +] diff --git a/backend/scanner/scanners/base.py b/backend/scanner/scanners/base.py new file mode 100644 index 0000000..ff4cb28 --- /dev/null +++ b/backend/scanner/scanners/base.py @@ -0,0 +1,161 @@ +""" +Base scanner interface and result structures. + +All scanner implementations should inherit from BaseScanner +and return ScannerResult objects. +""" + +import logging +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional +from enum import Enum + +logger = logging.getLogger(__name__) + + +class ScannerStatus(str, Enum): + """Status of a scanner execution.""" + SUCCESS = "success" + PARTIAL = "partial" + FAILED = "failed" + SKIPPED = "skipped" + + +@dataclass +class IssueData: + """ + Represents a single issue found by a scanner. + + Attributes: + category: Issue category (security, performance, etc.) + severity: Issue severity (critical, high, medium, low, info) + title: Brief title of the issue + description: Detailed description + tool: The scanner that found this issue + affected_url: Specific URL affected (optional) + remediation: Suggested fix (optional) + raw_data: Original scanner data (optional) + """ + category: str + severity: str + title: str + description: str + tool: str + affected_url: Optional[str] = None + remediation: Optional[str] = None + raw_data: Optional[Dict[str, Any]] = None + + +@dataclass +class MetricData: + """ + Represents a single metric measured by a scanner. + + Attributes: + name: Internal name (e.g., 'first_contentful_paint_ms') + display_name: Human-readable name + value: Numeric value + unit: Unit of measurement + source: The scanner that measured this + score: Normalized score (0-1) if available + """ + name: str + display_name: str + value: float + unit: str + source: str + score: Optional[float] = None + + +@dataclass +class ScannerResult: + """ + Result of a scanner execution. 
+ + Attributes: + scanner_name: Name of the scanner + status: Execution status + issues: List of issues found + metrics: List of metrics measured + scores: Dictionary of category scores + raw_data: Original scanner output + error_message: Error details if failed + """ + scanner_name: str + status: ScannerStatus + issues: List[IssueData] = field(default_factory=list) + metrics: List[MetricData] = field(default_factory=list) + scores: Dict[str, int] = field(default_factory=dict) + raw_data: Optional[Dict[str, Any]] = None + error_message: Optional[str] = None + + +class BaseScanner(ABC): + """ + Abstract base class for all scanners. + + Each scanner implementation must implement the `run` method + which performs the actual scan and returns a ScannerResult. + """ + + name: str = "base" + + def __init__(self, config: Optional[Dict[str, Any]] = None): + """ + Initialize the scanner with optional configuration. + + Args: + config: Scanner-specific configuration dictionary + """ + self.config = config or {} + self.logger = logging.getLogger(f"scanner.{self.name}") + + @abstractmethod + def run(self, url: str) -> ScannerResult: + """ + Run the scanner against the given URL. + + Args: + url: The URL to scan + + Returns: + ScannerResult with findings, metrics, and status + """ + pass + + def is_available(self) -> bool: + """ + Check if the scanner service/tool is available. + + Returns: + True if the scanner can be used, False otherwise + """ + return True + + def _create_error_result(self, error: Exception) -> ScannerResult: + """ + Create a failed result from an exception. + + Args: + error: The exception that occurred + + Returns: + ScannerResult with failed status + """ + self.logger.error(f"Scanner {self.name} failed: {error}") + return ScannerResult( + scanner_name=self.name, + status=ScannerStatus.FAILED, + error_message=str(error), + issues=[ + IssueData( + category="scanner", + severity="info", + title=f"{self.name.title()} scan failed", + description=f"The {self.name} scanner encountered an error: {error}", + tool=self.name, + remediation="Check scanner service configuration and availability." + ) + ] + ) diff --git a/backend/scanner/scanners/headers.py b/backend/scanner/scanners/headers.py new file mode 100644 index 0000000..1501b73 --- /dev/null +++ b/backend/scanner/scanners/headers.py @@ -0,0 +1,405 @@ +""" +HTTP Header Security Scanner. + +This module analyzes HTTP response headers for security +best practices and common misconfigurations. +""" + +import logging +from typing import Any, Dict, List, Optional, Tuple + +import httpx + +from .base import ( + BaseScanner, + ScannerResult, + ScannerStatus, + IssueData, + MetricData, +) + +logger = logging.getLogger(__name__) + + +# Security header definitions with expected values and severity +SECURITY_HEADERS = { + 'Strict-Transport-Security': { + 'severity': 'high', + 'description': 'HTTP Strict Transport Security (HSTS) forces browsers to use HTTPS.', + 'remediation': ( + 'Add the header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload' + ), + 'check_value': lambda v: 'max-age' in v.lower() and int( + v.lower().split('max-age=')[1].split(';')[0].strip() + ) >= 31536000 if 'max-age=' in v.lower() else False, + }, + 'Content-Security-Policy': { + 'severity': 'high', + 'description': 'Content Security Policy (CSP) helps prevent XSS and data injection attacks.', + 'remediation': ( + "Implement a Content-Security-Policy header that restricts sources for scripts, " + "styles, and other resources. 
Start with a report-only policy to identify issues." + ), + 'check_value': lambda v: "default-src" in v.lower() or "script-src" in v.lower(), + }, + 'X-Content-Type-Options': { + 'severity': 'medium', + 'description': 'Prevents browsers from MIME-sniffing responses.', + 'remediation': 'Add the header: X-Content-Type-Options: nosniff', + 'check_value': lambda v: v.lower() == 'nosniff', + }, + 'X-Frame-Options': { + 'severity': 'medium', + 'description': 'Protects against clickjacking by controlling page framing.', + 'remediation': 'Add the header: X-Frame-Options: DENY or SAMEORIGIN', + 'check_value': lambda v: v.upper() in ['DENY', 'SAMEORIGIN'], + }, + 'Referrer-Policy': { + 'severity': 'low', + 'description': 'Controls how much referrer information is sent with requests.', + 'remediation': ( + 'Add the header: Referrer-Policy: strict-origin-when-cross-origin ' + 'or no-referrer-when-downgrade' + ), + 'check_value': lambda v: v.lower() in [ + 'no-referrer', 'no-referrer-when-downgrade', + 'strict-origin', 'strict-origin-when-cross-origin', + 'same-origin', 'origin', 'origin-when-cross-origin' + ], + }, + 'Permissions-Policy': { + 'severity': 'low', + 'description': 'Controls which browser features can be used.', + 'remediation': ( + 'Add a Permissions-Policy header to restrict access to sensitive browser APIs ' + 'like geolocation, camera, and microphone.' + ), + 'check_value': lambda v: len(v) > 0, + }, + 'X-XSS-Protection': { + 'severity': 'info', + 'description': 'Legacy XSS filter (deprecated in modern browsers, CSP is preferred).', + 'remediation': 'While deprecated, you can add: X-XSS-Protection: 1; mode=block', + 'check_value': lambda v: '1' in v, + }, +} + +# CORS security checks +CORS_CHECKS = { + 'permissive_origin': { + 'severity': 'high', + 'title': 'Overly permissive CORS (Access-Control-Allow-Origin: *)', + 'description': ( + 'The server allows requests from any origin. This can expose sensitive data ' + 'to malicious websites if combined with credentials.' + ), + 'remediation': ( + 'Restrict Access-Control-Allow-Origin to specific trusted domains instead of using *. ' + 'Never use * with Access-Control-Allow-Credentials: true.' + ), + }, + 'credentials_with_wildcard': { + 'severity': 'critical', + 'title': 'CORS allows credentials with wildcard origin', + 'description': ( + 'The server has Access-Control-Allow-Credentials: true with Access-Control-Allow-Origin: *. ' + 'This is a severe misconfiguration that can allow credential theft.' + ), + 'remediation': ( + 'Never combine Access-Control-Allow-Credentials: true with a wildcard origin. ' + 'Implement a whitelist of allowed origins.' + ), + }, +} + + +class HeaderScanner(BaseScanner): + """ + Scanner for HTTP security headers. + + Checks for: + - Missing security headers + - Improperly configured headers + - CORS misconfigurations + - Cookie security flags + """ + + name = "header_check" + + def __init__(self, config: Optional[Dict[str, Any]] = None): + super().__init__(config) + self.timeout = self.config.get('timeout', 30) + + def run(self, url: str) -> ScannerResult: + """ + Run header security analysis on the URL. 
+ + Args: + url: The URL to analyze + + Returns: + ScannerResult with header findings + """ + self.logger.info(f"Starting header scan for {url}") + + try: + # Make both GET and HEAD requests + headers_data = self._fetch_headers(url) + + issues = [] + metrics = [] + + # Check security headers + header_issues, header_score = self._check_security_headers( + headers_data['headers'] + ) + issues.extend(header_issues) + + # Check CORS configuration + cors_issues = self._check_cors(headers_data['headers'], url) + issues.extend(cors_issues) + + # Check cookies + cookie_issues = self._check_cookies(headers_data['headers'], url) + issues.extend(cookie_issues) + + # Create metrics + metrics.append(MetricData( + name='security_headers_score', + display_name='Security Headers Score', + value=float(header_score), + unit='percent', + source='header_check' + )) + + metrics.append(MetricData( + name='headers_missing_count', + display_name='Missing Security Headers', + value=float(len([i for i in header_issues if 'missing' in i.title.lower()])), + unit='count', + source='header_check' + )) + + self.logger.info( + f"Header scan complete: {len(issues)} issues, score: {header_score}" + ) + + return ScannerResult( + scanner_name=self.name, + status=ScannerStatus.SUCCESS, + issues=issues, + metrics=metrics, + raw_data=headers_data + ) + + except httpx.TimeoutException: + return self._create_error_result(Exception("Header check timed out")) + except Exception as e: + return self._create_error_result(e) + + def _fetch_headers(self, url: str) -> Dict[str, Any]: + """Fetch headers from the URL.""" + with httpx.Client( + timeout=self.timeout, + follow_redirects=True, + verify=True + ) as client: + # GET request + get_response = client.get(url) + + # HEAD request + head_response = client.head(url) + + return { + 'url': str(get_response.url), + 'status_code': get_response.status_code, + 'headers': dict(get_response.headers), + 'head_headers': dict(head_response.headers), + 'redirected': str(get_response.url) != url, + 'redirect_history': [str(r.url) for r in get_response.history], + } + + def _check_security_headers( + self, + headers: Dict[str, str] + ) -> Tuple[List[IssueData], int]: + """ + Check for security headers. 
+ + Returns: + Tuple of (list of issues, security score 0-100) + """ + issues = [] + score = 100 + headers_lower = {k.lower(): v for k, v in headers.items()} + + for header_name, config in SECURITY_HEADERS.items(): + header_key = header_name.lower() + + if header_key not in headers_lower: + # Missing header + severity = config['severity'] + deduction = {'critical': 20, 'high': 15, 'medium': 10, 'low': 5, 'info': 2} + score -= deduction.get(severity, 5) + + issues.append(IssueData( + category='headers', + severity=severity, + title=f'Missing security header: {header_name}', + description=config['description'], + tool='header_check', + remediation=config['remediation'], + raw_data={'header': header_name, 'status': 'missing'} + )) + else: + # Header present, check value + value = headers_lower[header_key] + check_func = config.get('check_value') + + if check_func and not check_func(value): + issues.append(IssueData( + category='headers', + severity='low', + title=f'Weak configuration: {header_name}', + description=( + f"{config['description']} " + f"Current value may not provide optimal protection: {value}" + ), + tool='header_check', + remediation=config['remediation'], + raw_data={'header': header_name, 'value': value, 'status': 'weak'} + )) + score -= 3 + + return issues, max(0, score) + + def _check_cors(self, headers: Dict[str, str], url: str) -> List[IssueData]: + """Check CORS configuration for issues.""" + issues = [] + headers_lower = {k.lower(): v for k, v in headers.items()} + + acao = headers_lower.get('access-control-allow-origin', '') + acac = headers_lower.get('access-control-allow-credentials', '') + + if acao == '*': + if acac.lower() == 'true': + # Critical: credentials with wildcard + check = CORS_CHECKS['credentials_with_wildcard'] + issues.append(IssueData( + category='cors', + severity=check['severity'], + title=check['title'], + description=check['description'], + tool='header_check', + affected_url=url, + remediation=check['remediation'], + raw_data={ + 'Access-Control-Allow-Origin': acao, + 'Access-Control-Allow-Credentials': acac + } + )) + else: + # Warning: permissive origin + check = CORS_CHECKS['permissive_origin'] + issues.append(IssueData( + category='cors', + severity='medium', # Lower severity without credentials + title=check['title'], + description=check['description'], + tool='header_check', + affected_url=url, + remediation=check['remediation'], + raw_data={'Access-Control-Allow-Origin': acao} + )) + + return issues + + def _check_cookies(self, headers: Dict[str, str], url: str) -> List[IssueData]: + """Check Set-Cookie headers for security flags.""" + issues = [] + headers_lower = {k.lower(): v for k, v in headers.items()} + + # Get all Set-Cookie headers + set_cookies = [] + for key, value in headers.items(): + if key.lower() == 'set-cookie': + set_cookies.append(value) + + is_https = url.startswith('https://') + + for cookie in set_cookies: + cookie_lower = cookie.lower() + cookie_name = cookie.split('=')[0] if '=' in cookie else 'unknown' + + cookie_issues = [] + + # Check Secure flag on HTTPS + if is_https and 'secure' not in cookie_lower: + cookie_issues.append({ + 'flag': 'Secure', + 'description': ( + 'Cookie is set without Secure flag on HTTPS site. ' + 'This allows the cookie to be sent over unencrypted connections.' 
+ ), + 'severity': 'high' + }) + + # Check HttpOnly flag (important for session cookies) + if 'httponly' not in cookie_lower: + # Check if it might be a session cookie + if any(term in cookie_name.lower() for term in ['session', 'auth', 'token', 'user']): + cookie_issues.append({ + 'flag': 'HttpOnly', + 'description': ( + 'Session-like cookie is set without HttpOnly flag. ' + 'This allows JavaScript access, increasing XSS risk.' + ), + 'severity': 'high' + }) + else: + cookie_issues.append({ + 'flag': 'HttpOnly', + 'description': ( + 'Cookie is set without HttpOnly flag. ' + 'Consider adding it unless JavaScript needs access.' + ), + 'severity': 'low' + }) + + # Check SameSite attribute + if 'samesite' not in cookie_lower: + cookie_issues.append({ + 'flag': 'SameSite', + 'description': ( + 'Cookie is set without SameSite attribute. ' + 'This can enable CSRF attacks in some scenarios.' + ), + 'severity': 'medium' + }) + elif 'samesite=none' in cookie_lower and 'secure' not in cookie_lower: + cookie_issues.append({ + 'flag': 'SameSite=None without Secure', + 'description': ( + 'Cookie has SameSite=None but no Secure flag. ' + 'Modern browsers will reject this cookie.' + ), + 'severity': 'medium' + }) + + # Create issues for this cookie + for ci in cookie_issues: + issues.append(IssueData( + category='security', + severity=ci['severity'], + title=f"Cookie '{cookie_name}' missing {ci['flag']} flag", + description=ci['description'], + tool='header_check', + affected_url=url, + remediation=( + f"Add the {ci['flag']} flag to the Set-Cookie header. " + f"Example: Set-Cookie: {cookie_name}=value; Secure; HttpOnly; SameSite=Strict" + ), + raw_data={'cookie': cookie[:200]} # Truncate for storage + )) + + return issues diff --git a/backend/scanner/scanners/lighthouse.py b/backend/scanner/scanners/lighthouse.py new file mode 100644 index 0000000..7544c0f --- /dev/null +++ b/backend/scanner/scanners/lighthouse.py @@ -0,0 +1,323 @@ +""" +Lighthouse Scanner Integration. + +This module integrates with Google Lighthouse to measure +performance, accessibility, SEO, and best practices. +""" + +import logging +from typing import Any, Dict, Optional + +import httpx + +from django.conf import settings + +from .base import ( + BaseScanner, + ScannerResult, + ScannerStatus, + IssueData, + MetricData, +) + +logger = logging.getLogger(__name__) + + +class LighthouseScanner(BaseScanner): + """ + Scanner that uses Google Lighthouse for performance analysis. + + Communicates with the Lighthouse service container via HTTP API. + Collects performance metrics, Core Web Vitals, and various audits. + """ + + name = "lighthouse" + + def __init__(self, config: Optional[Dict[str, Any]] = None): + super().__init__(config) + self.service_url = self.config.get( + 'service_url', + 'http://lighthouse:3001' + ) + self.timeout = self.config.get('timeout', 120) + + def is_available(self) -> bool: + """Check if Lighthouse service is available.""" + try: + with httpx.Client(timeout=5) as client: + response = client.get(f"{self.service_url}/health") + return response.status_code == 200 + except Exception as e: + self.logger.warning(f"Lighthouse service not available: {e}") + return False + + def run(self, url: str) -> ScannerResult: + """ + Run Lighthouse scan against the URL. 
+ + Args: + url: The URL to analyze + + Returns: + ScannerResult with performance metrics and issues + """ + self.logger.info(f"Starting Lighthouse scan for {url}") + + try: + with httpx.Client(timeout=self.timeout) as client: + response = client.post( + f"{self.service_url}/scan", + json={"url": url} + ) + response.raise_for_status() + data = response.json() + + return self._parse_results(url, data) + + except httpx.TimeoutException: + return self._create_error_result( + Exception("Lighthouse scan timed out") + ) + except httpx.HTTPStatusError as e: + return self._create_error_result( + Exception(f"Lighthouse service error: {e.response.status_code}") + ) + except Exception as e: + return self._create_error_result(e) + + def _parse_results(self, url: str, data: Dict[str, Any]) -> ScannerResult: + """ + Parse Lighthouse results into ScannerResult format. + + Args: + url: The scanned URL + data: Raw Lighthouse response data + + Returns: + Parsed ScannerResult + """ + issues = [] + metrics = [] + + # Extract scores + scores = data.get('scores', {}) + + # Extract and create metrics + raw_metrics = data.get('metrics', {}) + + # Core Web Vitals + metric_mappings = [ + ('firstContentfulPaint', 'First Contentful Paint', 'ms'), + ('largestContentfulPaint', 'Largest Contentful Paint', 'ms'), + ('speedIndex', 'Speed Index', 'ms'), + ('timeToInteractive', 'Time to Interactive', 'ms'), + ('totalBlockingTime', 'Total Blocking Time', 'ms'), + ('cumulativeLayoutShift', 'Cumulative Layout Shift', 'score'), + ] + + for key, display_name, unit in metric_mappings: + metric_data = raw_metrics.get(key, {}) + if metric_data and metric_data.get('value') is not None: + metrics.append(MetricData( + name=self._to_snake_case(key), + display_name=display_name, + value=metric_data['value'], + unit=unit, + source='lighthouse', + score=metric_data.get('score') + )) + + # Resource metrics + resources = data.get('resources', {}) + diagnostics = data.get('diagnostics', {}) + + if resources.get('totalByteWeight'): + metrics.append(MetricData( + name='total_byte_weight', + display_name='Total Page Weight', + value=resources['totalByteWeight'], + unit='bytes', + source='lighthouse' + )) + + if resources.get('bootupTime'): + metrics.append(MetricData( + name='javascript_bootup_time', + display_name='JavaScript Boot-up Time', + value=resources['bootupTime'], + unit='ms', + source='lighthouse' + )) + + if diagnostics.get('numRequests'): + metrics.append(MetricData( + name='total_requests', + display_name='Total Network Requests', + value=float(diagnostics['numRequests']), + unit='count', + source='lighthouse' + )) + + # Extract issues from failed audits + raw_issues = data.get('issues', []) + for issue in raw_issues: + severity = self._score_to_severity(issue.get('score', 0.5)) + category = self._map_category(issue.get('category', 'performance')) + + issues.append(IssueData( + category=category, + severity=severity, + title=issue.get('title', 'Unknown issue'), + description=issue.get('description', ''), + tool='lighthouse', + affected_url=url, + remediation=self._get_remediation(issue.get('id')), + raw_data=issue + )) + + # Check for large bundles + large_scripts = resources.get('scriptTreemap', []) + for script in large_scripts[:5]: # Top 5 largest + if script.get('resourceBytes', 0) > settings.SCANNER_CONFIG.get( + 'LARGE_JS_BUNDLE_THRESHOLD_BYTES', 500 * 1024 + ): + issues.append(IssueData( + category='resources', + severity='medium', + title=f"Large JavaScript bundle detected", + description=( + f"The script 
'{script.get('name', 'Unknown')}' " + f"is {script['resourceBytes'] / 1024:.1f} KB. " + "Large bundles can slow down page load and increase memory usage." + ), + tool='lighthouse', + affected_url=url, + remediation=( + "Consider code splitting, tree shaking, or lazy loading " + "to reduce bundle size." + ), + raw_data=script + )) + + # Check for unused JavaScript + unused_js = resources.get('unusedJavascript', []) + if unused_js: + total_wasted = sum(u.get('wastedBytes', 0) for u in unused_js) + if total_wasted > 100 * 1024: # More than 100KB unused + issues.append(IssueData( + category='performance', + severity='medium', + title="Significant unused JavaScript detected", + description=( + f"Found {total_wasted / 1024:.1f} KB of unused JavaScript " + f"across {len(unused_js)} resources. This increases page " + "load time and memory usage." + ), + tool='lighthouse', + affected_url=url, + remediation=( + "Remove unused code or use code splitting to load " + "JavaScript only when needed." + ), + raw_data={'unused_resources': unused_js} + )) + + # Check for render-blocking resources + blocking = resources.get('renderBlockingResources', []) + if blocking: + total_wasted_ms = sum(r.get('wastedMs', 0) for r in blocking) + if total_wasted_ms > 500: + issues.append(IssueData( + category='performance', + severity='medium', + title="Render-blocking resources detected", + description=( + f"Found {len(blocking)} render-blocking resources " + f"adding approximately {total_wasted_ms:.0f}ms to page load. " + "These resources delay first paint." + ), + tool='lighthouse', + affected_url=url, + remediation=( + "Consider inlining critical CSS, deferring non-critical JS, " + "or using async/defer attributes." + ), + raw_data={'blocking_resources': blocking} + )) + + self.logger.info( + f"Lighthouse scan complete: {len(issues)} issues, {len(metrics)} metrics" + ) + + return ScannerResult( + scanner_name=self.name, + status=ScannerStatus.SUCCESS, + issues=issues, + metrics=metrics, + scores={ + 'performance': scores.get('performance', 0), + 'accessibility': scores.get('accessibility', 0), + 'best_practices': scores.get('bestPractices', 0), + 'seo': scores.get('seo', 0), + }, + raw_data=data + ) + + def _to_snake_case(self, name: str) -> str: + """Convert camelCase to snake_case.""" + import re + s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) + return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower() + + def _score_to_severity(self, score: float) -> str: + """Convert Lighthouse score to severity level.""" + if score is None: + return 'info' + elif score < 0.25: + return 'high' + elif score < 0.5: + return 'medium' + elif score < 0.75: + return 'low' + else: + return 'info' + + def _map_category(self, lighthouse_category: str) -> str: + """Map Lighthouse category to our category.""" + mapping = { + 'performance': 'performance', + 'accessibility': 'accessibility', + 'best-practices': 'best_practices', + 'seo': 'seo', + } + return mapping.get(lighthouse_category, 'performance') + + def _get_remediation(self, audit_id: str) -> str: + """Get remediation text for known audit IDs.""" + remediations = { + 'first-contentful-paint': ( + "Reduce server response time, eliminate render-blocking resources, " + "and optimize critical rendering path." + ), + 'largest-contentful-paint': ( + "Optimize images, preload critical resources, and reduce server " + "response time." + ), + 'total-blocking-time': ( + "Reduce JavaScript execution time by breaking up long tasks, " + "removing unused code, and minimizing main thread work." 
+ ), + 'cumulative-layout-shift': ( + "Always include size attributes on images and videos, reserve space " + "for ad slots, and avoid inserting content above existing content." + ), + 'speed-index': ( + "Minimize main thread work, reduce JavaScript execution time, " + "and ensure text remains visible during webfont load." + ), + 'interactive': ( + "Reduce JavaScript payload, defer non-critical scripts, and " + "minimize main thread work." + ), + } + return remediations.get(audit_id, "Review and optimize based on the audit details.") diff --git a/backend/scanner/scanners/playwright_scanner.py b/backend/scanner/scanners/playwright_scanner.py new file mode 100644 index 0000000..c1ec4cd --- /dev/null +++ b/backend/scanner/scanners/playwright_scanner.py @@ -0,0 +1,397 @@ +""" +Playwright Scanner Integration. + +This module uses Playwright to perform browser-based analysis, +capturing console errors, network requests, and resource metrics. +""" + +import asyncio +import logging +import time +from typing import Any, Dict, List, Optional + +from django.conf import settings + +from .base import ( + BaseScanner, + ScannerResult, + ScannerStatus, + IssueData, + MetricData, +) + +logger = logging.getLogger(__name__) + + +class PlaywrightScanner(BaseScanner): + """ + Scanner using Playwright for browser-based analysis. + + Captures: + - Console errors and warnings + - Network request details + - Page load timing + - Large resources (images, scripts) + - Memory usage indicators + """ + + name = "playwright" + + def __init__(self, config: Optional[Dict[str, Any]] = None): + super().__init__(config) + self.timeout = self.config.get('timeout', 30000) # 30 seconds + self.viewport = self.config.get('viewport', {'width': 1920, 'height': 1080}) + + def run(self, url: str) -> ScannerResult: + """ + Run Playwright analysis on the URL. + + Args: + url: The URL to analyze + + Returns: + ScannerResult with browser analysis data + """ + self.logger.info(f"Starting Playwright scan for {url}") + + try: + # Run async scan in sync context + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + try: + result = loop.run_until_complete(self._async_scan(url)) + finally: + loop.close() + + return result + + except Exception as e: + return self._create_error_result(e) + + async def _async_scan(self, url: str) -> ScannerResult: + """ + Async implementation of the scan. 
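+
+        This coroutine is driven by the synchronous run() wrapper above,
+        roughly following this pattern (simplified sketch):
+
+            loop = asyncio.new_event_loop()
+            try:
+                result = loop.run_until_complete(self._async_scan(url))
+            finally:
+                loop.close()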
+ + Args: + url: The URL to analyze + + Returns: + ScannerResult with findings + """ + from playwright.async_api import async_playwright + + issues = [] + metrics = [] + raw_data = { + 'console_messages': [], + 'network_requests': [], + 'failed_requests': [], + 'large_resources': [], + } + + async with async_playwright() as p: + browser = await p.chromium.launch( + headless=True, + args=[ + '--no-sandbox', + '--disable-setuid-sandbox', + '--disable-dev-shm-usage', + '--disable-gpu', + ] + ) + + context = await browser.new_context( + viewport=self.viewport, + user_agent=( + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' + '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36' + ) + ) + + page = await context.new_page() + + # Collect data + console_messages = [] + network_requests = [] + failed_requests = [] + + # Set up event listeners + page.on("console", lambda msg: console_messages.append({ + 'type': msg.type, + 'text': msg.text, + 'location': str(msg.location) if msg.location else None, + })) + + page.on("request", lambda req: network_requests.append({ + 'url': req.url, + 'method': req.method, + 'resource_type': req.resource_type, + 'timestamp': time.time(), + })) + + page.on("requestfailed", lambda req: failed_requests.append({ + 'url': req.url, + 'failure': req.failure, + 'resource_type': req.resource_type, + })) + + # Navigate and measure + start_time = time.time() + + try: + response = await page.goto( + url, + wait_until='networkidle', + timeout=self.timeout + ) + load_time = (time.time() - start_time) * 1000 # Convert to ms + + # Get response status + status_code = response.status if response else 0 + + # Wait a bit more for any delayed scripts + await page.wait_for_timeout(2000) + + # Get performance timing + perf_timing = await page.evaluate('''() => { + const timing = performance.timing; + const navigation = performance.getEntriesByType("navigation")[0]; + return { + domContentLoaded: timing.domContentLoadedEventEnd - timing.navigationStart, + domComplete: timing.domComplete - timing.navigationStart, + loadEvent: timing.loadEventEnd - timing.navigationStart, + firstPaint: navigation ? navigation.domComplete : null, + transferSize: navigation ? 
navigation.transferSize : null, + }; + }''') + + # Get memory info (if available) + memory_info = await page.evaluate('''() => { + if (performance.memory) { + return { + usedJSHeapSize: performance.memory.usedJSHeapSize, + totalJSHeapSize: performance.memory.totalJSHeapSize, + jsHeapSizeLimit: performance.memory.jsHeapSizeLimit, + }; + } + return null; + }''') + + # Get resource sizes + resources = await page.evaluate('''() => { + const entries = performance.getEntriesByType("resource"); + return entries.map(e => ({ + name: e.name, + type: e.initiatorType, + transferSize: e.transferSize, + duration: e.duration, + })); + }''') + + except Exception as e: + self.logger.warning(f"Page navigation error: {e}") + load_time = self.timeout + status_code = 0 + perf_timing = {} + memory_info = None + resources = [] + + await browser.close() + + # Process collected data + raw_data['console_messages'] = console_messages + raw_data['network_requests'] = network_requests[:100] # Limit stored + raw_data['failed_requests'] = failed_requests + raw_data['performance_timing'] = perf_timing if 'perf_timing' in locals() else {} + raw_data['memory_info'] = memory_info if 'memory_info' in locals() else None + raw_data['status_code'] = status_code if 'status_code' in locals() else 0 + + # Create metrics + metrics.append(MetricData( + name='page_load_time', + display_name='Page Load Time', + value=load_time, + unit='ms', + source='playwright' + )) + + metrics.append(MetricData( + name='total_network_requests', + display_name='Total Network Requests', + value=float(len(network_requests)), + unit='count', + source='playwright' + )) + + # Calculate total transfer size + total_transfer = sum(r.get('transferSize', 0) for r in resources if r.get('transferSize')) + if total_transfer > 0: + metrics.append(MetricData( + name='total_transfer_size', + display_name='Total Transfer Size', + value=float(total_transfer), + unit='bytes', + source='playwright' + )) + + if perf_timing.get('domContentLoaded'): + metrics.append(MetricData( + name='dom_content_loaded', + display_name='DOM Content Loaded', + value=float(perf_timing['domContentLoaded']), + unit='ms', + source='playwright' + )) + + # Memory metrics + if memory_info: + metrics.append(MetricData( + name='js_heap_used', + display_name='JS Heap Used', + value=float(memory_info.get('usedJSHeapSize', 0)), + unit='bytes', + source='playwright' + )) + + # Check for high memory usage + heap_used = memory_info.get('usedJSHeapSize', 0) + heap_limit = memory_info.get('jsHeapSizeLimit', 1) + heap_percent = (heap_used / heap_limit) * 100 if heap_limit > 0 else 0 + + if heap_percent > 50: + issues.append(IssueData( + category='resources', + severity='medium', + title='High JavaScript memory usage', + description=( + f'JavaScript is using {heap_used / (1024*1024):.1f} MB ' + f'({heap_percent:.1f}% of available heap). ' + 'This may indicate memory-heavy operations or potential leaks.' + ), + tool='playwright', + affected_url=url, + remediation=( + 'Review JavaScript for memory leaks, optimize data structures, ' + 'and ensure proper cleanup of event listeners and timers.' 
+ ), + raw_data=memory_info + )) + + # Analyze console messages for errors + errors = [m for m in console_messages if m['type'] == 'error'] + warnings = [m for m in console_messages if m['type'] == 'warning'] + + metrics.append(MetricData( + name='console_errors_count', + display_name='Console Errors', + value=float(len(errors)), + unit='count', + source='playwright' + )) + + metrics.append(MetricData( + name='console_warnings_count', + display_name='Console Warnings', + value=float(len(warnings)), + unit='count', + source='playwright' + )) + + # Create issues for console errors + if errors: + # Group similar errors + error_texts = set(e['text'][:200] for e in errors) + for error_text in list(error_texts)[:10]: # Limit to 10 unique errors + issues.append(IssueData( + category='content', + severity='medium', + title='JavaScript console error', + description=f'JavaScript error logged to console: {error_text}', + tool='playwright', + affected_url=url, + remediation='Review and fix the JavaScript error in your code.', + raw_data={'error': error_text} + )) + + # Check for failed network requests + if failed_requests: + for req in failed_requests[:5]: # Limit reported + issues.append(IssueData( + category='content', + severity='low', + title='Failed network request', + description=( + f"Request to {req['url'][:100]} failed: {req.get('failure', 'Unknown error')}" + ), + tool='playwright', + affected_url=req['url'], + remediation='Ensure the resource is available and CORS is configured correctly.', + raw_data=req + )) + + # Find large resources + large_threshold = settings.SCANNER_CONFIG.get('LARGE_IMAGE_THRESHOLD_BYTES', 1024 * 1024) + large_resources = [ + r for r in resources + if r.get('transferSize', 0) > large_threshold + ] + + for resource in large_resources[:5]: # Limit reported + size_mb = resource['transferSize'] / (1024 * 1024) + issues.append(IssueData( + category='resources', + severity='medium' if size_mb > 2 else 'low', + title=f"Large resource detected ({size_mb:.1f} MB)", + description=( + f"The resource '{resource['name'][-80:]}' is {size_mb:.2f} MB. " + "Large resources increase page load time and bandwidth usage." + ), + tool='playwright', + affected_url=resource['name'], + remediation=( + 'Optimize images using compression, use appropriate formats (WebP, AVIF), ' + 'implement lazy loading, or consider a CDN.' + ), + raw_data=resource + )) + + raw_data['large_resources'] = large_resources + + # Count resources by type + resource_counts = {} + for req in network_requests: + rtype = req.get('resource_type', 'other') + resource_counts[rtype] = resource_counts.get(rtype, 0) + 1 + + raw_data['resource_counts'] = resource_counts + + # Check for excessive requests + if len(network_requests) > 100: + issues.append(IssueData( + category='performance', + severity='medium', + title='High number of network requests', + description=( + f'Page made {len(network_requests)} network requests. ' + 'Excessive requests increase page load time and server load.' + ), + tool='playwright', + affected_url=url, + remediation=( + 'Consolidate resources, use HTTP/2 multiplexing, implement ' + 'resource bundling, and lazy load non-critical resources.' 
+ ), + raw_data=resource_counts + )) + + self.logger.info( + f"Playwright scan complete: {len(issues)} issues, {len(metrics)} metrics" + ) + + return ScannerResult( + scanner_name=self.name, + status=ScannerStatus.SUCCESS, + issues=issues, + metrics=metrics, + raw_data=raw_data + ) diff --git a/backend/scanner/scanners/runner.py b/backend/scanner/scanners/runner.py new file mode 100644 index 0000000..9b9b029 --- /dev/null +++ b/backend/scanner/scanners/runner.py @@ -0,0 +1,314 @@ +""" +Scan Runner - Orchestrates multiple scanners. + +This module coordinates running all enabled scanners against a URL +and aggregates their results into a unified report. +""" + +import logging +from concurrent.futures import ThreadPoolExecutor, as_completed +from typing import Any, Dict, List, Optional, Type + +from django.conf import settings + +from .base import BaseScanner, ScannerResult, ScannerStatus +from .lighthouse import LighthouseScanner +from .playwright_scanner import PlaywrightScanner +from .zap import ZAPScanner +from .headers import HeaderScanner +from .tls import TLSScanner + +logger = logging.getLogger(__name__) + + +# Default scanner classes to run +DEFAULT_SCANNERS: List[Type[BaseScanner]] = [ + LighthouseScanner, + PlaywrightScanner, + ZAPScanner, + HeaderScanner, + TLSScanner, +] + + +class ScanRunner: + """ + Orchestrates running multiple scanners and aggregating results. + + This class manages: + - Running enabled scanners in parallel or sequence + - Aggregating results from all scanners + - Error handling and partial result compilation + - Timeout management + """ + + def __init__( + self, + scanner_classes: Optional[List[Type[BaseScanner]]] = None, + config: Optional[Dict[str, Any]] = None, + max_workers: int = 3 + ): + """ + Initialize the scan runner. + + Args: + scanner_classes: List of scanner classes to use (defaults to all) + config: Configuration dict passed to each scanner + max_workers: Maximum concurrent scanner threads + """ + self.scanner_classes = scanner_classes or DEFAULT_SCANNERS + self.config = config or {} + self.max_workers = max_workers + self.logger = logging.getLogger(__name__) + + def run(self, url: str, parallel: bool = True) -> Dict[str, Any]: + """ + Run all scanners against the URL. 
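+
+        A minimal usage sketch (URL and worker count are illustrative):
+
+            runner = ScanRunner(max_workers=3)
+            report = runner.run('https://example.com', parallel=True)
+            print(report['status'], report['summary']['total_issues'])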
+ + Args: + url: The URL to scan + parallel: Whether to run scanners in parallel + + Returns: + Aggregated results dictionary containing: + - status: Overall scan status + - scores: Aggregated scores + - issues: All issues from all scanners + - metrics: All metrics from all scanners + - scanner_results: Individual scanner results + - errors: Any scanner errors + """ + self.logger.info(f"Starting scan runner for {url} with {len(self.scanner_classes)} scanners") + + # Initialize scanners + scanners = self._initialize_scanners() + + # Run scanners + if parallel: + results = self._run_parallel(scanners, url) + else: + results = self._run_sequential(scanners, url) + + # Aggregate results + aggregated = self._aggregate_results(results) + + self.logger.info( + f"Scan complete: {len(aggregated['issues'])} issues, " + f"{len(aggregated['metrics'])} metrics, " + f"status: {aggregated['status']}" + ) + + return aggregated + + def _initialize_scanners(self) -> List[BaseScanner]: + """Initialize scanner instances.""" + scanners = [] + scanner_config = settings.SCANNER_CONFIG + + for scanner_class in self.scanner_classes: + try: + # Merge default config with scanner-specific config + config = {**self.config} + + # Add scanner-specific config + if scanner_class == LighthouseScanner: + config['service_url'] = 'http://lighthouse:3001' + config['timeout'] = scanner_config.get('LIGHTHOUSE_TIMEOUT', 60) + elif scanner_class == ZAPScanner: + config['zap_host'] = scanner_config.get('ZAP_HOST') + config['api_key'] = scanner_config.get('ZAP_API_KEY') + config['timeout'] = scanner_config.get('ZAP_TIMEOUT', 120) + elif scanner_class == PlaywrightScanner: + config['timeout'] = scanner_config.get('PLAYWRIGHT_TIMEOUT', 30000) + config['viewport'] = scanner_config.get('PLAYWRIGHT_VIEWPORT', {'width': 1920, 'height': 1080}) + + scanner = scanner_class(config=config) + scanners.append(scanner) + + except Exception as e: + self.logger.error(f"Failed to initialize {scanner_class.__name__}: {e}") + + return scanners + + def _run_parallel( + self, + scanners: List[BaseScanner], + url: str + ) -> Dict[str, ScannerResult]: + """Run scanners in parallel using thread pool.""" + results = {} + + with ThreadPoolExecutor(max_workers=self.max_workers) as executor: + # Submit all scanner tasks + future_to_scanner = { + executor.submit(self._run_scanner, scanner, url): scanner + for scanner in scanners + } + + # Collect results as they complete + for future in as_completed(future_to_scanner): + scanner = future_to_scanner[future] + try: + result = future.result() + results[scanner.name] = result + except Exception as e: + self.logger.error(f"Scanner {scanner.name} raised exception: {e}") + results[scanner.name] = ScannerResult( + scanner_name=scanner.name, + status=ScannerStatus.FAILED, + error_message=str(e) + ) + + return results + + def _run_sequential( + self, + scanners: List[BaseScanner], + url: str + ) -> Dict[str, ScannerResult]: + """Run scanners sequentially.""" + results = {} + + for scanner in scanners: + result = self._run_scanner(scanner, url) + results[scanner.name] = result + + return results + + def _run_scanner(self, scanner: BaseScanner, url: str) -> ScannerResult: + """Run a single scanner with error handling.""" + self.logger.info(f"Running scanner: {scanner.name}") + + try: + # Check availability first + if not scanner.is_available(): + self.logger.warning(f"Scanner {scanner.name} is not available") + return ScannerResult( + scanner_name=scanner.name, + status=ScannerStatus.SKIPPED, + 
error_message=f"{scanner.name} service is not available" + ) + + # Run the scanner + result = scanner.run(url) + self.logger.info( + f"Scanner {scanner.name} completed with status: {result.status}" + ) + return result + + except Exception as e: + self.logger.error(f"Scanner {scanner.name} failed: {e}") + return ScannerResult( + scanner_name=scanner.name, + status=ScannerStatus.FAILED, + error_message=str(e) + ) + + def _aggregate_results( + self, + results: Dict[str, ScannerResult] + ) -> Dict[str, Any]: + """Aggregate results from all scanners.""" + all_issues = [] + all_metrics = [] + all_scores = {} + raw_data = {} + errors = [] + + successful_scanners = 0 + failed_scanners = 0 + + for scanner_name, result in results.items(): + # Track scanner status + if result.status == ScannerStatus.SUCCESS: + successful_scanners += 1 + elif result.status == ScannerStatus.FAILED: + failed_scanners += 1 + if result.error_message: + errors.append({ + 'scanner': scanner_name, + 'error': result.error_message + }) + elif result.status == ScannerStatus.PARTIAL: + successful_scanners += 1 + + # Collect issues + for issue in result.issues: + all_issues.append({ + 'category': issue.category, + 'severity': issue.severity, + 'title': issue.title, + 'description': issue.description, + 'tool': issue.tool, + 'affected_url': issue.affected_url, + 'remediation': issue.remediation, + 'raw_data': issue.raw_data, + }) + + # Collect metrics + for metric in result.metrics: + all_metrics.append({ + 'name': metric.name, + 'display_name': metric.display_name, + 'value': metric.value, + 'unit': metric.unit, + 'source': metric.source, + 'score': metric.score, + }) + + # Collect scores + if result.scores: + all_scores[scanner_name] = result.scores + + # Store raw data + if result.raw_data: + raw_data[scanner_name] = result.raw_data + + # Determine overall status + if failed_scanners == len(results): + overall_status = 'failed' + elif failed_scanners > 0: + overall_status = 'partial' + else: + overall_status = 'done' + + # Calculate aggregated scores + aggregated_scores = self._calculate_aggregated_scores(all_scores) + + return { + 'status': overall_status, + 'scores': aggregated_scores, + 'issues': all_issues, + 'metrics': all_metrics, + 'scanner_results': { + name: { + 'status': result.status.value, + 'error': result.error_message, + } + for name, result in results.items() + }, + 'raw_data': raw_data, + 'errors': errors, + 'summary': { + 'total_scanners': len(results), + 'successful': successful_scanners, + 'failed': failed_scanners, + 'total_issues': len(all_issues), + 'total_metrics': len(all_metrics), + } + } + + def _calculate_aggregated_scores( + self, + scanner_scores: Dict[str, Dict[str, int]] + ) -> Dict[str, Optional[int]]: + """Calculate aggregated scores from all scanners.""" + # Lighthouse provides the main scores + lighthouse_scores = scanner_scores.get('lighthouse', {}) + + return { + 'performance': lighthouse_scores.get('performance'), + 'accessibility': lighthouse_scores.get('accessibility'), + 'best_practices': lighthouse_scores.get('best_practices'), + 'seo': lighthouse_scores.get('seo'), + } diff --git a/backend/scanner/scanners/tls.py b/backend/scanner/scanners/tls.py new file mode 100644 index 0000000..31711c6 --- /dev/null +++ b/backend/scanner/scanners/tls.py @@ -0,0 +1,380 @@ +""" +TLS/SSL Security Scanner. + +This module checks TLS/SSL configuration and certificate validity. 
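+
+Intended usage, sketched with an illustrative target URL:
+
+    scanner = TLSScanner({'timeout': 10})
+    result = scanner.run('https://example.com')
+    for issue in result.issues:
+        print(issue.severity, issue.title)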
+""" + +import logging +import socket +import ssl +from datetime import datetime, timezone +from typing import Any, Dict, Optional +from urllib.parse import urlparse + +from .base import ( + BaseScanner, + ScannerResult, + ScannerStatus, + IssueData, + MetricData, +) + +logger = logging.getLogger(__name__) + + +class TLSScanner(BaseScanner): + """ + Scanner for TLS/SSL certificate and configuration. + + Checks: + - Certificate validity + - Certificate expiration + - HTTPS availability + - HTTP to HTTPS redirect + """ + + name = "tls_check" + + def __init__(self, config: Optional[Dict[str, Any]] = None): + super().__init__(config) + self.timeout = self.config.get('timeout', 10) + + def run(self, url: str) -> ScannerResult: + """ + Run TLS/SSL analysis on the URL. + + Args: + url: The URL to analyze + + Returns: + ScannerResult with TLS findings + """ + self.logger.info(f"Starting TLS scan for {url}") + + try: + parsed = urlparse(url) + hostname = parsed.netloc.split(':')[0] + port = parsed.port or (443 if parsed.scheme == 'https' else 80) + + issues = [] + metrics = [] + raw_data = {} + + # Check if site is HTTPS + if parsed.scheme == 'http': + # Check if HTTPS is available + https_available, https_result = self._check_https_available(hostname) + raw_data['https_available'] = https_available + raw_data['https_check'] = https_result + + if https_available: + issues.append(IssueData( + category='tls', + severity='high', + title='Site accessed over HTTP but HTTPS is available', + description=( + 'The site was accessed over unencrypted HTTP, but HTTPS ' + 'appears to be available. All traffic should use HTTPS.' + ), + tool='tls_check', + affected_url=url, + remediation=( + 'Redirect all HTTP traffic to HTTPS using a 301 redirect. ' + 'Implement HSTS to prevent future HTTP access.' + ) + )) + else: + issues.append(IssueData( + category='tls', + severity='critical', + title='Site does not support HTTPS', + description=( + 'The site does not appear to have HTTPS configured. ' + 'All data transmitted is unencrypted and vulnerable to interception.' + ), + tool='tls_check', + affected_url=url, + remediation=( + 'Configure TLS/SSL for your server. Obtain a certificate from ' + "Let's Encrypt (free) or a commercial CA." + ) + )) + + metrics.append(MetricData( + name='tls_enabled', + display_name='TLS Enabled', + value=0.0, + unit='score', + source='tls_check' + )) + + return ScannerResult( + scanner_name=self.name, + status=ScannerStatus.SUCCESS, + issues=issues, + metrics=metrics, + raw_data=raw_data + ) + + # For HTTPS URLs, check certificate + cert_info = self._get_certificate_info(hostname, port) + raw_data['certificate'] = cert_info + + if cert_info.get('error'): + issues.append(IssueData( + category='tls', + severity='critical', + title='Certificate validation failed', + description=f"SSL certificate error: {cert_info['error']}", + tool='tls_check', + affected_url=url, + remediation=( + 'Ensure your SSL certificate is valid, not expired, ' + 'and properly configured for your domain.' 
+ ) + )) + + metrics.append(MetricData( + name='certificate_valid', + display_name='Certificate Valid', + value=0.0, + unit='score', + source='tls_check' + )) + else: + # Certificate is valid + metrics.append(MetricData( + name='certificate_valid', + display_name='Certificate Valid', + value=1.0, + unit='score', + source='tls_check' + )) + + metrics.append(MetricData( + name='tls_enabled', + display_name='TLS Enabled', + value=1.0, + unit='score', + source='tls_check' + )) + + # Check expiration + if cert_info.get('expires'): + try: + expires = datetime.strptime( + cert_info['expires'], + '%b %d %H:%M:%S %Y %Z' + ) + expires = expires.replace(tzinfo=timezone.utc) + now = datetime.now(timezone.utc) + days_until_expiry = (expires - now).days + + metrics.append(MetricData( + name='certificate_days_until_expiry', + display_name='Days Until Certificate Expiry', + value=float(days_until_expiry), + unit='count', + source='tls_check' + )) + + if days_until_expiry <= 0: + issues.append(IssueData( + category='tls', + severity='critical', + title='SSL certificate has expired', + description=( + f"The SSL certificate expired on {cert_info['expires']}. " + "Users will see security warnings." + ), + tool='tls_check', + affected_url=url, + remediation='Renew your SSL certificate immediately.' + )) + elif days_until_expiry <= 7: + issues.append(IssueData( + category='tls', + severity='high', + title='SSL certificate expiring very soon', + description=( + f"The SSL certificate will expire in {days_until_expiry} days " + f"(on {cert_info['expires']}). Renew immediately." + ), + tool='tls_check', + affected_url=url, + remediation='Renew your SSL certificate before it expires.' + )) + elif days_until_expiry <= 30: + issues.append(IssueData( + category='tls', + severity='medium', + title='SSL certificate expiring soon', + description=( + f"The SSL certificate will expire in {days_until_expiry} days " + f"(on {cert_info['expires']}). Plan for renewal." + ), + tool='tls_check', + affected_url=url, + remediation=( + 'Renew your SSL certificate before expiration. ' + "Consider using auto-renewal with Let's Encrypt." + ) + )) + except Exception as e: + self.logger.warning(f"Could not parse certificate expiry: {e}") + + # Check certificate subject matches hostname + if cert_info.get('subject'): + subject_cn = dict(x[0] for x in cert_info['subject']).get('commonName', '') + san = cert_info.get('subjectAltName', []) + san_names = [name for type_, name in san if type_ == 'DNS'] + + hostname_matched = self._hostname_matches_cert( + hostname, subject_cn, san_names + ) + + if not hostname_matched: + issues.append(IssueData( + category='tls', + severity='high', + title='Certificate hostname mismatch', + description=( + f"The SSL certificate is for '{subject_cn}' but " + f"the site is accessed as '{hostname}'." + ), + tool='tls_check', + affected_url=url, + remediation=( + 'Obtain a certificate that includes your domain name, ' + 'or add it to the Subject Alternative Names (SAN).' + ) + )) + + # Check for HTTP to HTTPS redirect + if parsed.scheme == 'https': + redirect_info = self._check_http_redirect(hostname) + raw_data['http_redirect'] = redirect_info + + if not redirect_info.get('redirects_to_https'): + issues.append(IssueData( + category='tls', + severity='medium', + title='No HTTP to HTTPS redirect', + description=( + 'The site does not redirect HTTP requests to HTTPS. ' + 'Users accessing via HTTP will use an insecure connection.' 
+ ), + tool='tls_check', + affected_url=f"http://{hostname}", + remediation=( + 'Configure your server to redirect all HTTP (port 80) ' + 'requests to HTTPS (port 443) with a 301 redirect.' + ) + )) + + self.logger.info(f"TLS scan complete: {len(issues)} issues") + + return ScannerResult( + scanner_name=self.name, + status=ScannerStatus.SUCCESS, + issues=issues, + metrics=metrics, + raw_data=raw_data + ) + + except Exception as e: + return self._create_error_result(e) + + def _check_https_available(self, hostname: str) -> tuple: + """Check if HTTPS is available for the hostname.""" + try: + context = ssl.create_default_context() + with socket.create_connection((hostname, 443), timeout=self.timeout) as sock: + with context.wrap_socket(sock, server_hostname=hostname) as ssock: + return True, {'available': True, 'protocol': ssock.version()} + except ssl.SSLError as e: + return True, {'available': True, 'error': str(e)} + except Exception as e: + return False, {'available': False, 'error': str(e)} + + def _get_certificate_info(self, hostname: str, port: int = 443) -> Dict: + """Get SSL certificate information.""" + try: + context = ssl.create_default_context() + + with socket.create_connection((hostname, port), timeout=self.timeout) as sock: + with context.wrap_socket(sock, server_hostname=hostname) as ssock: + cert = ssock.getpeercert() + + return { + 'subject': cert.get('subject'), + 'issuer': cert.get('issuer'), + 'version': cert.get('version'), + 'serialNumber': cert.get('serialNumber'), + 'notBefore': cert.get('notBefore'), + 'expires': cert.get('notAfter'), + 'subjectAltName': cert.get('subjectAltName', []), + 'protocol': ssock.version(), + 'cipher': ssock.cipher(), + } + except ssl.SSLCertVerificationError as e: + return {'error': f"Certificate verification failed: {e.verify_message}"} + except ssl.SSLError as e: + return {'error': f"SSL error: {str(e)}"} + except socket.timeout: + return {'error': "Connection timed out"} + except Exception as e: + return {'error': str(e)} + + def _hostname_matches_cert( + self, + hostname: str, + cn: str, + san_names: list + ) -> bool: + """Check if hostname matches certificate CN or SAN.""" + all_names = [cn] + san_names + + for name in all_names: + if name == hostname: + return True + # Handle wildcard certificates + if name.startswith('*.'): + domain = name[2:] + if hostname.endswith(domain): + # Ensure wildcard only matches one level + prefix = hostname[:-len(domain)-1] + if '.' not in prefix: + return True + + return False + + def _check_http_redirect(self, hostname: str) -> Dict: + """Check if HTTP redirects to HTTPS.""" + import httpx + + try: + with httpx.Client( + timeout=self.timeout, + follow_redirects=False + ) as client: + response = client.get(f"http://{hostname}") + + if response.status_code in (301, 302, 303, 307, 308): + location = response.headers.get('location', '') + redirects_to_https = location.startswith('https://') + return { + 'redirects_to_https': redirects_to_https, + 'status_code': response.status_code, + 'location': location, + } + else: + return { + 'redirects_to_https': False, + 'status_code': response.status_code, + } + except Exception as e: + return { + 'redirects_to_https': False, + 'error': str(e), + } diff --git a/backend/scanner/scanners/zap.py b/backend/scanner/scanners/zap.py new file mode 100644 index 0000000..f34f9d6 --- /dev/null +++ b/backend/scanner/scanners/zap.py @@ -0,0 +1,307 @@ +""" +OWASP ZAP Scanner Integration. 
+ +This module integrates with OWASP ZAP for security scanning, +detecting vulnerabilities like XSS, injection flaws, and +misconfigurations. +""" + +import logging +import time +from typing import Any, Dict, List, Optional + +import httpx + +from django.conf import settings + +from .base import ( + BaseScanner, + ScannerResult, + ScannerStatus, + IssueData, + MetricData, +) + +logger = logging.getLogger(__name__) + + +class ZAPScanner(BaseScanner): + """ + Scanner using OWASP ZAP for security vulnerability detection. + + Performs baseline scans to identify common security issues: + - XSS vulnerabilities + - SQL injection patterns + - Insecure cookies + - Missing security headers + - SSL/TLS issues + - And more... + """ + + name = "owasp_zap" + + def __init__(self, config: Optional[Dict[str, Any]] = None): + super().__init__(config) + scanner_config = settings.SCANNER_CONFIG + self.zap_host = self.config.get('zap_host', scanner_config.get('ZAP_HOST', 'http://zap:8080')) + self.api_key = self.config.get('api_key', scanner_config.get('ZAP_API_KEY', '')) + self.timeout = self.config.get('timeout', scanner_config.get('ZAP_TIMEOUT', 120)) + + def is_available(self) -> bool: + """Check if ZAP service is available.""" + try: + with httpx.Client(timeout=10) as client: + response = client.get( + f"{self.zap_host}/JSON/core/view/version/", + params={'apikey': self.api_key} + ) + return response.status_code == 200 + except Exception as e: + self.logger.warning(f"ZAP service not available: {e}") + return False + + def run(self, url: str) -> ScannerResult: + """ + Run ZAP security scan against the URL. + + Args: + url: The URL to scan + + Returns: + ScannerResult with security findings + """ + self.logger.info(f"Starting ZAP scan for {url}") + + try: + # Access the target to populate ZAP's site tree + self._access_url(url) + + # Spider the site (limited crawl) + self._spider_url(url) + + # Run active scan + self._active_scan(url) + + # Get alerts + alerts = self._get_alerts(url) + + return self._parse_results(url, alerts) + + except httpx.TimeoutException: + return self._create_error_result( + Exception("ZAP scan timed out") + ) + except httpx.HTTPStatusError as e: + return self._create_error_result( + Exception(f"ZAP service error: {e.response.status_code}") + ) + except Exception as e: + return self._create_error_result(e) + + def _zap_request(self, endpoint: str, params: Optional[Dict] = None) -> Dict: + """Make a request to the ZAP API.""" + if params is None: + params = {} + params['apikey'] = self.api_key + + with httpx.Client(timeout=self.timeout) as client: + response = client.get( + f"{self.zap_host}{endpoint}", + params=params + ) + response.raise_for_status() + return response.json() + + def _access_url(self, url: str) -> None: + """Access the URL to add it to ZAP's site tree.""" + self.logger.debug(f"Accessing URL in ZAP: {url}") + self._zap_request( + '/JSON/core/action/accessUrl/', + {'url': url, 'followRedirects': 'true'} + ) + time.sleep(2) # Wait for ZAP to process + + def _spider_url(self, url: str) -> None: + """Spider the URL to discover pages.""" + self.logger.debug(f"Spidering URL: {url}") + + # Start spider + result = self._zap_request( + '/JSON/spider/action/scan/', + { + 'url': url, + 'maxChildren': '5', # Limited crawl + 'recurse': 'true', + 'subtreeOnly': 'true' + } + ) + + scan_id = result.get('scan') + if not scan_id: + return + + # Wait for spider to complete (with timeout) + start_time = time.time() + while time.time() - start_time < 60: # 60 second spider timeout + 
status = self._zap_request( + '/JSON/spider/view/status/', + {'scanId': scan_id} + ) + if int(status.get('status', '100')) >= 100: + break + time.sleep(2) + + def _active_scan(self, url: str) -> None: + """Run active scan against the URL.""" + self.logger.debug(f"Starting active scan: {url}") + + # Start active scan + result = self._zap_request( + '/JSON/ascan/action/scan/', + { + 'url': url, + 'recurse': 'true', + 'inScopeOnly': 'true' + } + ) + + scan_id = result.get('scan') + if not scan_id: + return + + # Wait for scan to complete (with timeout) + start_time = time.time() + while time.time() - start_time < self.timeout: + status = self._zap_request( + '/JSON/ascan/view/status/', + {'scanId': scan_id} + ) + if int(status.get('status', '100')) >= 100: + break + time.sleep(5) + + def _get_alerts(self, url: str) -> List[Dict]: + """Get alerts for the scanned URL.""" + self.logger.debug(f"Fetching alerts for: {url}") + + result = self._zap_request( + '/JSON/core/view/alerts/', + { + 'baseurl': url, + 'start': '0', + 'count': '100' # Limit alerts + } + ) + + return result.get('alerts', []) + + def _parse_results(self, url: str, alerts: List[Dict]) -> ScannerResult: + """ + Parse ZAP alerts into ScannerResult format. + + Args: + url: The scanned URL + alerts: List of ZAP alerts + + Returns: + Parsed ScannerResult + """ + issues = [] + metrics = [] + + # Count alerts by risk level + risk_counts = { + 'High': 0, + 'Medium': 0, + 'Low': 0, + 'Informational': 0 + } + + for alert in alerts: + risk = alert.get('risk', 'Informational') + risk_counts[risk] = risk_counts.get(risk, 0) + 1 + + severity = self._map_risk_to_severity(risk) + + issues.append(IssueData( + category='security', + severity=severity, + title=alert.get('name', 'Unknown vulnerability'), + description=self._format_description(alert), + tool='owasp_zap', + affected_url=alert.get('url', url), + remediation=alert.get('solution', 'Review and fix the vulnerability.'), + raw_data={ + 'alert_ref': alert.get('alertRef'), + 'cweid': alert.get('cweid'), + 'wascid': alert.get('wascid'), + 'confidence': alert.get('confidence'), + 'evidence': alert.get('evidence', '')[:500], # Truncate evidence + } + )) + + # Create metrics for vulnerability counts + for risk_level, count in risk_counts.items(): + if count > 0: + metrics.append(MetricData( + name=f'zap_{risk_level.lower()}_alerts', + display_name=f'{risk_level} Risk Alerts', + value=float(count), + unit='count', + source='owasp_zap' + )) + + metrics.append(MetricData( + name='total_security_alerts', + display_name='Total Security Alerts', + value=float(len(alerts)), + unit='count', + source='owasp_zap' + )) + + self.logger.info( + f"ZAP scan complete: {len(alerts)} alerts " + f"(High: {risk_counts['High']}, Medium: {risk_counts['Medium']}, " + f"Low: {risk_counts['Low']})" + ) + + return ScannerResult( + scanner_name=self.name, + status=ScannerStatus.SUCCESS, + issues=issues, + metrics=metrics, + raw_data={ + 'total_alerts': len(alerts), + 'risk_counts': risk_counts, + 'alerts': alerts[:50] # Store limited raw alerts + } + ) + + def _map_risk_to_severity(self, risk: str) -> str: + """Map ZAP risk level to our severity.""" + mapping = { + 'High': 'high', + 'Medium': 'medium', + 'Low': 'low', + 'Informational': 'info', + } + return mapping.get(risk, 'info') + + def _format_description(self, alert: Dict) -> str: + """Format ZAP alert into readable description.""" + parts = [] + + if alert.get('description'): + parts.append(alert['description']) + + if alert.get('attack'): + 
parts.append(f"\nAttack: {alert['attack']}") + + if alert.get('evidence'): + evidence = alert['evidence'][:200] + parts.append(f"\nEvidence: {evidence}") + + if alert.get('reference'): + parts.append(f"\nReference: {alert['reference']}") + + return '\n'.join(parts) diff --git a/backend/scanner/tasks.py b/backend/scanner/tasks.py new file mode 100644 index 0000000..de1aced --- /dev/null +++ b/backend/scanner/tasks.py @@ -0,0 +1,306 @@ +""" +Celery tasks for background scanning. + +This module defines the Celery tasks that orchestrate website scans +in the background. +""" + +import logging +from datetime import timedelta +from typing import Optional + +from celery import shared_task +from celery.exceptions import SoftTimeLimitExceeded +from django.conf import settings +from django.utils import timezone + +from websites.models import Website, Scan, ScanStatus, Issue, Metric +from scanner.scanners import ScanRunner +from scanner.utils import validate_url, get_domain_from_url + +logger = logging.getLogger(__name__) + + +@shared_task( + bind=True, + max_retries=2, + default_retry_delay=60, + soft_time_limit=300, + time_limit=330, +) +def run_scan_task(self, scan_id: str) -> dict: + """ + Main Celery task for running a website scan. + + This task: + 1. Updates scan status to running + 2. Orchestrates all scanners + 3. Saves results to database + 4. Handles errors and partial results + + Args: + scan_id: UUID of the Scan record + + Returns: + Dict with scan results summary + """ + logger.info(f"Starting scan task for scan_id: {scan_id}") + + try: + # Get the scan record + scan = Scan.objects.select_related('website').get(id=scan_id) + except Scan.DoesNotExist: + logger.error(f"Scan {scan_id} not found") + return {'error': f'Scan {scan_id} not found'} + + # Update status to running + scan.status = ScanStatus.RUNNING + scan.started_at = timezone.now() + scan.celery_task_id = self.request.id + scan.save(update_fields=['status', 'started_at', 'celery_task_id']) + + url = scan.website.url + + try: + # Run the scan pipeline + runner = ScanRunner() + results = runner.run(url) + + # Save results to database + _save_scan_results(scan, results) + + # Update website last_scanned_at + scan.website.last_scanned_at = timezone.now() + scan.website.save(update_fields=['last_scanned_at']) + + logger.info(f"Scan {scan_id} completed successfully") + + return { + 'scan_id': str(scan_id), + 'status': scan.status, + 'overall_score': scan.overall_score, + 'issues_count': scan.issues.count(), + 'metrics_count': scan.metrics.count(), + } + + except SoftTimeLimitExceeded: + logger.warning(f"Scan {scan_id} timed out") + scan.status = ScanStatus.PARTIAL + scan.error_message = "Scan timed out before completing all checks" + scan.completed_at = timezone.now() + scan.save(update_fields=['status', 'error_message', 'completed_at']) + + return { + 'scan_id': str(scan_id), + 'status': 'partial', + 'error': 'Scan timed out' + } + + except Exception as e: + logger.exception(f"Scan {scan_id} failed with error: {e}") + scan.status = ScanStatus.FAILED + scan.error_message = str(e) + scan.completed_at = timezone.now() + scan.save(update_fields=['status', 'error_message', 'completed_at']) + + # Retry on certain errors + if self.request.retries < self.max_retries: + raise self.retry(exc=e) + + return { + 'scan_id': str(scan_id), + 'status': 'failed', + 'error': str(e) + } + + +def _save_scan_results(scan: Scan, results: dict) -> None: + """ + Save scan results to the database. 
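+
+    The results dict is expected to follow the shape produced by
+    ScanRunner._aggregate_results, roughly (values illustrative):
+
+        {
+            'status': 'done',
+            'scores': {'performance': 95, 'accessibility': 88,
+                       'best_practices': 92, 'seo': 90},
+            'issues': [...],
+            'metrics': [...],
+            'raw_data': {'lighthouse': {...}, 'owasp_zap': {...}},
+            'errors': [],
+        }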
+ + Args: + scan: The Scan model instance + results: Aggregated results from ScanRunner + """ + # Update scan status + status_map = { + 'done': ScanStatus.DONE, + 'partial': ScanStatus.PARTIAL, + 'failed': ScanStatus.FAILED, + } + scan.status = status_map.get(results['status'], ScanStatus.DONE) + scan.completed_at = timezone.now() + + # Save scores + scores = results.get('scores', {}) + scan.performance_score = scores.get('performance') + scan.accessibility_score = scores.get('accessibility') + scan.seo_score = scores.get('seo') + scan.best_practices_score = scores.get('best_practices') + + # Save raw data + raw_data = results.get('raw_data', {}) + scan.raw_lighthouse_data = raw_data.get('lighthouse') + scan.raw_zap_data = raw_data.get('owasp_zap') + scan.raw_playwright_data = raw_data.get('playwright') + scan.raw_headers_data = raw_data.get('header_check') + + # Save errors if any + if results.get('errors'): + scan.error_message = '\n'.join( + f"{e['scanner']}: {e['error']}" + for e in results['errors'] + ) + + scan.save() + + # Create Issue records + issues_to_create = [] + for issue_data in results.get('issues', []): + issues_to_create.append(Issue( + scan=scan, + category=issue_data['category'], + severity=issue_data['severity'], + title=issue_data['title'][:500], # Truncate if too long + description=issue_data['description'], + tool=issue_data['tool'], + affected_url=issue_data.get('affected_url'), + remediation=issue_data.get('remediation'), + raw_data=issue_data.get('raw_data'), + )) + + if issues_to_create: + Issue.objects.bulk_create(issues_to_create) + + # Create Metric records + metrics_to_create = [] + seen_metrics = set() # Track unique metrics + + for metric_data in results.get('metrics', []): + metric_key = metric_data['name'] + if metric_key in seen_metrics: + continue # Skip duplicates + seen_metrics.add(metric_key) + + # Map unit strings to model choices + unit_map = { + 'ms': 'ms', + 'milliseconds': 'ms', + 's': 's', + 'seconds': 's', + 'bytes': 'bytes', + 'kb': 'kb', + 'kilobytes': 'kb', + 'mb': 'mb', + 'megabytes': 'mb', + 'score': 'score', + 'percent': 'percent', + 'count': 'count', + } + unit = unit_map.get(metric_data['unit'].lower(), 'count') + + metrics_to_create.append(Metric( + scan=scan, + name=metric_data['name'], + display_name=metric_data['display_name'][:200], + value=metric_data['value'], + unit=unit, + source=metric_data['source'], + score=metric_data.get('score'), + )) + + if metrics_to_create: + Metric.objects.bulk_create(metrics_to_create) + + # Calculate security score based on issues + scan.calculate_security_score() + + # Calculate overall score + scan.calculate_overall_score() + + scan.save(update_fields=['security_score', 'overall_score']) + + logger.info( + f"Saved scan results: {len(issues_to_create)} issues, " + f"{len(metrics_to_create)} metrics" + ) + + +@shared_task +def cleanup_old_scans(days: int = 30) -> dict: + """ + Clean up old scan data to prevent database growth. + + Args: + days: Number of days to keep scans + + Returns: + Dict with cleanup statistics + """ + cutoff_date = timezone.now() - timedelta(days=days) + + # Delete old scans (cascades to issues and metrics) + deleted_count, _ = Scan.objects.filter( + created_at__lt=cutoff_date + ).delete() + + logger.info(f"Cleaned up {deleted_count} old scans") + + return { + 'deleted_scans': deleted_count, + 'cutoff_date': cutoff_date.isoformat(), + } + + +def check_rate_limit(url: str) -> Optional[str]: + """ + Check if URL scanning is rate limited. 
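+
+    Typical use from a view or task entry point (sketch; how the message
+    is surfaced to the client is up to the caller):
+
+        error = check_rate_limit('https://example.com')
+        if error:
+            # rate limited: reject the scan request with this message
+            return error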
+ + Args: + url: The URL to check + + Returns: + Error message if rate limited, None otherwise + """ + from django.core.cache import cache + + scanner_config = settings.SCANNER_CONFIG + rate_limit_minutes = scanner_config.get('SCAN_RATE_LIMIT_MINUTES', 5) + + # Create a cache key based on the URL + domain = get_domain_from_url(url) + cache_key = f"scan_rate_limit:{domain}" + + # Check if already scanned recently + last_scan_time = cache.get(cache_key) + if last_scan_time: + return ( + f"This URL was scanned recently. " + f"Please wait {rate_limit_minutes} minutes between scans." + ) + + # Set the rate limit + cache.set(cache_key, timezone.now().isoformat(), timeout=rate_limit_minutes * 60) + + return None + + +def check_concurrent_scan_limit() -> Optional[str]: + """ + Check if maximum concurrent scans limit is reached. + + Returns: + Error message if limit reached, None otherwise + """ + scanner_config = settings.SCANNER_CONFIG + max_concurrent = scanner_config.get('MAX_CONCURRENT_SCANS', 3) + + running_count = Scan.objects.filter(status=ScanStatus.RUNNING).count() + + if running_count >= max_concurrent: + return ( + f"Maximum concurrent scans ({max_concurrent}) reached. " + "Please wait for current scans to complete." + ) + + return None diff --git a/backend/scanner/utils.py b/backend/scanner/utils.py new file mode 100644 index 0000000..34a4d52 --- /dev/null +++ b/backend/scanner/utils.py @@ -0,0 +1,185 @@ +""" +URL validation and safety utilities. + +This module provides functions for validating and normalizing URLs, +including safety checks to prevent SSRF attacks. +""" + +import ipaddress +import logging +import socket +from typing import Tuple +from urllib.parse import urlparse, urlunparse + +import validators + +from django.conf import settings + +logger = logging.getLogger(__name__) + + +def validate_url(url: str) -> Tuple[bool, str]: + """ + Validate and normalize a URL for scanning. + + Args: + url: The URL to validate + + Returns: + Tuple of (is_valid, normalized_url_or_error_message) + """ + if not url: + return False, "URL is required" + + # Basic URL validation + if not validators.url(url): + return False, "Invalid URL format" + + # Parse the URL + try: + parsed = urlparse(url) + except Exception as e: + return False, f"Could not parse URL: {e}" + + # Check scheme + if parsed.scheme not in ('http', 'https'): + return False, "URL must use http or https scheme" + + # Check hostname + hostname = parsed.netloc.split(':')[0].lower() + + if not hostname: + return False, "URL must have a valid hostname" + + # Safety check: block localhost and private IPs + is_safe, safety_error = check_url_safety(hostname) + if not is_safe: + return False, safety_error + + # Normalize URL + normalized = normalize_url(url) + + return True, normalized + + +def normalize_url(url: str) -> str: + """ + Normalize a URL to a canonical form. 
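+
+    For example, applying the steps listed below:
+
+        normalize_url('https://Example.COM:443/docs/')
+        # -> 'https://example.com/docs'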
+ + - Lowercase hostname + - Remove trailing slashes from path + - Remove default ports + - Sort query parameters + + Args: + url: The URL to normalize + + Returns: + Normalized URL string + """ + parsed = urlparse(url) + + # Lowercase hostname + hostname = parsed.netloc.lower() + + # Remove default ports + if ':80' in hostname and parsed.scheme == 'http': + hostname = hostname.replace(':80', '') + elif ':443' in hostname and parsed.scheme == 'https': + hostname = hostname.replace(':443', '') + + # Normalize path (remove trailing slash except for root) + path = parsed.path + if path != '/' and path.endswith('/'): + path = path.rstrip('/') + if not path: + path = '/' + + # Reconstruct URL + normalized = urlunparse(( + parsed.scheme, + hostname, + path, + parsed.params, + parsed.query, + '' # Remove fragment + )) + + return normalized + + +def check_url_safety(hostname: str) -> Tuple[bool, str]: + """ + Check if a hostname is safe to scan (not localhost/private IP). + + Args: + hostname: The hostname to check + + Returns: + Tuple of (is_safe, error_message_if_not_safe) + """ + scanner_config = settings.SCANNER_CONFIG + blocked_hosts = scanner_config.get('BLOCKED_HOSTS', []) + blocked_ranges = scanner_config.get('BLOCKED_IP_RANGES', []) + + # Check blocked hostnames + if hostname in blocked_hosts: + return False, f"Scanning {hostname} is not allowed" + + # Try to resolve hostname to IP + try: + ip_addresses = socket.getaddrinfo( + hostname, None, socket.AF_UNSPEC, socket.SOCK_STREAM + ) + except socket.gaierror: + # Could not resolve - might be okay for some hostnames + logger.warning(f"Could not resolve hostname: {hostname}") + return True, "" + + for family, type_, proto, canonname, sockaddr in ip_addresses: + ip_str = sockaddr[0] + + try: + ip = ipaddress.ip_address(ip_str) + + # Check if IP is in any blocked range + for blocked_range in blocked_ranges: + try: + network = ipaddress.ip_network(blocked_range, strict=False) + if ip in network: + return False, f"Scanning private/local IP addresses is not allowed ({ip_str})" + except ValueError: + continue + + # Additional checks + if ip.is_private: + return False, f"Scanning private IP addresses is not allowed ({ip_str})" + + if ip.is_loopback: + return False, f"Scanning localhost/loopback addresses is not allowed ({ip_str})" + + if ip.is_link_local: + return False, f"Scanning link-local addresses is not allowed ({ip_str})" + + if ip.is_reserved: + return False, f"Scanning reserved IP addresses is not allowed ({ip_str})" + + except ValueError: + # Not a valid IP address format + continue + + return True, "" + + +def get_domain_from_url(url: str) -> str: + """ + Extract the domain from a URL. + + Args: + url: The URL to extract domain from + + Returns: + The domain/hostname + """ + parsed = urlparse(url) + return parsed.netloc.split(':')[0].lower() diff --git a/backend/templates/base.html b/backend/templates/base.html new file mode 100644 index 0000000..1cdf7dd --- /dev/null +++ b/backend/templates/base.html @@ -0,0 +1,89 @@ + + + + + + {% block title %}Website Analyzer{% endblock %} + + + + + + + + + + + + + {% block extra_head %}{% endblock %} + + + + + + +
+ {% block content %}{% endblock %} +
+ + + + + {% block extra_js %}{% endblock %} + + diff --git a/backend/websites/__init__.py b/backend/websites/__init__.py new file mode 100644 index 0000000..70b87d8 --- /dev/null +++ b/backend/websites/__init__.py @@ -0,0 +1,5 @@ +""" +Websites app initialization. +""" + +default_app_config = 'websites.apps.WebsitesConfig' diff --git a/backend/websites/admin.py b/backend/websites/admin.py new file mode 100644 index 0000000..800f023 --- /dev/null +++ b/backend/websites/admin.py @@ -0,0 +1,93 @@ +""" +Django admin configuration for Website Analyzer models. +""" + +from django.contrib import admin +from .models import Website, Scan, Issue, Metric + + +@admin.register(Website) +class WebsiteAdmin(admin.ModelAdmin): + list_display = ('url', 'domain', 'created_at', 'last_scanned_at') + list_filter = ('created_at', 'last_scanned_at') + search_fields = ('url', 'domain') + readonly_fields = ('id', 'created_at', 'domain') + ordering = ('-created_at',) + + +class IssueInline(admin.TabularInline): + model = Issue + extra = 0 + readonly_fields = ('id', 'category', 'severity', 'tool', 'title', 'created_at') + can_delete = False + show_change_link = True + max_num = 10 + + +class MetricInline(admin.TabularInline): + model = Metric + extra = 0 + readonly_fields = ('id', 'name', 'display_name', 'value', 'unit', 'source', 'score') + can_delete = False + max_num = 15 + + +@admin.register(Scan) +class ScanAdmin(admin.ModelAdmin): + list_display = ( + 'id', 'website', 'status', 'overall_score', + 'performance_score', 'security_score', 'created_at' + ) + list_filter = ('status', 'created_at') + search_fields = ('website__url', 'website__domain') + readonly_fields = ( + 'id', 'created_at', 'started_at', 'completed_at', + 'celery_task_id', 'raw_lighthouse_data', 'raw_zap_data', + 'raw_playwright_data', 'raw_headers_data' + ) + inlines = [IssueInline, MetricInline] + ordering = ('-created_at',) + + fieldsets = ( + ('Basic Info', { + 'fields': ('id', 'website', 'status', 'celery_task_id') + }), + ('Timestamps', { + 'fields': ('created_at', 'started_at', 'completed_at') + }), + ('Scores', { + 'fields': ( + 'overall_score', 'performance_score', 'accessibility_score', + 'seo_score', 'best_practices_score', 'security_score' + ) + }), + ('Errors', { + 'fields': ('error_message',), + 'classes': ('collapse',) + }), + ('Raw Data', { + 'fields': ( + 'raw_lighthouse_data', 'raw_zap_data', + 'raw_playwright_data', 'raw_headers_data' + ), + 'classes': ('collapse',) + }), + ) + + +@admin.register(Issue) +class IssueAdmin(admin.ModelAdmin): + list_display = ('title', 'scan', 'category', 'severity', 'tool', 'created_at') + list_filter = ('category', 'severity', 'tool', 'created_at') + search_fields = ('title', 'description', 'scan__website__url') + readonly_fields = ('id', 'created_at', 'raw_data') + ordering = ('severity', '-created_at') + + +@admin.register(Metric) +class MetricAdmin(admin.ModelAdmin): + list_display = ('display_name', 'scan', 'value', 'unit', 'source', 'score') + list_filter = ('source', 'unit') + search_fields = ('name', 'display_name', 'scan__website__url') + readonly_fields = ('id', 'created_at') + ordering = ('name',) diff --git a/backend/websites/apps.py b/backend/websites/apps.py new file mode 100644 index 0000000..8580775 --- /dev/null +++ b/backend/websites/apps.py @@ -0,0 +1,11 @@ +""" +Websites app configuration. 
+""" + +from django.apps import AppConfig + + +class WebsitesConfig(AppConfig): + default_auto_field = 'django.db.models.BigAutoField' + name = 'websites' + verbose_name = 'Website Scanner' diff --git a/backend/websites/models.py b/backend/websites/models.py new file mode 100644 index 0000000..af7b0b8 --- /dev/null +++ b/backend/websites/models.py @@ -0,0 +1,493 @@ +""" +Database models for Website Analyzer. + +This module defines the core data models for storing websites, scans, +issues, and metrics from various scanning tools. +""" + +import uuid +from django.db import models +from django.utils import timezone +from django.core.validators import URLValidator + + +class Website(models.Model): + """ + Represents a website that has been scanned. + + Each unique URL gets one Website record, which can have multiple + Scan records associated with it. + """ + + id = models.UUIDField( + primary_key=True, + default=uuid.uuid4, + editable=False, + help_text="Unique identifier for the website" + ) + url = models.URLField( + max_length=2048, + unique=True, + validators=[URLValidator(schemes=['http', 'https'])], + help_text="The normalized URL of the website" + ) + domain = models.CharField( + max_length=255, + db_index=True, + help_text="The domain extracted from the URL" + ) + created_at = models.DateTimeField( + auto_now_add=True, + help_text="When the website was first added" + ) + last_scanned_at = models.DateTimeField( + null=True, + blank=True, + help_text="When the website was last scanned" + ) + + class Meta: + db_table = 'websites' + ordering = ['-created_at'] + indexes = [ + models.Index(fields=['domain']), + models.Index(fields=['-last_scanned_at']), + ] + + def __str__(self): + return self.url + + def save(self, *args, **kwargs): + """Extract domain from URL before saving.""" + if self.url: + from urllib.parse import urlparse + parsed = urlparse(self.url) + self.domain = parsed.netloc.lower() + super().save(*args, **kwargs) + + +class ScanStatus(models.TextChoices): + """Enumeration of possible scan statuses.""" + PENDING = 'pending', 'Pending' + RUNNING = 'running', 'Running' + DONE = 'done', 'Completed' + FAILED = 'failed', 'Failed' + PARTIAL = 'partial', 'Partially Completed' + + +class Scan(models.Model): + """ + Represents a single scan of a website. + + Contains aggregated scores from various scanning tools and + links to detailed issues and metrics. 
+ """ + + id = models.UUIDField( + primary_key=True, + default=uuid.uuid4, + editable=False, + help_text="Unique identifier for the scan" + ) + website = models.ForeignKey( + Website, + on_delete=models.CASCADE, + related_name='scans', + help_text="The website that was scanned" + ) + status = models.CharField( + max_length=20, + choices=ScanStatus.choices, + default=ScanStatus.PENDING, + db_index=True, + help_text="Current status of the scan" + ) + + # Celery task tracking + celery_task_id = models.CharField( + max_length=255, + null=True, + blank=True, + help_text="Celery task ID for tracking" + ) + + # Timestamps + created_at = models.DateTimeField( + auto_now_add=True, + help_text="When the scan was created" + ) + started_at = models.DateTimeField( + null=True, + blank=True, + help_text="When the scan started running" + ) + completed_at = models.DateTimeField( + null=True, + blank=True, + help_text="When the scan completed" + ) + + # Aggregated scores (0-100) + performance_score = models.IntegerField( + null=True, + blank=True, + help_text="Lighthouse performance score (0-100)" + ) + accessibility_score = models.IntegerField( + null=True, + blank=True, + help_text="Lighthouse accessibility score (0-100)" + ) + seo_score = models.IntegerField( + null=True, + blank=True, + help_text="Lighthouse SEO score (0-100)" + ) + best_practices_score = models.IntegerField( + null=True, + blank=True, + help_text="Lighthouse best practices score (0-100)" + ) + security_score = models.IntegerField( + null=True, + blank=True, + help_text="Computed security score based on issues (0-100)" + ) + + # Overall health score (computed average) + overall_score = models.IntegerField( + null=True, + blank=True, + help_text="Overall health score (0-100)" + ) + + # Error tracking + error_message = models.TextField( + null=True, + blank=True, + help_text="Error message if scan failed" + ) + + # Raw data from scanners + raw_lighthouse_data = models.JSONField( + null=True, + blank=True, + help_text="Raw Lighthouse report data" + ) + raw_zap_data = models.JSONField( + null=True, + blank=True, + help_text="Raw OWASP ZAP report data" + ) + raw_playwright_data = models.JSONField( + null=True, + blank=True, + help_text="Raw Playwright analysis data" + ) + raw_headers_data = models.JSONField( + null=True, + blank=True, + help_text="Raw HTTP headers analysis data" + ) + + class Meta: + db_table = 'scans' + ordering = ['-created_at'] + indexes = [ + models.Index(fields=['status']), + models.Index(fields=['-created_at']), + models.Index(fields=['website', '-created_at']), + ] + + def __str__(self): + return f"Scan {self.id} - {self.website.url} ({self.status})" + + def calculate_overall_score(self): + """ + Calculate overall health score as weighted average of all scores. + + Weights: + - Performance: 25% + - Security: 30% + - Accessibility: 15% + - SEO: 15% + - Best Practices: 15% + """ + scores = [ + (self.performance_score, 0.25), + (self.security_score, 0.30), + (self.accessibility_score, 0.15), + (self.seo_score, 0.15), + (self.best_practices_score, 0.15), + ] + + total_weight = 0 + weighted_sum = 0 + + for score, weight in scores: + if score is not None: + weighted_sum += score * weight + total_weight += weight + + if total_weight > 0: + self.overall_score = round(weighted_sum / total_weight) + else: + self.overall_score = None + + return self.overall_score + + def calculate_security_score(self): + """ + Calculate security score based on security issues found. 
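+
+        For instance (illustrative), a scan with one high and two medium
+        security issues ends at 100 - 15 - 2*8 = 69 under the deductions
+        listed below.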
+ + Starts at 100 and deducts points based on issue severity: + - Critical: -25 points each + - High: -15 points each + - Medium: -8 points each + - Low: -3 points each + - Info: -1 point each + """ + deductions = { + 'critical': 25, + 'high': 15, + 'medium': 8, + 'low': 3, + 'info': 1, + } + + score = 100 + security_issues = self.issues.filter( + category__in=['security', 'headers', 'tls', 'cors'] + ) + + for issue in security_issues: + score -= deductions.get(issue.severity, 0) + + self.security_score = max(0, score) + return self.security_score + + +class IssueCategory(models.TextChoices): + """Categories of issues that can be detected.""" + PERFORMANCE = 'performance', 'Performance' + SECURITY = 'security', 'Security' + HEADERS = 'headers', 'HTTP Headers' + TLS = 'tls', 'TLS/SSL' + CORS = 'cors', 'CORS' + ACCESSIBILITY = 'accessibility', 'Accessibility' + SEO = 'seo', 'SEO' + BEST_PRACTICES = 'best_practices', 'Best Practices' + CONTENT = 'content', 'Content' + RESOURCES = 'resources', 'Resources' + + +class IssueSeverity(models.TextChoices): + """Severity levels for issues.""" + CRITICAL = 'critical', 'Critical' + HIGH = 'high', 'High' + MEDIUM = 'medium', 'Medium' + LOW = 'low', 'Low' + INFO = 'info', 'Informational' + + +class ScannerTool(models.TextChoices): + """Scanner tools that can detect issues.""" + LIGHTHOUSE = 'lighthouse', 'Google Lighthouse' + ZAP = 'owasp_zap', 'OWASP ZAP' + PLAYWRIGHT = 'playwright', 'Playwright' + HEADER_CHECK = 'header_check', 'HTTP Header Check' + TLS_CHECK = 'tls_check', 'TLS/SSL Check' + + +class Issue(models.Model): + """ + Represents a specific issue found during a scan. + + Issues are categorized by type, severity, and the tool that detected them. + Each issue includes a description and suggested remediation. 
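+
+    For example (illustrative mapping), a missing Content-Security-Policy
+    header found by the header check could be stored with category='headers',
+    severity='medium' and tool='header_check'.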
+ """ + + id = models.UUIDField( + primary_key=True, + default=uuid.uuid4, + editable=False + ) + scan = models.ForeignKey( + Scan, + on_delete=models.CASCADE, + related_name='issues', + help_text="The scan that found this issue" + ) + + # Classification + category = models.CharField( + max_length=30, + choices=IssueCategory.choices, + db_index=True, + help_text="Category of the issue" + ) + severity = models.CharField( + max_length=20, + choices=IssueSeverity.choices, + db_index=True, + help_text="Severity level of the issue" + ) + tool = models.CharField( + max_length=30, + choices=ScannerTool.choices, + help_text="Tool that detected this issue" + ) + + # Issue details + title = models.CharField( + max_length=500, + help_text="Brief title of the issue" + ) + description = models.TextField( + help_text="Detailed description of the issue" + ) + affected_url = models.URLField( + max_length=2048, + null=True, + blank=True, + help_text="Specific URL affected by this issue" + ) + remediation = models.TextField( + null=True, + blank=True, + help_text="Suggested fix or remediation" + ) + + # Additional data from scanner + raw_data = models.JSONField( + null=True, + blank=True, + help_text="Raw data from the scanner for this issue" + ) + + # Timestamps + created_at = models.DateTimeField( + auto_now_add=True + ) + + class Meta: + db_table = 'issues' + ordering = ['severity', '-created_at'] + indexes = [ + models.Index(fields=['scan', 'category']), + models.Index(fields=['scan', 'severity']), + models.Index(fields=['tool']), + ] + + def __str__(self): + return f"[{self.severity}] {self.title}" + + +class MetricUnit(models.TextChoices): + """Units of measurement for metrics.""" + MILLISECONDS = 'ms', 'Milliseconds' + SECONDS = 's', 'Seconds' + BYTES = 'bytes', 'Bytes' + KILOBYTES = 'kb', 'Kilobytes' + MEGABYTES = 'mb', 'Megabytes' + SCORE = 'score', 'Score (0-1)' + PERCENT = 'percent', 'Percentage' + COUNT = 'count', 'Count' + + +class Metric(models.Model): + """ + Represents a specific metric measured during a scan. + + Metrics are numerical values with units, such as page load time, + total byte weight, number of requests, etc. 
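+
+    For example (illustrative values), Lighthouse's first contentful paint
+    could be stored as name='first_contentful_paint_ms', value=1843.2,
+    unit='ms', source='lighthouse'.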
+ """ + + id = models.UUIDField( + primary_key=True, + default=uuid.uuid4, + editable=False + ) + scan = models.ForeignKey( + Scan, + on_delete=models.CASCADE, + related_name='metrics', + help_text="The scan that measured this metric" + ) + + # Metric identification + name = models.CharField( + max_length=100, + db_index=True, + help_text="Name of the metric (e.g., 'first_contentful_paint_ms')" + ) + display_name = models.CharField( + max_length=200, + help_text="Human-readable name for display" + ) + + # Value + value = models.FloatField( + help_text="Numeric value of the metric" + ) + unit = models.CharField( + max_length=20, + choices=MetricUnit.choices, + help_text="Unit of measurement" + ) + + # Source + source = models.CharField( + max_length=30, + choices=ScannerTool.choices, + help_text="Tool that provided this metric" + ) + + # Score (if applicable) + score = models.FloatField( + null=True, + blank=True, + help_text="Lighthouse score for this metric (0-1)" + ) + + # Timestamp + created_at = models.DateTimeField( + auto_now_add=True + ) + + class Meta: + db_table = 'metrics' + ordering = ['name'] + indexes = [ + models.Index(fields=['scan', 'name']), + models.Index(fields=['source']), + ] + # Ensure unique metric names per scan + constraints = [ + models.UniqueConstraint( + fields=['scan', 'name'], + name='unique_metric_per_scan' + ) + ] + + def __str__(self): + return f"{self.display_name}: {self.value} {self.unit}" + + def get_formatted_value(self): + """Return a formatted string representation of the value.""" + if self.unit == MetricUnit.MILLISECONDS: + if self.value >= 1000: + return f"{self.value / 1000:.2f}s" + return f"{self.value:.0f}ms" + elif self.unit == MetricUnit.BYTES: + if self.value >= 1024 * 1024: + return f"{self.value / (1024 * 1024):.2f} MB" + elif self.value >= 1024: + return f"{self.value / 1024:.1f} KB" + return f"{self.value:.0f} bytes" + elif self.unit == MetricUnit.PERCENT: + return f"{self.value:.1f}%" + elif self.unit == MetricUnit.SCORE: + return f"{self.value:.3f}" + else: + return f"{self.value:.2f} {self.get_unit_display()}" diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..027b87a --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,160 @@ +# Website Analyzer - Docker Compose Configuration +# This file orchestrates all services required for the application + +version: '3.9' + +services: + # ========================================================================== + # PostgreSQL Database + # ========================================================================== + db: + image: postgres:16-alpine + container_name: analyzer_db + restart: unless-stopped + environment: + POSTGRES_USER: analyzer + POSTGRES_PASSWORD: analyzer_password + POSTGRES_DB: website_analyzer + volumes: + - postgres_data:/var/lib/postgresql/data + ports: + - "5432:5432" + healthcheck: + test: ["CMD-SHELL", "pg_isready -U analyzer -d website_analyzer"] + interval: 10s + timeout: 5s + retries: 5 + + # ========================================================================== + # Redis - Message Broker & Cache + # ========================================================================== + redis: + image: redis:7-alpine + container_name: analyzer_redis + restart: unless-stopped + ports: + - "6379:6379" + volumes: + - redis_data:/data + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 10s + timeout: 5s + retries: 5 + + # ========================================================================== + # Django Web Application + # 
========================================================================== + web: + build: + context: ./backend + dockerfile: Dockerfile + container_name: analyzer_web + restart: unless-stopped + command: > + sh -c "python manage.py migrate && + python manage.py collectstatic --noinput && + gunicorn core.wsgi:application --bind 0.0.0.0:8000 --workers 4 --threads 2" + volumes: + - ./backend:/app + - static_volume:/app/staticfiles + ports: + - "8000:8000" + env_file: + - ./backend/.env + depends_on: + db: + condition: service_healthy + redis: + condition: service_healthy + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000/api/health/"] + interval: 30s + timeout: 10s + retries: 3 + + # ========================================================================== + # Celery Worker - Background Task Processing + # ========================================================================== + celery_worker: + build: + context: ./backend + dockerfile: Dockerfile + container_name: analyzer_celery_worker + restart: unless-stopped + command: celery -A core worker -l INFO --concurrency=2 + volumes: + - ./backend:/app + env_file: + - ./backend/.env + depends_on: + - db + - redis + - web + + # ========================================================================== + # Celery Beat - Scheduled Tasks (Optional) + # ========================================================================== + celery_beat: + build: + context: ./backend + dockerfile: Dockerfile + container_name: analyzer_celery_beat + restart: unless-stopped + command: celery -A core beat -l INFO + volumes: + - ./backend:/app + env_file: + - ./backend/.env + depends_on: + - db + - redis + - celery_worker + + # ========================================================================== + # OWASP ZAP - Security Scanner + # ========================================================================== + zap: + image: ghcr.io/zaproxy/zaproxy:stable + container_name: analyzer_zap + restart: unless-stopped + command: zap.sh -daemon -host 0.0.0.0 -port 8080 -config api.key=zap-api-key-change-me -config api.addrs.addr.name=.* -config api.addrs.addr.regex=true + ports: + - "8081:8080" + volumes: + - zap_data:/home/zap/.ZAP + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8080/JSON/core/view/version/?apikey=zap-api-key-change-me"] + interval: 30s + timeout: 10s + retries: 5 + + # ========================================================================== + # Lighthouse Scanner Service (Node.js) + # ========================================================================== + lighthouse: + build: + context: ./lighthouse + dockerfile: Dockerfile + container_name: analyzer_lighthouse + restart: unless-stopped + ports: + - "3001:3001" + volumes: + - lighthouse_reports:/app/reports + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:3001/health"] + interval: 30s + timeout: 10s + retries: 3 + +volumes: + postgres_data: + redis_data: + static_volume: + zap_data: + lighthouse_reports: + +networks: + default: + name: analyzer_network diff --git a/lighthouse/Dockerfile b/lighthouse/Dockerfile new file mode 100644 index 0000000..3cb2667 --- /dev/null +++ b/lighthouse/Dockerfile @@ -0,0 +1,54 @@ +# Lighthouse Scanner Service - Dockerfile +# Node.js service that runs Lighthouse CLI and provides HTTP API + +FROM node:20-slim + +# Install Chrome dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + chromium \ + fonts-liberation \ + libappindicator3-1 \ + libasound2 \ + libatk-bridge2.0-0 \ + libatk1.0-0 \ + 
libcups2 \ + libdbus-1-3 \ + libdrm2 \ + libgbm1 \ + libgtk-3-0 \ + libnspr4 \ + libnss3 \ + libxcomposite1 \ + libxdamage1 \ + libxfixes3 \ + libxkbcommon0 \ + libxrandr2 \ + xdg-utils \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# Set Chrome path for Lighthouse +ENV CHROME_PATH=/usr/bin/chromium + +WORKDIR /app + +# Copy package files +COPY package*.json ./ + +# Install dependencies +RUN npm ci --only=production + +# Copy application code +COPY . . + +# Create reports directory +RUN mkdir -p reports + +# Create non-root user +RUN useradd -m -u 1000 lighthouse && \ + chown -R lighthouse:lighthouse /app +USER lighthouse + +EXPOSE 3001 + +CMD ["node", "server.js"] diff --git a/lighthouse/package.json b/lighthouse/package.json new file mode 100644 index 0000000..0908b95 --- /dev/null +++ b/lighthouse/package.json @@ -0,0 +1,19 @@ +{ + "name": "lighthouse-scanner", + "version": "1.0.0", + "description": "Lighthouse scanner service for Website Analyzer", + "main": "server.js", + "scripts": { + "start": "node server.js", + "dev": "node --watch server.js" + }, + "dependencies": { + "express": "^4.18.2", + "lighthouse": "^11.4.0", + "chrome-launcher": "^1.1.0", + "uuid": "^9.0.0" + }, + "engines": { + "node": ">=18.0.0" + } +} diff --git a/lighthouse/server.js b/lighthouse/server.js new file mode 100644 index 0000000..f399f7f --- /dev/null +++ b/lighthouse/server.js @@ -0,0 +1,328 @@ +/** + * Lighthouse Scanner Service + * + * This service provides an HTTP API for running Lighthouse audits. + * It's designed to be called from the Django backend via Celery tasks. + */ + +const express = require('express'); +const lighthouse = require('lighthouse'); +const chromeLauncher = require('chrome-launcher'); +const { v4: uuidv4 } = require('uuid'); +const fs = require('fs').promises; +const path = require('path'); + +const app = express(); +app.use(express.json()); + +const PORT = process.env.PORT || 3001; +const REPORTS_DIR = path.join(__dirname, 'reports'); + +// Ensure reports directory exists +fs.mkdir(REPORTS_DIR, { recursive: true }).catch(console.error); + +/** + * Health check endpoint + */ +app.get('/health', (req, res) => { + res.json({ status: 'healthy', service: 'lighthouse-scanner' }); +}); + +/** + * Run Lighthouse audit for a given URL + * + * POST /scan + * Body: { "url": "https://example.com" } + * + * Returns: Lighthouse audit results as JSON + */ +app.post('/scan', async (req, res) => { + const { url } = req.body; + + if (!url) { + return res.status(400).json({ error: 'URL is required' }); + } + + // Validate URL format + try { + new URL(url); + } catch (e) { + return res.status(400).json({ error: 'Invalid URL format' }); + } + + const scanId = uuidv4(); + console.log(`[${scanId}] Starting Lighthouse scan for: ${url}`); + + let chrome = null; + + try { + // Launch Chrome + chrome = await chromeLauncher.launch({ + chromeFlags: [ + '--headless', + '--disable-gpu', + '--no-sandbox', + '--disable-dev-shm-usage', + '--disable-extensions', + '--disable-background-networking', + '--disable-sync', + '--disable-translate', + '--metrics-recording-only', + '--mute-audio', + '--no-first-run', + '--safebrowsing-disable-auto-update' + ] + }); + + console.log(`[${scanId}] Chrome launched on port ${chrome.port}`); + + // Lighthouse configuration + const options = { + logLevel: 'error', + output: 'json', + port: chrome.port, + onlyCategories: ['performance', 'accessibility', 'best-practices', 'seo'], + // Throttling settings for more realistic results + throttling: { + cpuSlowdownMultiplier: 4, + 
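+        // Together with the throughput/RTT figures below, this appears to
+        // mirror Lighthouse's simulated "slow 4G" preset (4x CPU slowdown,
+        // ~1.6 Mbps down, 150 ms RTT); a lighter profile is more typical
+        // when formFactor is 'desktop'.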
downloadThroughputKbps: 1638.4, + uploadThroughputKbps: 675, + rttMs: 150 + }, + screenEmulation: { + mobile: false, + width: 1920, + height: 1080, + deviceScaleFactor: 1, + disabled: false + }, + formFactor: 'desktop' + }; + + // Run Lighthouse + const runnerResult = await lighthouse(url, options); + + // Extract the report + const report = runnerResult.lhr; + + // Process and extract key metrics + const result = { + scanId, + url: report.finalUrl || url, + fetchTime: report.fetchTime, + + // Category scores (0-100) + scores: { + performance: Math.round((report.categories.performance?.score || 0) * 100), + accessibility: Math.round((report.categories.accessibility?.score || 0) * 100), + bestPractices: Math.round((report.categories['best-practices']?.score || 0) * 100), + seo: Math.round((report.categories.seo?.score || 0) * 100) + }, + + // Core Web Vitals and key metrics + metrics: { + firstContentfulPaint: { + value: report.audits['first-contentful-paint']?.numericValue || null, + unit: 'ms', + score: report.audits['first-contentful-paint']?.score || null + }, + largestContentfulPaint: { + value: report.audits['largest-contentful-paint']?.numericValue || null, + unit: 'ms', + score: report.audits['largest-contentful-paint']?.score || null + }, + speedIndex: { + value: report.audits['speed-index']?.numericValue || null, + unit: 'ms', + score: report.audits['speed-index']?.score || null + }, + timeToInteractive: { + value: report.audits['interactive']?.numericValue || null, + unit: 'ms', + score: report.audits['interactive']?.score || null + }, + totalBlockingTime: { + value: report.audits['total-blocking-time']?.numericValue || null, + unit: 'ms', + score: report.audits['total-blocking-time']?.score || null + }, + cumulativeLayoutShift: { + value: report.audits['cumulative-layout-shift']?.numericValue || null, + unit: 'score', + score: report.audits['cumulative-layout-shift']?.score || null + } + }, + + // JavaScript and resource audits + resources: { + totalByteWeight: report.audits['total-byte-weight']?.numericValue || null, + bootupTime: report.audits['bootup-time']?.numericValue || null, + mainThreadWork: report.audits['mainthread-work-breakdown']?.numericValue || null, + + // Unused resources + unusedJavascript: extractUnusedResources(report.audits['unused-javascript']), + unusedCss: extractUnusedResources(report.audits['unused-css-rules']), + + // Render blocking resources + renderBlockingResources: extractRenderBlockingResources(report.audits['render-blocking-resources']), + + // Large bundles + scriptTreemap: extractLargeScripts(report.audits['script-treemap-data']), + + // Third party usage + thirdPartySummary: extractThirdPartySummary(report.audits['third-party-summary']) + }, + + // Diagnostics + diagnostics: { + numRequests: report.audits['network-requests']?.details?.items?.length || 0, + numScripts: countResourcesByType(report.audits['network-requests'], 'Script'), + numStylesheets: countResourcesByType(report.audits['network-requests'], 'Stylesheet'), + numImages: countResourcesByType(report.audits['network-requests'], 'Image'), + numFonts: countResourcesByType(report.audits['network-requests'], 'Font'), + totalTransferSize: report.audits['total-byte-weight']?.numericValue || 0 + }, + + // Failed audits (potential issues) + issues: extractFailedAudits(report) + }; + + // Save full report to file for debugging + const reportPath = path.join(REPORTS_DIR, `${scanId}.json`); + await fs.writeFile(reportPath, JSON.stringify(report, null, 2)); + + console.log(`[${scanId}] Scan 
completed successfully`); + res.json(result); + + } catch (error) { + console.error(`[${scanId}] Scan failed:`, error); + res.status(500).json({ + error: 'Lighthouse scan failed', + message: error.message, + scanId + }); + } finally { + if (chrome) { + await chrome.kill(); + } + } +}); + +/** + * Get a saved report by ID + */ +app.get('/report/:scanId', async (req, res) => { + const { scanId } = req.params; + const reportPath = path.join(REPORTS_DIR, `${scanId}.json`); + + try { + const report = await fs.readFile(reportPath, 'utf8'); + res.json(JSON.parse(report)); + } catch (error) { + res.status(404).json({ error: 'Report not found' }); + } +}); + +// ============================================================================= +// Helper Functions +// ============================================================================= + +function extractUnusedResources(audit) { + if (!audit?.details?.items) return []; + + return audit.details.items.slice(0, 10).map(item => ({ + url: item.url, + totalBytes: item.totalBytes, + wastedBytes: item.wastedBytes, + wastedPercent: item.wastedPercent + })); +} + +function extractRenderBlockingResources(audit) { + if (!audit?.details?.items) return []; + + return audit.details.items.map(item => ({ + url: item.url, + wastedMs: item.wastedMs, + totalBytes: item.totalBytes + })); +} + +function extractLargeScripts(audit) { + if (!audit?.details?.nodes) return []; + + // Get scripts larger than 100KB + const largeScripts = []; + const processNode = (node, path = '') => { + const currentPath = path ? `${path}/${node.name}` : node.name; + + if (node.resourceBytes > 100 * 1024) { + largeScripts.push({ + name: currentPath, + resourceBytes: node.resourceBytes, + unusedBytes: node.unusedBytes || 0 + }); + } + + if (node.children) { + node.children.forEach(child => processNode(child, currentPath)); + } + }; + + audit.details.nodes.forEach(node => processNode(node)); + return largeScripts.slice(0, 20); +} + +function extractThirdPartySummary(audit) { + if (!audit?.details?.items) return []; + + return audit.details.items.slice(0, 10).map(item => ({ + entity: item.entity, + transferSize: item.transferSize, + blockingTime: item.blockingTime, + mainThreadTime: item.mainThreadTime + })); +} + +function countResourcesByType(audit, type) { + if (!audit?.details?.items) return 0; + return audit.details.items.filter(item => item.resourceType === type).length; +} + +function extractFailedAudits(report) { + const issues = []; + + const categoriesToCheck = ['performance', 'accessibility', 'best-practices', 'seo']; + + categoriesToCheck.forEach(categoryId => { + const category = report.categories[categoryId]; + if (!category?.auditRefs) return; + + category.auditRefs.forEach(ref => { + const audit = report.audits[ref.id]; + + // Include audits with score < 0.5 (50%) + if (audit && audit.score !== null && audit.score < 0.5) { + issues.push({ + id: audit.id, + category: categoryId, + title: audit.title, + description: audit.description, + score: audit.score, + displayValue: audit.displayValue, + impact: ref.weight || 0 + }); + } + }); + }); + + // Sort by impact (weight) descending + issues.sort((a, b) => b.impact - a.impact); + + return issues.slice(0, 30); +} + +// Start the server +app.listen(PORT, '0.0.0.0', () => { + console.log(`Lighthouse Scanner Service running on port ${PORT}`); +});
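
A quick smoke test for the Lighthouse service once the compose stack is up (host and port taken from docker-compose.yml above; the target URL is arbitrary):

    curl -X POST http://localhost:3001/scan \
         -H 'Content-Type: application/json' \
         -d '{"url": "https://example.com"}'

The response is the condensed summary assembled in server.js, and the full saved report can be fetched afterwards from GET /report/<scanId>. One caveat: lighthouse ^11 and chrome-launcher ^1 are published as ES modules, so the require() calls in server.js may need to become dynamic import() calls for the service to start under Node 20.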