secure-web/backend/scanner/scanners/lighthouse.py

"""
Lighthouse Scanner Integration.

This module integrates with Google Lighthouse to measure
performance, accessibility, SEO, and best practices.
"""

import logging
from typing import Any, Dict, Optional

import httpx

from django.conf import settings

from .base import (
    BaseScanner,
    ScannerResult,
    ScannerStatus,
    IssueData,
    MetricData,
)

logger = logging.getLogger(__name__)


class LighthouseScanner(BaseScanner):
    """
    Scanner that uses Google Lighthouse for performance analysis.

    Communicates with the Lighthouse service container via HTTP API.
    Collects performance metrics, Core Web Vitals, and various audits.
    """

    name = "lighthouse"

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        super().__init__(config)
        self.service_url = self.config.get(
            'service_url',
            'http://lighthouse:3001'
        )
        self.timeout = self.config.get('timeout', 120)

    def is_available(self) -> bool:
        """Check if Lighthouse service is available."""
        try:
            with httpx.Client(timeout=5) as client:
                response = client.get(f"{self.service_url}/health")
                return response.status_code == 200
        except Exception as e:
            self.logger.warning(f"Lighthouse service not available: {e}")
            return False

    def run(self, url: str) -> ScannerResult:
        """
        Run Lighthouse scan against the URL.

        Args:
            url: The URL to analyze

        Returns:
            ScannerResult with performance metrics and issues
        """
        self.logger.info(f"Starting Lighthouse scan for {url}")

        try:
            with httpx.Client(timeout=self.timeout) as client:
                response = client.post(
                    f"{self.service_url}/scan",
                    json={"url": url}
                )
                response.raise_for_status()
                data = response.json()

            return self._parse_results(url, data)

        except httpx.TimeoutException:
            return self._create_error_result(
                Exception("Lighthouse scan timed out")
            )
        except httpx.HTTPStatusError as e:
            return self._create_error_result(
                Exception(f"Lighthouse service error: {e.response.status_code}")
            )
        except Exception as e:
            return self._create_error_result(e)

    def _parse_results(self, url: str, data: Dict[str, Any]) -> ScannerResult:
        """
        Parse Lighthouse results into ScannerResult format.

        Args:
            url: The scanned URL
            data: Raw Lighthouse response data

        Returns:
            Parsed ScannerResult
        """
        issues = []
        metrics = []

        # Extract scores
        scores = data.get('scores', {})

        # Extract and create metrics
        raw_metrics = data.get('metrics', {})

        # Core Web Vitals
        metric_mappings = [
            ('firstContentfulPaint', 'First Contentful Paint', 'ms'),
            ('largestContentfulPaint', 'Largest Contentful Paint', 'ms'),
            ('speedIndex', 'Speed Index', 'ms'),
            ('timeToInteractive', 'Time to Interactive', 'ms'),
            ('totalBlockingTime', 'Total Blocking Time', 'ms'),
            ('cumulativeLayoutShift', 'Cumulative Layout Shift', 'score'),
        ]

        for key, display_name, unit in metric_mappings:
            metric_data = raw_metrics.get(key, {})
            if metric_data and metric_data.get('value') is not None:
                metrics.append(MetricData(
                    name=self._to_snake_case(key),
                    display_name=display_name,
                    value=metric_data['value'],
                    unit=unit,
                    source='lighthouse',
                    score=metric_data.get('score')
                ))

        # Resource metrics
        resources = data.get('resources', {})
        diagnostics = data.get('diagnostics', {})

        if resources.get('totalByteWeight'):
            metrics.append(MetricData(
                name='total_byte_weight',
                display_name='Total Page Weight',
                value=resources['totalByteWeight'],
                unit='bytes',
                source='lighthouse'
            ))

        if resources.get('bootupTime'):
            metrics.append(MetricData(
                name='javascript_bootup_time',
                display_name='JavaScript Boot-up Time',
                value=resources['bootupTime'],
                unit='ms',
                source='lighthouse'
            ))

        if diagnostics.get('numRequests'):
            metrics.append(MetricData(
                name='total_requests',
                display_name='Total Network Requests',
                value=float(diagnostics['numRequests']),
                unit='count',
                source='lighthouse'
            ))

        # Extract issues from failed audits
        raw_issues = data.get('issues', [])
        for issue in raw_issues:
            severity = self._score_to_severity(issue.get('score', 0.5))
            category = self._map_category(issue.get('category', 'performance'))

            issues.append(IssueData(
                category=category,
                severity=severity,
                title=issue.get('title', 'Unknown issue'),
                description=issue.get('description', ''),
                tool='lighthouse',
                affected_url=url,
                remediation=self._get_remediation(issue.get('id')),
                raw_data=issue
            ))

        # Check for large bundles
        large_scripts = resources.get('scriptTreemap', [])
        for script in large_scripts[:5]:  # Top 5 largest
            if script.get('resourceBytes', 0) > settings.SCANNER_CONFIG.get(
                'LARGE_JS_BUNDLE_THRESHOLD_BYTES', 500 * 1024
            ):
                issues.append(IssueData(
                    category='resources',
                    severity='medium',
                    title=f"Large JavaScript bundle detected",
                    description=(
                        f"The script '{script.get('name', 'Unknown')}' "
                        f"is {script['resourceBytes'] / 1024:.1f} KB. "
                        "Large bundles can slow down page load and increase memory usage."
                    ),
                    tool='lighthouse',
                    affected_url=url,
                    remediation=(
                        "Consider code splitting, tree shaking, or lazy loading "
                        "to reduce bundle size."
                    ),
                    raw_data=script
                ))

        # Check for unused JavaScript
        unused_js = resources.get('unusedJavascript', [])
        if unused_js:
            total_wasted = sum(u.get('wastedBytes', 0) for u in unused_js)
            if total_wasted > 100 * 1024:  # More than 100KB unused
                issues.append(IssueData(
                    category='performance',
                    severity='medium',
                    title="Significant unused JavaScript detected",
                    description=(
                        f"Found {total_wasted / 1024:.1f} KB of unused JavaScript "
                        f"across {len(unused_js)} resources. This increases page "
                        "load time and memory usage."
                    ),
                    tool='lighthouse',
                    affected_url=url,
                    remediation=(
                        "Remove unused code or use code splitting to load "
                        "JavaScript only when needed."
                    ),
                    raw_data={'unused_resources': unused_js}
                ))

        # Check for render-blocking resources
        blocking = resources.get('renderBlockingResources', [])
        if blocking:
            total_wasted_ms = sum(r.get('wastedMs', 0) for r in blocking)
            if total_wasted_ms > 500:
                issues.append(IssueData(
                    category='performance',
                    severity='medium',
                    title="Render-blocking resources detected",
                    description=(
                        f"Found {len(blocking)} render-blocking resources "
                        f"adding approximately {total_wasted_ms:.0f}ms to page load. "
                        "These resources delay first paint."
                    ),
                    tool='lighthouse',
                    affected_url=url,
                    remediation=(
                        "Consider inlining critical CSS, deferring non-critical JS, "
                        "or using async/defer attributes."
                    ),
                    raw_data={'blocking_resources': blocking}
                ))

        self.logger.info(
            f"Lighthouse scan complete: {len(issues)} issues, {len(metrics)} metrics"
        )

        return ScannerResult(
            scanner_name=self.name,
            status=ScannerStatus.SUCCESS,
            issues=issues,
            metrics=metrics,
            scores={
                'performance': scores.get('performance', 0),
                'accessibility': scores.get('accessibility', 0),
                'best_practices': scores.get('bestPractices', 0),
                'seo': scores.get('seo', 0),
            },
            raw_data=data
        )

    def _to_snake_case(self, name: str) -> str:
        """Convert camelCase to snake_case."""
        import re
        s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
        return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()

    def _score_to_severity(self, score: float) -> str:
        """Convert Lighthouse score to severity level."""
        if score is None:
            return 'info'
        elif score < 0.25:
            return 'high'
        elif score < 0.5:
            return 'medium'
        elif score < 0.75:
            return 'low'
        else:
            return 'info'

    def _map_category(self, lighthouse_category: str) -> str:
        """Map Lighthouse category to our category."""
        mapping = {
            'performance': 'performance',
            'accessibility': 'accessibility',
            'best-practices': 'best_practices',
            'seo': 'seo',
        }
        return mapping.get(lighthouse_category, 'performance')

    def _get_remediation(self, audit_id: str) -> str:
        """Get remediation text for known audit IDs."""
        remediations = {
            'first-contentful-paint': (
                "Reduce server response time, eliminate render-blocking resources, "
                "and optimize critical rendering path."
            ),
            'largest-contentful-paint': (
                "Optimize images, preload critical resources, and reduce server "
                "response time."
            ),
            'total-blocking-time': (
                "Reduce JavaScript execution time by breaking up long tasks, "
                "removing unused code, and minimizing main thread work."
            ),
            'cumulative-layout-shift': (
                "Always include size attributes on images and videos, reserve space "
                "for ad slots, and avoid inserting content above existing content."
            ),
            'speed-index': (
                "Minimize main thread work, reduce JavaScript execution time, "
                "and ensure text remains visible during webfont load."
            ),
            'interactive': (
                "Reduce JavaScript payload, defer non-critical scripts, and "
                "minimize main thread work."
            ),
        }
        return remediations.get(audit_id, "Review and optimize based on the audit details.")