""" Lighthouse Scanner Integration. This module integrates with Google Lighthouse to measure performance, accessibility, SEO, and best practices. """ import logging from typing import Any, Dict, Optional import httpx from django.conf import settings from .base import ( BaseScanner, ScannerResult, ScannerStatus, IssueData, MetricData, ) logger = logging.getLogger(__name__) class LighthouseScanner(BaseScanner): """ Scanner that uses Google Lighthouse for performance analysis. Communicates with the Lighthouse service container via HTTP API. Collects performance metrics, Core Web Vitals, and various audits. """ name = "lighthouse" def __init__(self, config: Optional[Dict[str, Any]] = None): super().__init__(config) self.service_url = self.config.get( 'service_url', 'http://lighthouse:3001' ) self.timeout = self.config.get('timeout', 120) def is_available(self) -> bool: """Check if Lighthouse service is available.""" try: with httpx.Client(timeout=5) as client: response = client.get(f"{self.service_url}/health") return response.status_code == 200 except Exception as e: self.logger.warning(f"Lighthouse service not available: {e}") return False def run(self, url: str) -> ScannerResult: """ Run Lighthouse scan against the URL. Args: url: The URL to analyze Returns: ScannerResult with performance metrics and issues """ self.logger.info(f"Starting Lighthouse scan for {url}") try: with httpx.Client(timeout=self.timeout) as client: response = client.post( f"{self.service_url}/scan", json={"url": url} ) response.raise_for_status() data = response.json() return self._parse_results(url, data) except httpx.TimeoutException: return self._create_error_result( Exception("Lighthouse scan timed out") ) except httpx.HTTPStatusError as e: return self._create_error_result( Exception(f"Lighthouse service error: {e.response.status_code}") ) except Exception as e: return self._create_error_result(e) def _parse_results(self, url: str, data: Dict[str, Any]) -> ScannerResult: """ Parse Lighthouse results into ScannerResult format. 
    def _parse_results(self, url: str, data: Dict[str, Any]) -> ScannerResult:
        """
        Parse Lighthouse results into ScannerResult format.

        Args:
            url: The scanned URL
            data: Raw Lighthouse response data

        Returns:
            Parsed ScannerResult
        """
        issues = []
        metrics = []

        # Extract category scores
        scores = data.get('scores', {})

        # Extract and create metrics
        raw_metrics = data.get('metrics', {})

        # Core Web Vitals and related timing metrics
        metric_mappings = [
            ('firstContentfulPaint', 'First Contentful Paint', 'ms'),
            ('largestContentfulPaint', 'Largest Contentful Paint', 'ms'),
            ('speedIndex', 'Speed Index', 'ms'),
            ('timeToInteractive', 'Time to Interactive', 'ms'),
            ('totalBlockingTime', 'Total Blocking Time', 'ms'),
            ('cumulativeLayoutShift', 'Cumulative Layout Shift', 'score'),
        ]

        for key, display_name, unit in metric_mappings:
            metric_data = raw_metrics.get(key, {})
            if metric_data and metric_data.get('value') is not None:
                metrics.append(MetricData(
                    name=self._to_snake_case(key),
                    display_name=display_name,
                    value=metric_data['value'],
                    unit=unit,
                    source='lighthouse',
                    score=metric_data.get('score')
                ))

        # Resource metrics
        resources = data.get('resources', {})
        diagnostics = data.get('diagnostics', {})

        if resources.get('totalByteWeight'):
            metrics.append(MetricData(
                name='total_byte_weight',
                display_name='Total Page Weight',
                value=resources['totalByteWeight'],
                unit='bytes',
                source='lighthouse'
            ))

        if resources.get('bootupTime'):
            metrics.append(MetricData(
                name='javascript_bootup_time',
                display_name='JavaScript Boot-up Time',
                value=resources['bootupTime'],
                unit='ms',
                source='lighthouse'
            ))

        if diagnostics.get('numRequests'):
            metrics.append(MetricData(
                name='total_requests',
                display_name='Total Network Requests',
                value=float(diagnostics['numRequests']),
                unit='count',
                source='lighthouse'
            ))

        # Extract issues from failed audits
        raw_issues = data.get('issues', [])
        for issue in raw_issues:
            severity = self._score_to_severity(issue.get('score', 0.5))
            category = self._map_category(issue.get('category', 'performance'))

            issues.append(IssueData(
                category=category,
                severity=severity,
                title=issue.get('title', 'Unknown issue'),
                description=issue.get('description', ''),
                tool='lighthouse',
                affected_url=url,
                remediation=self._get_remediation(issue.get('id')),
                raw_data=issue
            ))

        # Check for large bundles; sort by size so the slice really is
        # the five largest scripts regardless of input ordering
        large_scripts = sorted(
            resources.get('scriptTreemap', []),
            key=lambda s: s.get('resourceBytes', 0),
            reverse=True,
        )
        for script in large_scripts[:5]:
            if script.get('resourceBytes', 0) > settings.SCANNER_CONFIG.get(
                'LARGE_JS_BUNDLE_THRESHOLD_BYTES', 500 * 1024
            ):
                issues.append(IssueData(
                    category='resources',
                    severity='medium',
                    title="Large JavaScript bundle detected",
                    description=(
                        f"The script '{script.get('name', 'Unknown')}' "
                        f"is {script['resourceBytes'] / 1024:.1f} KB. "
                        "Large bundles can slow down page load and increase memory usage."
                    ),
                    tool='lighthouse',
                    affected_url=url,
                    remediation=(
                        "Consider code splitting, tree shaking, or lazy loading "
                        "to reduce bundle size."
                    ),
                    raw_data=script
                ))

        # Check for unused JavaScript
        unused_js = resources.get('unusedJavascript', [])
        if unused_js:
            total_wasted = sum(u.get('wastedBytes', 0) for u in unused_js)
            if total_wasted > 100 * 1024:  # More than 100 KB unused
                issues.append(IssueData(
                    category='performance',
                    severity='medium',
                    title="Significant unused JavaScript detected",
                    description=(
                        f"Found {total_wasted / 1024:.1f} KB of unused JavaScript "
                        f"across {len(unused_js)} resources. This increases page "
                        "load time and memory usage."
                    ),
                    tool='lighthouse',
                    affected_url=url,
                    remediation=(
                        "Remove unused code or use code splitting to load "
                        "JavaScript only when needed."
                    ),
                    raw_data={'unused_resources': unused_js}
                ))

        # Check for render-blocking resources
        blocking = resources.get('renderBlockingResources', [])
        if blocking:
            total_wasted_ms = sum(r.get('wastedMs', 0) for r in blocking)
            if total_wasted_ms > 500:
                issues.append(IssueData(
                    category='performance',
                    severity='medium',
                    title="Render-blocking resources detected",
                    description=(
                        f"Found {len(blocking)} render-blocking resources "
                        f"adding approximately {total_wasted_ms:.0f}ms to page load. "
                        "These resources delay first paint."
                    ),
                    tool='lighthouse',
                    affected_url=url,
                    remediation=(
                        "Consider inlining critical CSS, deferring non-critical JS, "
                        "or using async/defer attributes."
                    ),
                    raw_data={'blocking_resources': blocking}
                ))

        self.logger.info(
            f"Lighthouse scan complete: {len(issues)} issues, {len(metrics)} metrics"
        )

        return ScannerResult(
            scanner_name=self.name,
            status=ScannerStatus.SUCCESS,
            issues=issues,
            metrics=metrics,
            scores={
                'performance': scores.get('performance', 0),
                'accessibility': scores.get('accessibility', 0),
                'best_practices': scores.get('bestPractices', 0),
                'seo': scores.get('seo', 0),
            },
            raw_data=data
        )
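
    # Helper behavior at a glance, derived from the implementations below:
    #   _to_snake_case: 'largestContentfulPaint' -> 'largest_contentful_paint'
    #                   'speedIndex'             -> 'speed_index'
    #   _score_to_severity: None -> 'info', < 0.25 -> 'high',
    #                       < 0.5 -> 'medium', < 0.75 -> 'low', else 'info'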
    def _to_snake_case(self, name: str) -> str:
        """Convert camelCase to snake_case."""
        s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
        return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()

    def _score_to_severity(self, score: Optional[float]) -> str:
        """Convert a Lighthouse audit score to a severity level."""
        if score is None:
            return 'info'
        elif score < 0.25:
            return 'high'
        elif score < 0.5:
            return 'medium'
        elif score < 0.75:
            return 'low'
        else:
            return 'info'

    def _map_category(self, lighthouse_category: str) -> str:
        """Map a Lighthouse category to our internal category."""
        mapping = {
            'performance': 'performance',
            'accessibility': 'accessibility',
            'best-practices': 'best_practices',
            'seo': 'seo',
        }
        return mapping.get(lighthouse_category, 'performance')

    def _get_remediation(self, audit_id: Optional[str]) -> str:
        """Get remediation text for known audit IDs."""
        remediations = {
            'first-contentful-paint': (
                "Reduce server response time, eliminate render-blocking resources, "
                "and optimize critical rendering path."
            ),
            'largest-contentful-paint': (
                "Optimize images, preload critical resources, and reduce server "
                "response time."
            ),
            'total-blocking-time': (
                "Reduce JavaScript execution time by breaking up long tasks, "
                "removing unused code, and minimizing main thread work."
            ),
            'cumulative-layout-shift': (
                "Always include size attributes on images and videos, reserve space "
                "for ad slots, and avoid inserting content above existing content."
            ),
            'speed-index': (
                "Minimize main thread work, reduce JavaScript execution time, "
                "and ensure text remains visible during webfont load."
            ),
            'interactive': (
                "Reduce JavaScript payload, defer non-critical scripts, and "
                "minimize main thread work."
            ),
        }
        return remediations.get(audit_id, "Review and optimize based on the audit details.")
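

# A minimal usage sketch, assuming Django settings are configured, the
# Lighthouse service container is reachable, and ScannerResult/IssueData
# expose their fields as attributes. The localhost override is illustrative,
# not a required setting:
#
#     scanner = LighthouseScanner({'service_url': 'http://localhost:3001'})
#     if scanner.is_available():
#         result = scanner.run('https://example.com')
#         print(result.scores)
#         for issue in result.issues:
#             print(issue.severity, issue.title)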