""" Lighthouse Scanner Integration. This module integrates with the Lighthouse scanner service to perform performance, accessibility, SEO, and best practices audits. """ import time import logging from typing import Optional import httpx from django.conf import settings from .base import BaseScanner, ScannerResult, ScannerStatus logger = logging.getLogger('scanner') class LighthouseScanner(BaseScanner): """ Scanner that integrates with the Lighthouse service. Lighthouse audits: - Performance (FCP, LCP, TTI, TBT, CLS, Speed Index) - Accessibility - Best Practices - SEO """ name = "lighthouse" def __init__(self, config: dict = None): super().__init__(config) self.service_url = self.config.get( 'lighthouse_url', settings.SCANNER_CONFIG.get('LIGHTHOUSE_URL', 'http://lighthouse:3001') ) self.timeout = self.config.get('timeout', 120) def is_available(self) -> bool: """Check if Lighthouse service is available.""" try: response = httpx.get( f"{self.service_url}/health", timeout=5 ) return response.status_code == 200 except Exception as e: self.logger.warning(f"Lighthouse service not available: {e}") return False def run(self, url: str) -> ScannerResult: """ Run Lighthouse audit on the given URL. Args: url: The URL to audit Returns: ScannerResult with Lighthouse data """ start_time = time.time() if not self.is_available(): return ScannerResult( status=ScannerStatus.FAILED, scanner_name=self.name, error_message="Lighthouse service is not available", execution_time_seconds=time.time() - start_time ) try: # Call Lighthouse service response = httpx.post( f"{self.service_url}/scan", json={"url": url}, timeout=self.timeout ) if response.status_code != 200: return ScannerResult( status=ScannerStatus.FAILED, scanner_name=self.name, error_message=f"Lighthouse returned status {response.status_code}: {response.text}", execution_time_seconds=time.time() - start_time ) data = response.json() # Extract scores scores = { 'performance': data.get('scores', {}).get('performance'), 'accessibility': data.get('scores', {}).get('accessibility'), 'best_practices': data.get('scores', {}).get('bestPractices'), 'seo': data.get('scores', {}).get('seo'), } # Extract metrics metrics = self._extract_metrics(data) # Extract issues issues = self._extract_issues(data) execution_time = time.time() - start_time return ScannerResult( status=ScannerStatus.SUCCESS, scanner_name=self.name, scores=scores, metrics=metrics, issues=issues, raw_data=data, execution_time_seconds=execution_time ) except httpx.TimeoutException: return ScannerResult( status=ScannerStatus.FAILED, scanner_name=self.name, error_message="Lighthouse scan timed out", execution_time_seconds=time.time() - start_time ) except httpx.RequestError as e: return ScannerResult( status=ScannerStatus.FAILED, scanner_name=self.name, error_message=f"Lighthouse request failed: {e}", execution_time_seconds=time.time() - start_time ) except Exception as e: logger.exception(f"Lighthouse scan failed for {url}") return ScannerResult( status=ScannerStatus.FAILED, scanner_name=self.name, error_message=f"Unexpected error: {e}", execution_time_seconds=time.time() - start_time ) def _extract_metrics(self, data: dict) -> list: """Extract key metrics from Lighthouse data.""" metrics = [] # Core Web Vitals and performance metrics metrics_config = { 'first_contentful_paint': ('First Contentful Paint', 'firstContentfulPaint', 'ms'), 'largest_contentful_paint': ('Largest Contentful Paint', 'largestContentfulPaint', 'ms'), 'speed_index': ('Speed Index', 'speedIndex', 'ms'), 'time_to_interactive': 
    def _extract_metrics(self, data: dict) -> list:
        """Extract key metrics from Lighthouse data."""
        metrics = []

        # Core Web Vitals and performance metrics
        metrics_config = {
            'first_contentful_paint': ('First Contentful Paint', 'firstContentfulPaint', 'ms'),
            'largest_contentful_paint': ('Largest Contentful Paint', 'largestContentfulPaint', 'ms'),
            'speed_index': ('Speed Index', 'speedIndex', 'ms'),
            'time_to_interactive': ('Time to Interactive', 'timeToInteractive', 'ms'),
            'total_blocking_time': ('Total Blocking Time', 'totalBlockingTime', 'ms'),
            'cumulative_layout_shift': ('Cumulative Layout Shift', 'cumulativeLayoutShift', 'score'),
        }

        lh_metrics = data.get('metrics', {})
        for metric_name, (display_name, lh_key, unit) in metrics_config.items():
            metric_data = lh_metrics.get(lh_key, {})
            if metric_data and metric_data.get('value') is not None:
                metrics.append(self._create_metric(
                    name=metric_name,
                    display_name=display_name,
                    value=metric_data['value'],
                    unit=unit,
                    score=metric_data.get('score')
                ))

        # Resource metrics
        resources = data.get('resources', {})
        diagnostics = data.get('diagnostics', {})

        if resources.get('totalByteWeight'):
            metrics.append(self._create_metric(
                name='total_byte_weight',
                display_name='Total Page Weight',
                value=resources['totalByteWeight'],
                unit='bytes'
            ))

        if diagnostics.get('numRequests'):
            metrics.append(self._create_metric(
                name='num_requests',
                display_name='Total Requests',
                value=diagnostics['numRequests'],
                unit='count'
            ))

        if diagnostics.get('numScripts'):
            metrics.append(self._create_metric(
                name='num_scripts',
                display_name='JavaScript Files',
                value=diagnostics['numScripts'],
                unit='count'
            ))

        if diagnostics.get('totalTransferSize'):
            metrics.append(self._create_metric(
                name='total_transfer_size',
                display_name='Total Transfer Size',
                value=diagnostics['totalTransferSize'],
                unit='bytes'
            ))

        return metrics
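    # Severity mapping used below, shown with worked examples. The thresholds
    # are this module's own heuristics, not anything defined by Lighthouse:
    #
    #   score == 0 and impact > 5   -> 'high'    e.g. score 0,   impact 7
    #   score < 0.5 and impact > 3  -> 'medium'  e.g. score 0.3, impact 4
    #   score < 0.5                 -> 'low'     e.g. score 0.3, impact 2
    #   otherwise                   -> 'info'    e.g. score 0.9, any impact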
    def _extract_issues(self, data: dict) -> list:
        """Extract issues from Lighthouse audit results."""
        issues = []

        # Convert Lighthouse issues to our format
        lh_issues = data.get('issues', [])

        # Map Lighthouse categories to our categories
        category_map = {
            'performance': 'performance',
            'accessibility': 'accessibility',
            'best-practices': 'best_practices',
            'seo': 'seo',
        }

        for lh_issue in lh_issues:
            # Determine severity based on score and impact. A missing or null
            # score/impact is treated as 0 (a failed audit) so the comparisons
            # below cannot raise on None.
            score = lh_issue.get('score') or 0
            impact = lh_issue.get('impact') or 0

            if score == 0 and impact > 5:
                severity = 'high'
            elif score < 0.5 and impact > 3:
                severity = 'medium'
            elif score < 0.5:
                severity = 'low'
            else:
                severity = 'info'

            category = category_map.get(lh_issue.get('category'), 'performance')

            issues.append(self._create_issue(
                category=category,
                severity=severity,
                title=lh_issue.get('title', 'Unknown issue'),
                description=lh_issue.get('description', ''),
                raw_data={
                    'id': lh_issue.get('id'),
                    'displayValue': lh_issue.get('displayValue'),
                    'score': score,
                    'impact': impact,
                }
            ))

        # Check for unused resources
        resources = data.get('resources', {})

        # Unused JavaScript
        unused_js = resources.get('unusedJavascript', [])
        for item in unused_js[:5]:  # Top 5
            if item.get('wastedBytes', 0) > 50000:  # > 50KB wasted
                issues.append(self._create_issue(
                    category='performance',
                    severity='medium',
                    title='Unused JavaScript',
                    description=(
                        f"Remove unused JavaScript to reduce payload. "
                        f"{item.get('url', '')} has "
                        f"{item.get('wastedBytes', 0) / 1024:.1f}KB unused."
                    ),
                    remediation='Remove unused JavaScript code or use code splitting to load only what is needed.',
                    raw_data=item
                ))

        # Unused CSS
        unused_css = resources.get('unusedCss', [])
        for item in unused_css[:5]:
            if item.get('wastedBytes', 0) > 20000:  # > 20KB wasted
                issues.append(self._create_issue(
                    category='performance',
                    severity='low',
                    title='Unused CSS',
                    description=(
                        f"Remove unused CSS rules. {item.get('url', '')} has "
                        f"{item.get('wastedBytes', 0) / 1024:.1f}KB unused."
                    ),
                    remediation='Use tools like PurgeCSS to remove unused CSS.',
                    raw_data=item
                ))

        # Render-blocking resources
        blocking = resources.get('renderBlockingResources', [])
        if len(blocking) > 3:
            issues.append(self._create_issue(
                category='performance',
                severity='medium',
                title='Multiple render-blocking resources',
                description=f'Found {len(blocking)} render-blocking resources that delay page rendering.',
                remediation='Defer non-critical JavaScript and inline critical CSS.',
                raw_data={'resources': blocking[:10]}
            ))

        # Large JavaScript bundles
        large_scripts = resources.get('scriptTreemap', [])
        for script in large_scripts[:5]:
            if script.get('resourceBytes', 0) > 500000:  # > 500KB
                issues.append(self._create_issue(
                    category='resources',
                    severity='medium',
                    title='Large JavaScript bundle',
                    description=(
                        f"Large script bundle detected: "
                        f"{script.get('name', 'Unknown')} "
                        f"({script.get('resourceBytes', 0) / 1024:.1f}KB)"
                    ),
                    remediation='Consider code splitting and lazy loading to reduce bundle size.',
                    raw_data=script
                ))

        # Third-party impact
        third_party = resources.get('thirdPartySummary', [])
        high_impact_third_party = [
            tp for tp in third_party
            if tp.get('blockingTime', 0) > 500  # > 500ms blocking
        ]
        if high_impact_third_party:
            issues.append(self._create_issue(
                category='performance',
                severity='medium',
                title='Third-party scripts impacting performance',
                description=(
                    f'{len(high_impact_third_party)} third-party scripts are '
                    f'significantly impacting page load time.'
                ),
                remediation='Consider lazy loading third-party scripts or using async/defer attributes.',
                raw_data={'third_parties': high_impact_third_party}
            ))

        return issues
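
# A minimal usage sketch, assuming DJANGO_SETTINGS_MODULE points at a settings
# module that defines SCANNER_CONFIG and the Lighthouse service is reachable.
# "https://example.com" is a placeholder target, not anything this project
# audits by default.
if __name__ == "__main__":
    import django

    django.setup()

    scanner = LighthouseScanner()
    result = scanner.run("https://example.com")
    print(result.status, result.scores)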