secure-web/backend/scanner/lighthouse_scanner.py

"""
Lighthouse Scanner Integration.
This module integrates with the Lighthouse scanner service
to perform performance, accessibility, SEO, and best practices audits.
"""
import time
import logging
from typing import Optional
import httpx
from django.conf import settings
from .base import BaseScanner, ScannerResult, ScannerStatus
logger = logging.getLogger('scanner')
class LighthouseScanner(BaseScanner):
"""
Scanner that integrates with the Lighthouse service.
Lighthouse audits:
- Performance (FCP, LCP, TTI, TBT, CLS, Speed Index)
- Accessibility
- Best Practices
- SEO
"""
name = "lighthouse"
def __init__(self, config: dict = None):
super().__init__(config)
self.service_url = self.config.get(
'lighthouse_url',
settings.SCANNER_CONFIG.get('LIGHTHOUSE_URL', 'http://lighthouse:3001')
)
self.timeout = self.config.get('timeout', 120)
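
    # Config precedence, as implemented in __init__ above: an explicit
    # 'lighthouse_url' key in the per-scanner config wins, then
    # settings.SCANNER_CONFIG['LIGHTHOUSE_URL'], then the Docker-Compose-style
    # default 'http://lighthouse:3001'.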

    def is_available(self) -> bool:
        """Check if the Lighthouse service is available."""
        try:
            response = httpx.get(
                f"{self.service_url}/health",
                timeout=5
            )
            return response.status_code == 200
        except Exception as e:
            self.logger.warning(f"Lighthouse service not available: {e}")
            return False

    def run(self, url: str) -> ScannerResult:
        """
        Run Lighthouse audit on the given URL.

        Args:
            url: The URL to audit

        Returns:
            ScannerResult with Lighthouse data
        """
        start_time = time.time()

        if not self.is_available():
            return ScannerResult(
                status=ScannerStatus.FAILED,
                scanner_name=self.name,
                error_message="Lighthouse service is not available",
                execution_time_seconds=time.time() - start_time
            )

        try:
            # Call the Lighthouse service
            response = httpx.post(
                f"{self.service_url}/scan",
                json={"url": url},
                timeout=self.timeout
            )

            if response.status_code != 200:
                return ScannerResult(
                    status=ScannerStatus.FAILED,
                    scanner_name=self.name,
                    error_message=f"Lighthouse returned status {response.status_code}: {response.text}",
                    execution_time_seconds=time.time() - start_time
                )

            data = response.json()

            # Extract category scores
            scores = {
                'performance': data.get('scores', {}).get('performance'),
                'accessibility': data.get('scores', {}).get('accessibility'),
                'best_practices': data.get('scores', {}).get('bestPractices'),
                'seo': data.get('scores', {}).get('seo'),
            }
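
            # Expected response shape (an assumption inferred from the keys
            # read above; the 0-1 scale matches Lighthouse's JSON output):
            #   {"scores": {"performance": 0.92, "accessibility": 0.88,
            #               "bestPractices": 0.95, "seo": 1.0}, ...}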

            # Extract metrics and issues
            metrics = self._extract_metrics(data)
            issues = self._extract_issues(data)

            execution_time = time.time() - start_time
            return ScannerResult(
                status=ScannerStatus.SUCCESS,
                scanner_name=self.name,
                scores=scores,
                metrics=metrics,
                issues=issues,
                raw_data=data,
                execution_time_seconds=execution_time
            )
        except httpx.TimeoutException:
            return ScannerResult(
                status=ScannerStatus.FAILED,
                scanner_name=self.name,
                error_message="Lighthouse scan timed out",
                execution_time_seconds=time.time() - start_time
            )
        except httpx.RequestError as e:
            return ScannerResult(
                status=ScannerStatus.FAILED,
                scanner_name=self.name,
                error_message=f"Lighthouse request failed: {e}",
                execution_time_seconds=time.time() - start_time
            )
        except Exception as e:
            logger.exception(f"Lighthouse scan failed for {url}")
            return ScannerResult(
                status=ScannerStatus.FAILED,
                scanner_name=self.name,
                error_message=f"Unexpected error: {e}",
                execution_time_seconds=time.time() - start_time
            )

    def _extract_metrics(self, data: dict) -> list:
        """Extract key metrics from Lighthouse data."""
        metrics = []

        # Core Web Vitals and other performance metrics:
        # our name -> (display name, Lighthouse key, unit)
        metrics_config = {
            'first_contentful_paint': ('First Contentful Paint', 'firstContentfulPaint', 'ms'),
            'largest_contentful_paint': ('Largest Contentful Paint', 'largestContentfulPaint', 'ms'),
            'speed_index': ('Speed Index', 'speedIndex', 'ms'),
            'time_to_interactive': ('Time to Interactive', 'timeToInteractive', 'ms'),
            'total_blocking_time': ('Total Blocking Time', 'totalBlockingTime', 'ms'),
            'cumulative_layout_shift': ('Cumulative Layout Shift', 'cumulativeLayoutShift', 'score'),
        }
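
        # Expected per-metric shape from the service (an assumption inferred
        # from the keys read below), e.g.:
        #   data['metrics']['firstContentfulPaint'] == {'value': 1830.0, 'score': 0.81}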
        lh_metrics = data.get('metrics', {})
        for metric_name, (display_name, lh_key, unit) in metrics_config.items():
            metric_data = lh_metrics.get(lh_key, {})
            if metric_data and metric_data.get('value') is not None:
                metrics.append(self._create_metric(
                    name=metric_name,
                    display_name=display_name,
                    value=metric_data['value'],
                    unit=unit,
                    score=metric_data.get('score')
                ))

        # Resource metrics
        resources = data.get('resources', {})
        diagnostics = data.get('diagnostics', {})

        if resources.get('totalByteWeight'):
            metrics.append(self._create_metric(
                name='total_byte_weight',
                display_name='Total Page Weight',
                value=resources['totalByteWeight'],
                unit='bytes'
            ))

        if diagnostics.get('numRequests'):
            metrics.append(self._create_metric(
                name='num_requests',
                display_name='Total Requests',
                value=diagnostics['numRequests'],
                unit='count'
            ))

        if diagnostics.get('numScripts'):
            metrics.append(self._create_metric(
                name='num_scripts',
                display_name='JavaScript Files',
                value=diagnostics['numScripts'],
                unit='count'
            ))

        if diagnostics.get('totalTransferSize'):
            metrics.append(self._create_metric(
                name='total_transfer_size',
                display_name='Total Transfer Size',
                value=diagnostics['totalTransferSize'],
                unit='bytes'
            ))

        return metrics

    def _extract_issues(self, data: dict) -> list:
        """Extract issues from Lighthouse audit results."""
        issues = []

        # Convert Lighthouse issues to our format
        lh_issues = data.get('issues', [])

        # Map Lighthouse categories to our categories
        category_map = {
            'performance': 'performance',
            'accessibility': 'accessibility',
            'best-practices': 'best_practices',
            'seo': 'seo',
        }

        for lh_issue in lh_issues:
            # Determine severity from the audit score and impact. Lighthouse
            # audit scores are 0-1 and may be null for informative audits, so
            # coerce None to 0 before comparing.
            score = lh_issue.get('score') or 0
            impact = lh_issue.get('impact', 0)

            if score == 0 and impact > 5:
                severity = 'high'
            elif score < 0.5 and impact > 3:
                severity = 'medium'
            elif score < 0.5:
                severity = 'low'
            else:
                severity = 'info'

            # Fall back to 'performance' for unmapped categories
            category = category_map.get(lh_issue.get('category'), 'performance')

            issues.append(self._create_issue(
                category=category,
                severity=severity,
                title=lh_issue.get('title', 'Unknown issue'),
                description=lh_issue.get('description', ''),
                raw_data={
                    'id': lh_issue.get('id'),
                    'displayValue': lh_issue.get('displayValue'),
                    'score': score,
                    'impact': impact,
                }
            ))
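
        # Example issue shape from the service (an assumption inferred from
        # the keys read above):
        #   {"id": "render-blocking-resources", "category": "performance",
        #    "title": "Eliminate render-blocking resources",
        #    "description": "...", "score": 0.3, "impact": 4,
        #    "displayValue": "Potential savings of 450 ms"}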

        # Check for unused resources
        resources = data.get('resources', {})

        # Unused JavaScript
        unused_js = resources.get('unusedJavascript', [])
        for item in unused_js[:5]:  # Top 5 offenders
            if item.get('wastedBytes', 0) > 50000:  # > 50KB wasted
                issues.append(self._create_issue(
                    category='performance',
                    severity='medium',
                    title='Unused JavaScript',
                    description=f"Remove unused JavaScript to reduce payload. {item.get('url', '')} has {item.get('wastedBytes', 0) / 1024:.1f}KB unused.",
                    remediation='Remove unused JavaScript code or use code splitting to load only what is needed.',
                    raw_data=item
                ))

        # Unused CSS
        unused_css = resources.get('unusedCss', [])
        for item in unused_css[:5]:  # Top 5 offenders
            if item.get('wastedBytes', 0) > 20000:  # > 20KB wasted
                issues.append(self._create_issue(
                    category='performance',
                    severity='low',
                    title='Unused CSS',
                    description=f"Remove unused CSS rules. {item.get('url', '')} has {item.get('wastedBytes', 0) / 1024:.1f}KB unused.",
                    remediation='Use tools like PurgeCSS to remove unused CSS.',
                    raw_data=item
                ))

        # Render-blocking resources
        blocking = resources.get('renderBlockingResources', [])
        if len(blocking) > 3:
            issues.append(self._create_issue(
                category='performance',
                severity='medium',
                title='Multiple render-blocking resources',
                description=f'Found {len(blocking)} render-blocking resources that delay page rendering.',
                remediation='Defer non-critical JavaScript and inline critical CSS.',
                raw_data={'resources': blocking[:10]}
            ))

        # Large JavaScript bundles
        large_scripts = resources.get('scriptTreemap', [])
        for script in large_scripts[:5]:
            if script.get('resourceBytes', 0) > 500000:  # > 500KB
                issues.append(self._create_issue(
                    category='resources',
                    severity='medium',
                    title='Large JavaScript bundle',
                    description=f"Large script bundle detected: {script.get('name', 'Unknown')} ({script.get('resourceBytes', 0) / 1024:.1f}KB)",
                    remediation='Consider code splitting and lazy loading to reduce bundle size.',
                    raw_data=script
                ))

        # Third-party impact
        third_party = resources.get('thirdPartySummary', [])
        high_impact_third_party = [
            tp for tp in third_party
            if tp.get('blockingTime', 0) > 500  # > 500ms of main-thread blocking
        ]
        if high_impact_third_party:
            issues.append(self._create_issue(
                category='performance',
                severity='medium',
                title='Third-party scripts impacting performance',
                description=f'{len(high_impact_third_party)} third-party scripts are significantly impacting page load time.',
                remediation='Consider lazy loading third-party scripts or using async/defer attributes.',
                raw_data={'third_parties': high_impact_third_party}
            ))

        return issues
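

if __name__ == "__main__":
    # Manual smoke test (a hedged sketch, not part of the scan pipeline).
    # Assumptions: DJANGO_SETTINGS_MODULE is exported, the settings module
    # defines SCANNER_CONFIG (as __init__ above requires), and the Lighthouse
    # service is reachable at the configured URL.
    import django

    django.setup()

    scanner = LighthouseScanner({'timeout': 60})
    if scanner.is_available():
        result = scanner.run('https://example.com')
        print(result.status, result.scores)
    else:
        print('Lighthouse service is not available')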