# secure-web/backend/scanner/playwright_scanner.py

"""
Playwright Browser Scanner.
This module uses Playwright to perform browser-based analysis,
including console error capture, resource loading, and basic
memory usage indicators.
"""
import time
import logging
import asyncio
from typing import Dict, List, Optional, Tuple
from django.conf import settings
from .base import BaseScanner, ScannerResult, ScannerStatus
logger = logging.getLogger('scanner')
class PlaywrightScanner(BaseScanner):
    """
    Browser-based scanner using Playwright.

    Captures:
    - Console errors and warnings
    - Network request metrics
    - Large images and resources
    - JavaScript errors
    - Memory usage indicators
    - Page load timing
    """

    name = "playwright"

    def __init__(self, config: Optional[dict] = None):
        """Initialize scanner settings.

        Args:
            config: Optional per-scan overrides; each key falls back to the
                project-wide ``settings.SCANNER_CONFIG`` value when absent.
        """
        super().__init__(config)
        self.timeout = self.config.get(
            'timeout',
            settings.SCANNER_CONFIG.get('PLAYWRIGHT_TIMEOUT', 30000)
        )
        self.viewport = self.config.get(
            'viewport',
            settings.SCANNER_CONFIG.get('PLAYWRIGHT_VIEWPORT', {'width': 1920, 'height': 1080})
        )
        # Consistent with timeout/viewport: allow a per-scan override before
        # falling back to the global setting (default: 1 MiB).
        self.large_image_threshold = self.config.get(
            'large_image_threshold',
            settings.SCANNER_CONFIG.get('LARGE_IMAGE_THRESHOLD_BYTES', 1024 * 1024)
        )
def is_available(self) -> bool:
"""Check if Playwright is available."""
try:
from playwright.sync_api import sync_playwright
return True
except ImportError:
self.logger.warning("Playwright not installed")
return False
def run(self, url: str) -> ScannerResult:
"""
Run browser-based analysis using Playwright.
Args:
url: The URL to analyze
Returns:
ScannerResult with browser analysis data
"""
start_time = time.time()
if not self.is_available():
return ScannerResult(
status=ScannerStatus.FAILED,
scanner_name=self.name,
error_message="Playwright is not available",
execution_time_seconds=time.time() - start_time
)
try:
from playwright.sync_api import sync_playwright
with sync_playwright() as p:
# Launch browser
browser = p.chromium.launch(
headless=True,
args=[
'--no-sandbox',
'--disable-dev-shm-usage',
'--disable-gpu',
'--disable-extensions',
]
)
context = browser.new_context(
viewport=self.viewport,
user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
)
page = context.new_page()
# Collect data
console_messages = []
network_requests = []
failed_requests = []
js_errors = []
# Console message handler
def handle_console(msg):
console_messages.append({
'type': msg.type,
'text': msg.text[:500], # Truncate long messages
'location': str(msg.location) if hasattr(msg, 'location') else None
})
# Request handler
def handle_request(request):
network_requests.append({
'url': request.url[:200],
'method': request.method,
'resource_type': request.resource_type,
})
# Response handler
def handle_response(response):
# Find the corresponding request
for req in network_requests:
if req['url'] == response.url[:200]:
req['status'] = response.status
try:
headers = response.headers
content_length = headers.get('content-length', '0')
req['size'] = int(content_length) if content_length else 0
except:
req['size'] = 0
break
# Request failed handler
def handle_request_failed(request):
failed_requests.append({
'url': request.url[:200],
'failure': request.failure,
'resource_type': request.resource_type,
})
# Page error handler
def handle_page_error(error):
js_errors.append({
'message': str(error)[:500],
})
# Attach handlers
page.on('console', handle_console)
page.on('request', handle_request)
page.on('response', handle_response)
page.on('requestfailed', handle_request_failed)
page.on('pageerror', handle_page_error)
# Navigate to page
load_start = time.time()
try:
page.goto(url, timeout=self.timeout, wait_until='networkidle')
except Exception as e:
# Try with less strict wait condition
self.logger.warning(f"Network idle timeout, trying load: {e}")
page.goto(url, timeout=self.timeout, wait_until='load')
load_time = (time.time() - load_start) * 1000 # ms
# Wait a bit more for any async content
page.wait_for_timeout(2000)
# Get performance metrics if available
performance_data = page.evaluate('''() => {
const timing = performance.timing;
const memory = performance.memory || {};
return {
domContentLoaded: timing.domContentLoadedEventEnd - timing.navigationStart,
loadComplete: timing.loadEventEnd - timing.navigationStart,
domInteractive: timing.domInteractive - timing.navigationStart,
firstPaint: performance.getEntriesByType('paint').find(p => p.name === 'first-paint')?.startTime || null,
firstContentfulPaint: performance.getEntriesByType('paint').find(p => p.name === 'first-contentful-paint')?.startTime || null,
jsHeapSizeLimit: memory.jsHeapSizeLimit || null,
totalJSHeapSize: memory.totalJSHeapSize || null,
usedJSHeapSize: memory.usedJSHeapSize || null,
};
}''')
# Close browser
browser.close()
# Process results
metrics = self._extract_metrics(
load_time,
performance_data,
network_requests
)
issues = self._extract_issues(
console_messages,
network_requests,
failed_requests,
js_errors,
performance_data
)
raw_data = {
'console_messages': console_messages[:50], # Limit size
'network_requests': network_requests[:100],
'failed_requests': failed_requests,
'js_errors': js_errors,
'performance': performance_data,
'load_time_ms': load_time,
}
execution_time = time.time() - start_time
return ScannerResult(
status=ScannerStatus.SUCCESS,
scanner_name=self.name,
metrics=metrics,
issues=issues,
raw_data=raw_data,
execution_time_seconds=execution_time
)
except Exception as e:
logger.exception(f"Playwright scan failed for {url}")
return ScannerResult(
status=ScannerStatus.FAILED,
scanner_name=self.name,
error_message=f"Browser scan failed: {e}",
execution_time_seconds=time.time() - start_time
)
def _extract_metrics(
self,
load_time: float,
performance_data: dict,
network_requests: list
) -> list:
"""Extract metrics from browser data."""
metrics = []
# Page load time
metrics.append(self._create_metric(
name='page_load_time',
display_name='Page Load Time',
value=load_time,
unit='ms'
))
# DOM Content Loaded
if performance_data.get('domContentLoaded'):
metrics.append(self._create_metric(
name='dom_content_loaded',
display_name='DOM Content Loaded',
value=performance_data['domContentLoaded'],
unit='ms'
))
# DOM Interactive
if performance_data.get('domInteractive'):
metrics.append(self._create_metric(
name='dom_interactive',
display_name='DOM Interactive',
value=performance_data['domInteractive'],
unit='ms'
))
# Network metrics
total_requests = len(network_requests)
total_size = sum(r.get('size', 0) for r in network_requests)
metrics.append(self._create_metric(
name='total_requests_playwright',
display_name='Total Network Requests',
value=total_requests,
unit='count'
))
metrics.append(self._create_metric(
name='total_download_size',
display_name='Total Downloaded',
value=total_size,
unit='bytes'
))
# Request type breakdown
scripts = [r for r in network_requests if r.get('resource_type') == 'script']
stylesheets = [r for r in network_requests if r.get('resource_type') == 'stylesheet']
images = [r for r in network_requests if r.get('resource_type') == 'image']
fonts = [r for r in network_requests if r.get('resource_type') == 'font']
metrics.append(self._create_metric(
name='script_requests',
display_name='Script Requests',
value=len(scripts),
unit='count'
))
metrics.append(self._create_metric(
name='image_requests',
display_name='Image Requests',
value=len(images),
unit='count'
))
# Memory metrics
if performance_data.get('usedJSHeapSize'):
metrics.append(self._create_metric(
name='js_heap_used',
display_name='JS Heap Used',
value=performance_data['usedJSHeapSize'],
unit='bytes'
))
if performance_data.get('totalJSHeapSize'):
metrics.append(self._create_metric(
name='js_heap_total',
display_name='JS Heap Total',
value=performance_data['totalJSHeapSize'],
unit='bytes'
))
return metrics
def _extract_issues(
self,
console_messages: list,
network_requests: list,
failed_requests: list,
js_errors: list,
performance_data: dict
) -> list:
"""Extract issues from browser data."""
issues = []
# Console errors
errors = [m for m in console_messages if m.get('type') == 'error']
if errors:
issues.append(self._create_issue(
category='content',
severity='medium',
title=f'{len(errors)} console error(s) detected',
description='JavaScript console errors were detected on the page.',
remediation='Review and fix JavaScript errors to improve user experience.',
raw_data={'errors': errors[:10]}
))
# Console warnings
warnings = [m for m in console_messages if m.get('type') == 'warning']
if len(warnings) > 5:
issues.append(self._create_issue(
category='content',
severity='low',
title=f'{len(warnings)} console warning(s) detected',
description='Multiple JavaScript warnings were detected on the page.',
remediation='Review console warnings for potential issues.',
raw_data={'warnings': warnings[:10]}
))
# JavaScript page errors
if js_errors:
issues.append(self._create_issue(
category='content',
severity='high',
title=f'{len(js_errors)} JavaScript error(s) detected',
description='Uncaught JavaScript exceptions were detected.',
remediation='Fix JavaScript errors that could break page functionality.',
raw_data={'errors': js_errors}
))
# Failed network requests
if failed_requests:
issues.append(self._create_issue(
category='content',
severity='medium',
title=f'{len(failed_requests)} failed network request(s)',
description='Some resources failed to load.',
remediation='Ensure all resources are available and URLs are correct.',
raw_data={'failed': failed_requests}
))
# Large images
large_images = [
r for r in network_requests
if r.get('resource_type') == 'image' and r.get('size', 0) > self.large_image_threshold
]
if large_images:
issues.append(self._create_issue(
category='resources',
severity='medium',
title=f'{len(large_images)} large image(s) detected (>1MB)',
description='Large images slow down page load and increase bandwidth usage.',
remediation='Compress images and use modern formats like WebP or AVIF.',
raw_data={'images': [{'url': i['url'], 'size': i.get('size')} for i in large_images]}
))
# Too many requests
if len(network_requests) > 100:
issues.append(self._create_issue(
category='performance',
severity='medium',
title='High number of network requests',
description=f'Page makes {len(network_requests)} network requests, which can slow loading.',
remediation='Combine files, use sprites, and reduce third-party scripts.'
))
# High memory usage (potential memory issues)
used_heap = performance_data.get('usedJSHeapSize', 0)
total_heap = performance_data.get('totalJSHeapSize', 0)
if used_heap > 100 * 1024 * 1024: # > 100MB
issues.append(self._create_issue(
category='resources',
severity='medium',
title='High JavaScript memory usage',
description=f'Page uses {used_heap / (1024*1024):.1f}MB of JavaScript heap memory.',
remediation='Review for memory leaks and optimize JavaScript memory usage.'
))
if total_heap > 0 and used_heap / total_heap > 0.9:
issues.append(self._create_issue(
category='resources',
severity='high',
title='JavaScript heap near capacity',
description='JavaScript heap is using >90% of available memory, risking out-of-memory errors.',
remediation='Investigate potential memory leaks and reduce memory consumption.'
))
return issues