secure-web/backend/scanner/scanners/playwright_scanner.py

"""
Playwright Scanner Integration.
This module uses Playwright to perform browser-based analysis,
capturing console errors, network requests, and resource metrics.
"""
import asyncio
import logging
import time
from typing import Any, Dict, Optional

from django.conf import settings

from .base import (
    BaseScanner,
    ScannerResult,
    ScannerStatus,
    IssueData,
    MetricData,
)
logger = logging.getLogger(__name__)
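
# The base scanner interfaces are assumed (from their usage below) to look
# roughly like this; see .base for the authoritative definitions:
#   BaseScanner  -- provides self.config, self.logger, _create_error_result()
#   IssueData    -- category, severity, title, description, tool,
#                   affected_url, remediation, raw_data
#   MetricData   -- name, display_name, value, unit, source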


class PlaywrightScanner(BaseScanner):
    """
    Scanner using Playwright for browser-based analysis.

    Captures:
    - Console errors and warnings
    - Network request details
    - Page load timing
    - Large resources (images, scripts)
    - Memory usage indicators
    """

    name = "playwright"

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        super().__init__(config)
        self.timeout = self.config.get('timeout', 30000)  # milliseconds; default 30 s
        self.viewport = self.config.get('viewport', {'width': 1920, 'height': 1080})
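
    # Example config (illustrative; 'timeout' and 'viewport' are the only
    # keys this scanner reads):
    #
    #     PlaywrightScanner({
    #         'timeout': 15000,                           # ms; abort goto after 15 s
    #         'viewport': {'width': 375, 'height': 812},  # emulate a small screen
    #     })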

    def run(self, url: str) -> ScannerResult:
        """
        Run Playwright analysis on the URL.

        Args:
            url: The URL to analyze

        Returns:
            ScannerResult with browser analysis data
        """
        self.logger.info(f"Starting Playwright scan for {url}")
        try:
            # Playwright's async API needs an event loop; create a dedicated
            # one so the scanner can be called from synchronous code (e.g. a
            # Celery task). Equivalent to asyncio.run(), but explicit about
            # loop ownership.
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            try:
                result = loop.run_until_complete(self._async_scan(url))
            finally:
                loop.close()
            return result
        except Exception as e:
            return self._create_error_result(e)

    async def _async_scan(self, url: str) -> ScannerResult:
        """
        Async implementation of the scan.

        Args:
            url: The URL to analyze

        Returns:
            ScannerResult with findings
        """
        from playwright.async_api import async_playwright

        issues = []
        metrics = []
        raw_data = {
            'console_messages': [],
            'network_requests': [],
            'failed_requests': [],
            'large_resources': [],
        }

        async with async_playwright() as p:
            browser = await p.chromium.launch(
                headless=True,
                args=[
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-dev-shm-usage',
                    '--disable-gpu',
                ]
            )
            context = await browser.new_context(
                viewport=self.viewport,
                user_agent=(
                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                    '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
                )
            )
            page = await context.new_page()

            # Collect data
            console_messages = []
            network_requests = []
            failed_requests = []

            # Register event listeners before navigation so requests fired
            # during page load are captured
            page.on("console", lambda msg: console_messages.append({
                'type': msg.type,
                'text': msg.text,
                'location': str(msg.location) if msg.location else None,
            }))
            page.on("request", lambda req: network_requests.append({
                'url': req.url,
                'method': req.method,
                'resource_type': req.resource_type,
                'timestamp': time.time(),
            }))
            page.on("requestfailed", lambda req: failed_requests.append({
                'url': req.url,
                'failure': req.failure,
                'resource_type': req.resource_type,
            }))

            # Navigate and measure
            start_time = time.time()
            try:
                response = await page.goto(
                    url,
                    wait_until='networkidle',
                    timeout=self.timeout
                )
                load_time = (time.time() - start_time) * 1000  # convert to ms

                # Get response status
                status_code = response.status if response else 0

                # Wait a bit more for any delayed scripts
                await page.wait_for_timeout(2000)

                # Get performance timing (performance.timing is deprecated
                # but still populated in Chromium; first paint comes from the
                # Paint Timing API)
                perf_timing = await page.evaluate('''() => {
                    const timing = performance.timing;
                    const navigation = performance.getEntriesByType("navigation")[0];
                    const paint = performance.getEntriesByType("paint")
                        .find(p => p.name === "first-paint");
                    return {
                        domContentLoaded: timing.domContentLoadedEventEnd - timing.navigationStart,
                        domComplete: timing.domComplete - timing.navigationStart,
                        loadEvent: timing.loadEventEnd - timing.navigationStart,
                        firstPaint: paint ? paint.startTime : null,
                        transferSize: navigation ? navigation.transferSize : null,
                    };
                }''')

                # Get memory info (performance.memory is non-standard and
                # Chromium-only; null elsewhere)
                memory_info = await page.evaluate('''() => {
                    if (performance.memory) {
                        return {
                            usedJSHeapSize: performance.memory.usedJSHeapSize,
                            totalJSHeapSize: performance.memory.totalJSHeapSize,
                            jsHeapSizeLimit: performance.memory.jsHeapSizeLimit,
                        };
                    }
                    return null;
                }''')

                # Get resource sizes
                resources = await page.evaluate('''() => {
                    const entries = performance.getEntriesByType("resource");
                    return entries.map(e => ({
                        name: e.name,
                        type: e.initiatorType,
                        transferSize: e.transferSize,
                        duration: e.duration,
                    }));
                }''')
            except Exception as e:
                self.logger.warning(f"Page navigation error: {e}")
                # Fall back to safe defaults so the reporting below still works
                load_time = float(self.timeout)
                status_code = 0
                perf_timing = {}
                memory_info = None
                resources = []

            await browser.close()

        # Process collected data (the try/except above binds these names on
        # both paths, so they are always defined here)
        raw_data['console_messages'] = console_messages
        raw_data['network_requests'] = network_requests[:100]  # limit stored entries
        raw_data['failed_requests'] = failed_requests
        raw_data['performance_timing'] = perf_timing
        raw_data['memory_info'] = memory_info
        raw_data['status_code'] = status_code

        # Create metrics
        metrics.append(MetricData(
            name='page_load_time',
            display_name='Page Load Time',
            value=float(load_time),
            unit='ms',
            source='playwright'
        ))
        metrics.append(MetricData(
            name='total_network_requests',
            display_name='Total Network Requests',
            value=float(len(network_requests)),
            unit='count',
            source='playwright'
        ))

        # Calculate total transfer size
        total_transfer = sum(
            r.get('transferSize', 0) for r in resources if r.get('transferSize')
        )
        if total_transfer > 0:
            metrics.append(MetricData(
                name='total_transfer_size',
                display_name='Total Transfer Size',
                value=float(total_transfer),
                unit='bytes',
                source='playwright'
            ))
        if perf_timing.get('domContentLoaded'):
            metrics.append(MetricData(
                name='dom_content_loaded',
                display_name='DOM Content Loaded',
                value=float(perf_timing['domContentLoaded']),
                unit='ms',
                source='playwright'
            ))

        # Memory metrics
        if memory_info:
            metrics.append(MetricData(
                name='js_heap_used',
                display_name='JS Heap Used',
                value=float(memory_info.get('usedJSHeapSize', 0)),
                unit='bytes',
                source='playwright'
            ))
            # Check for high memory usage
            heap_used = memory_info.get('usedJSHeapSize', 0)
            heap_limit = memory_info.get('jsHeapSizeLimit', 1)
            heap_percent = (heap_used / heap_limit) * 100 if heap_limit > 0 else 0
            if heap_percent > 50:
                issues.append(IssueData(
                    category='resources',
                    severity='medium',
                    title='High JavaScript memory usage',
                    description=(
                        f'JavaScript is using {heap_used / (1024 * 1024):.1f} MB '
                        f'({heap_percent:.1f}% of available heap). '
                        'This may indicate memory-heavy operations or potential leaks.'
                    ),
                    tool='playwright',
                    affected_url=url,
                    remediation=(
                        'Review JavaScript for memory leaks, optimize data structures, '
                        'and ensure proper cleanup of event listeners and timers.'
                    ),
                    raw_data=memory_info
                ))

        # Analyze console messages for errors
        errors = [m for m in console_messages if m['type'] == 'error']
        warnings = [m for m in console_messages if m['type'] == 'warning']
        metrics.append(MetricData(
            name='console_errors_count',
            display_name='Console Errors',
            value=float(len(errors)),
            unit='count',
            source='playwright'
        ))
        metrics.append(MetricData(
            name='console_warnings_count',
            display_name='Console Warnings',
            value=float(len(warnings)),
            unit='count',
            source='playwright'
        ))

        # Create issues for console errors
        if errors:
            # Deduplicate by the first 200 characters of the message, then
            # report at most 10 distinct errors
            error_texts = set(e['text'][:200] for e in errors)
            for error_text in list(error_texts)[:10]:
                issues.append(IssueData(
                    category='content',
                    severity='medium',
                    title='JavaScript console error',
                    description=f'JavaScript error logged to console: {error_text}',
                    tool='playwright',
                    affected_url=url,
                    remediation='Review and fix the JavaScript error in your code.',
                    raw_data={'error': error_text}
                ))

        # Check for failed network requests
        if failed_requests:
            for req in failed_requests[:5]:  # limit reported
                issues.append(IssueData(
                    category='content',
                    severity='low',
                    title='Failed network request',
                    description=(
                        f"Request to {req['url'][:100]} failed: "
                        # req['failure'] may be present but None, so .get()
                        # with a default is not enough here
                        f"{req.get('failure') or 'Unknown error'}"
                    ),
                    tool='playwright',
                    affected_url=req['url'],
                    remediation='Ensure the resource is available and CORS is configured correctly.',
                    raw_data=req
                ))

        # Find large resources
        large_threshold = settings.SCANNER_CONFIG.get('LARGE_IMAGE_THRESHOLD_BYTES', 1024 * 1024)
        large_resources = [
            r for r in resources
            if r.get('transferSize', 0) > large_threshold
        ]
        for resource in large_resources[:5]:  # limit reported
            size_mb = resource['transferSize'] / (1024 * 1024)
            issues.append(IssueData(
                category='resources',
                severity='medium' if size_mb > 2 else 'low',
                title=f"Large resource detected ({size_mb:.1f} MB)",
                description=(
                    f"The resource '{resource['name'][-80:]}' is {size_mb:.2f} MB. "
                    "Large resources increase page load time and bandwidth usage."
                ),
                tool='playwright',
                affected_url=resource['name'],
                remediation=(
                    'Optimize images using compression, use appropriate formats (WebP, AVIF), '
                    'implement lazy loading, or consider a CDN.'
                ),
                raw_data=resource
            ))
        raw_data['large_resources'] = large_resources

        # Count resources by type
        resource_counts = {}
        for req in network_requests:
            rtype = req.get('resource_type', 'other')
            resource_counts[rtype] = resource_counts.get(rtype, 0) + 1
        raw_data['resource_counts'] = resource_counts

        # Check for excessive requests
        if len(network_requests) > 100:
            issues.append(IssueData(
                category='performance',
                severity='medium',
                title='High number of network requests',
                description=(
                    f'Page made {len(network_requests)} network requests. '
                    'Excessive requests increase page load time and server load.'
                ),
                tool='playwright',
                affected_url=url,
                remediation=(
                    'Consolidate resources, use HTTP/2 multiplexing, implement '
                    'resource bundling, and lazy load non-critical resources.'
                ),
                raw_data=resource_counts
            ))

        self.logger.info(
            f"Playwright scan complete: {len(issues)} issues, {len(metrics)} metrics"
        )
        return ScannerResult(
            scanner_name=self.name,
            status=ScannerStatus.SUCCESS,
            issues=issues,
            metrics=metrics,
            raw_data=raw_data
        )
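

# Minimal usage sketch (illustrative, not part of the module): assumes Django
# settings are configured -- settings.SCANNER_CONFIG is read above -- and that
# Playwright's Chromium build is installed (`playwright install chromium`):
#
#     scanner = PlaywrightScanner({'timeout': 15000})
#     result = scanner.run('https://example.com')
#     print(result.status, len(result.issues), len(result.metrics))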