"""
|
|
Playwright Scanner Integration.
|
|
|
|
This module uses Playwright to perform browser-based analysis,
|
|
capturing console errors, network requests, and resource metrics.
|
|
"""
|
|
|
|
import asyncio
import logging
import time
from typing import Any, Dict, List, Optional

from django.conf import settings

from .base import (
    BaseScanner,
    ScannerResult,
    ScannerStatus,
    IssueData,
    MetricData,
)

logger = logging.getLogger(__name__)
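
# Instances log via ``self.logger`` (presumably set up by BaseScanner); the
# module-level logger above remains available for import-time diagnostics.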


class PlaywrightScanner(BaseScanner):
    """
    Scanner using Playwright for browser-based analysis.

    Captures:
    - Console errors and warnings
    - Network request details
    - Page load timing
    - Large resources (images, scripts)
    - Memory usage indicators
    """

    name = "playwright"

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        super().__init__(config)
        self.timeout = self.config.get('timeout', 30000)  # milliseconds (30 seconds)
        self.viewport = self.config.get('viewport', {'width': 1920, 'height': 1080})
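
    # Example construction with hypothetical override values:
    #   PlaywrightScanner({'timeout': 60000, 'viewport': {'width': 390, 'height': 844}})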

    def run(self, url: str) -> ScannerResult:
        """
        Run Playwright analysis on the URL.

        Args:
            url: The URL to analyze

        Returns:
            ScannerResult with browser analysis data
        """
        self.logger.info(f"Starting Playwright scan for {url}")

        try:
            # Run async scan in sync context
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            try:
                result = loop.run_until_complete(self._async_scan(url))
            finally:
                loop.close()

            return result

        except Exception as e:
            return self._create_error_result(e)

    async def _async_scan(self, url: str) -> ScannerResult:
        """
        Async implementation of the scan.

        Args:
            url: The URL to analyze

        Returns:
            ScannerResult with findings
        """
        # Imported lazily so the module loads even where Playwright
        # isn't installed.
        from playwright.async_api import async_playwright

        issues: List[IssueData] = []
        metrics: List[MetricData] = []
        raw_data: Dict[str, Any] = {
            'console_messages': [],
            'network_requests': [],
            'failed_requests': [],
            'large_resources': [],
        }

        async with async_playwright() as p:
            browser = await p.chromium.launch(
                headless=True,
                args=[
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-dev-shm-usage',
                    '--disable-gpu',
                ]
            )

            # A desktop Chrome user agent keeps results closer to what real
            # visitors see on sites that vary behaviour by UA.
            context = await browser.new_context(
                viewport=self.viewport,
                user_agent=(
                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                    '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
                )
            )

            page = await context.new_page()

            # Collect data
            console_messages = []
            network_requests = []
            failed_requests = []

            # Set up event listeners (before navigation, so early messages
            # and requests are captured too)
            page.on("console", lambda msg: console_messages.append({
                'type': msg.type,
                'text': msg.text,
                'location': str(msg.location) if msg.location else None,
            }))

            page.on("request", lambda req: network_requests.append({
                'url': req.url,
                'method': req.method,
                'resource_type': req.resource_type,
                'timestamp': time.time(),
            }))

            page.on("requestfailed", lambda req: failed_requests.append({
                'url': req.url,
                'failure': req.failure,
                'resource_type': req.resource_type,
            }))
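
            # The lambdas above close over the three lists, so captured
            # entries remain available after the browser is closed. A console
            # error, for example, lands roughly as (values illustrative):
            #   console_messages.append({'type': 'error',
            #                            'text': 'Uncaught TypeError: ...',
            #                            'location': '...'})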

            # Navigate and measure
            start_time = time.time()

            try:
                response = await page.goto(
                    url,
                    wait_until='networkidle',
                    timeout=self.timeout
                )
                load_time = (time.time() - start_time) * 1000  # Convert to ms

                # Get response status
                status_code = response.status if response else 0

                # Wait a bit more for any delayed scripts
                await page.wait_for_timeout(2000)

                # Get performance timing (first paint comes from the Paint
                # Timing API; it may be absent, in which case we report null)
                perf_timing = await page.evaluate('''() => {
                    const timing = performance.timing;
                    const navigation = performance.getEntriesByType("navigation")[0];
                    const paint = performance.getEntriesByType("paint")
                        .find((e) => e.name === "first-paint");
                    return {
                        domContentLoaded: timing.domContentLoadedEventEnd - timing.navigationStart,
                        domComplete: timing.domComplete - timing.navigationStart,
                        loadEvent: timing.loadEventEnd - timing.navigationStart,
                        firstPaint: paint ? paint.startTime : null,
                        transferSize: navigation ? navigation.transferSize : null,
                    };
                }''')
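
                # performance.timing is deprecated; the Navigation Timing 2
                # entry fetched above (getEntriesByType("navigation")[0])
                # exposes the same milestones relative to a zero time origin
                # and would be the drop-in replacement here.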

                # Get memory info (Chromium-only performance.memory API)
                memory_info = await page.evaluate('''() => {
                    if (performance.memory) {
                        return {
                            usedJSHeapSize: performance.memory.usedJSHeapSize,
                            totalJSHeapSize: performance.memory.totalJSHeapSize,
                            jsHeapSizeLimit: performance.memory.jsHeapSizeLimit,
                        };
                    }
                    return null;
                }''')

                # Get resource sizes
                resources = await page.evaluate('''() => {
                    const entries = performance.getEntriesByType("resource");
                    return entries.map((e) => ({
                        name: e.name,
                        type: e.initiatorType,
                        transferSize: e.transferSize,
                        duration: e.duration,
                    }));
                }''')
            except Exception as e:
                # Navigation failed or timed out; fall back to defaults so
                # every name used below is still bound.
                self.logger.warning(f"Page navigation error: {e}")
                load_time = self.timeout
                status_code = 0
                perf_timing = {}
                memory_info = None
                resources = []

            await browser.close()

        # Process collected data (the except branch above guarantees these
        # names are always bound by this point)
        raw_data['console_messages'] = console_messages
        raw_data['network_requests'] = network_requests[:100]  # Limit stored
        raw_data['failed_requests'] = failed_requests
        raw_data['performance_timing'] = perf_timing
        raw_data['memory_info'] = memory_info
        raw_data['status_code'] = status_code

        # Create metrics
        metrics.append(MetricData(
            name='page_load_time',
            display_name='Page Load Time',
            value=load_time,
            unit='ms',
            source='playwright'
        ))

        metrics.append(MetricData(
            name='total_network_requests',
            display_name='Total Network Requests',
            value=float(len(network_requests)),
            unit='count',
            source='playwright'
        ))

        # Calculate total transfer size
        total_transfer = sum(r.get('transferSize', 0) for r in resources if r.get('transferSize'))
        if total_transfer > 0:
            metrics.append(MetricData(
                name='total_transfer_size',
                display_name='Total Transfer Size',
                value=float(total_transfer),
                unit='bytes',
                source='playwright'
            ))

        if perf_timing.get('domContentLoaded'):
            metrics.append(MetricData(
                name='dom_content_loaded',
                display_name='DOM Content Loaded',
                value=float(perf_timing['domContentLoaded']),
                unit='ms',
                source='playwright'
            ))

        # Memory metrics
        if memory_info:
            metrics.append(MetricData(
                name='js_heap_used',
                display_name='JS Heap Used',
                value=float(memory_info.get('usedJSHeapSize', 0)),
                unit='bytes',
                source='playwright'
            ))

            # Check for high memory usage; the 50%-of-heap-limit threshold
            # is a heuristic, not a hard browser limit
            heap_used = memory_info.get('usedJSHeapSize', 0)
            heap_limit = memory_info.get('jsHeapSizeLimit', 1)
            heap_percent = (heap_used / heap_limit) * 100 if heap_limit > 0 else 0

            if heap_percent > 50:
                issues.append(IssueData(
                    category='resources',
                    severity='medium',
                    title='High JavaScript memory usage',
                    description=(
                        f'JavaScript is using {heap_used / (1024 * 1024):.1f} MB '
                        f'({heap_percent:.1f}% of available heap). '
                        'This may indicate memory-heavy operations or potential leaks.'
                    ),
                    tool='playwright',
                    affected_url=url,
                    remediation=(
                        'Review JavaScript for memory leaks, optimize data structures, '
                        'and ensure proper cleanup of event listeners and timers.'
                    ),
                    raw_data=memory_info
                ))

        # Analyze console messages for errors
        errors = [m for m in console_messages if m['type'] == 'error']
        warnings = [m for m in console_messages if m['type'] == 'warning']

        metrics.append(MetricData(
            name='console_errors_count',
            display_name='Console Errors',
            value=float(len(errors)),
            unit='count',
            source='playwright'
        ))

        metrics.append(MetricData(
            name='console_warnings_count',
            display_name='Console Warnings',
            value=float(len(warnings)),
            unit='count',
            source='playwright'
        ))

        # Create issues for console errors
        if errors:
            # Group similar errors by their first 200 characters
            error_texts = {e['text'][:200] for e in errors}
            # Limit to 10 unique errors; sorted for deterministic output
            for error_text in sorted(error_texts)[:10]:
                issues.append(IssueData(
                    category='content',
                    severity='medium',
                    title='JavaScript console error',
                    description=f'JavaScript error logged to console: {error_text}',
                    tool='playwright',
                    affected_url=url,
                    remediation='Review and fix the JavaScript error in your code.',
                    raw_data={'error': error_text}
                ))

        # Check for failed network requests. "requestfailed" also fires for
        # benign aborts (e.g. navigations cancelling prefetches), hence the
        # low severity.
        if failed_requests:
            for req in failed_requests[:5]:  # Limit reported
                issues.append(IssueData(
                    category='content',
                    severity='low',
                    title='Failed network request',
                    description=(
                        f"Request to {req['url'][:100]} failed: {req.get('failure') or 'Unknown error'}"
                    ),
                    tool='playwright',
                    affected_url=req['url'],
                    remediation='Ensure the resource is available and CORS is configured correctly.',
                    raw_data=req
                ))

        # Find large resources
        large_threshold = settings.SCANNER_CONFIG.get('LARGE_IMAGE_THRESHOLD_BYTES', 1024 * 1024)
        large_resources = [
            r for r in resources
            if r.get('transferSize', 0) > large_threshold
        ]

        for resource in large_resources[:5]:  # Limit reported
            size_mb = resource['transferSize'] / (1024 * 1024)
            issues.append(IssueData(
                category='resources',
                severity='medium' if size_mb > 2 else 'low',
                title=f"Large resource detected ({size_mb:.1f} MB)",
                description=(
                    f"The resource '{resource['name'][-80:]}' is {size_mb:.2f} MB. "
                    "Large resources increase page load time and bandwidth usage."
                ),
                tool='playwright',
                affected_url=resource['name'],
                remediation=(
                    'Optimize images using compression, use appropriate formats (WebP, AVIF), '
                    'implement lazy loading, or consider a CDN.'
                ),
                raw_data=resource
            ))

        raw_data['large_resources'] = large_resources

        # Count resources by type
        resource_counts: Dict[str, int] = {}
        for req in network_requests:
            rtype = req.get('resource_type', 'other')
            resource_counts[rtype] = resource_counts.get(rtype, 0) + 1

        raw_data['resource_counts'] = resource_counts

        # Check for excessive requests
        if len(network_requests) > 100:
            issues.append(IssueData(
                category='performance',
                severity='medium',
                title='High number of network requests',
                description=(
                    f'Page made {len(network_requests)} network requests. '
                    'Excessive requests increase page load time and server load.'
                ),
                tool='playwright',
                affected_url=url,
                remediation=(
                    'Consolidate resources, use HTTP/2 multiplexing, implement '
                    'resource bundling, and lazy load non-critical resources.'
                ),
                raw_data=resource_counts
            ))

        self.logger.info(
            f"Playwright scan complete: {len(issues)} issues, {len(metrics)} metrics"
        )

        return ScannerResult(
            scanner_name=self.name,
            status=ScannerStatus.SUCCESS,
            issues=issues,
            metrics=metrics,
            raw_data=raw_data
        )
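

if __name__ == "__main__":
    # Minimal manual smoke test: a sketch only, assuming Django settings are
    # already configured (settings.SCANNER_CONFIG is read above) and that the
    # relative `.base` import resolves, i.e. the module is run from inside
    # its package. The CLI shape and default URL are illustrative, not part
    # of the scanner's contract.
    import sys

    target_url = sys.argv[1] if len(sys.argv) > 1 else "https://example.com"
    scan_result = PlaywrightScanner({'timeout': 15000}).run(target_url)
    print(
        f"status={scan_result.status} "
        f"issues={len(scan_result.issues)} metrics={len(scan_result.metrics)}"
    )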