"""
Playwright Scanner Integration.

This module uses Playwright to perform browser-based analysis, capturing
console errors, network requests, and resource metrics.
"""
import asyncio
import logging
import time
from typing import Any, Dict, List, Optional

from django.conf import settings

from .base import (
    BaseScanner,
    ScannerResult,
    ScannerStatus,
    IssueData,
    MetricData,
)

logger = logging.getLogger(__name__)


class PlaywrightScanner(BaseScanner):
    """
    Scanner using Playwright for browser-based analysis.

    Captures:
    - Console errors and warnings
    - Network request details
    - Page load timing
    - Large resources (images, scripts)
    - Memory usage indicators
    """

    name = "playwright"

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Initialize the scanner.

        Args:
            config: Optional scanner configuration. Recognized keys:
                ``timeout`` (navigation timeout in milliseconds, Playwright
                convention; default 30000) and ``viewport`` (dict with
                ``width``/``height``; default 1920x1080).
        """
        super().__init__(config)
        self.timeout = self.config.get('timeout', 30000)  # 30 seconds
        self.viewport = self.config.get('viewport', {'width': 1920, 'height': 1080})

    def run(self, url: str) -> ScannerResult:
        """
        Run Playwright analysis on the URL.

        Args:
            url: The URL to analyze

        Returns:
            ScannerResult with browser analysis data; on any failure an
            error result built by ``_create_error_result``.
        """
        self.logger.info(f"Starting Playwright scan for {url}")
        try:
            # asyncio.run() creates a fresh event loop, runs the coroutine,
            # cancels leftover tasks and always closes the loop -- it replaces
            # the manual new_event_loop / set_event_loop / close sequence.
            return asyncio.run(self._async_scan(url))
        except Exception as e:
            return self._create_error_result(e)

    async def _async_scan(self, url: str) -> ScannerResult:
        """
        Async implementation of the scan.

        Args:
            url: The URL to analyze

        Returns:
            ScannerResult with findings
        """
        from playwright.async_api import async_playwright

        issues: List[IssueData] = []
        metrics: List[MetricData] = []
        raw_data: Dict[str, Any] = {
            'console_messages': [],
            'network_requests': [],
            'failed_requests': [],
            'large_resources': [],
        }

        # Initialise every value the post-navigation processing reads, so the
        # code below never depends on which branch of the try block executed
        # (replaces the fragile `'name' in locals()` checks).
        load_time: float = float(self.timeout)  # fallback: treat as full timeout, in ms
        status_code: int = 0
        perf_timing: Dict[str, Any] = {}
        memory_info: Optional[Dict[str, Any]] = None
        resources: List[Dict[str, Any]] = []

        # Populated by the page event listeners below.
        console_messages: List[Dict[str, Any]] = []
        network_requests: List[Dict[str, Any]] = []
        failed_requests: List[Dict[str, Any]] = []

        async with async_playwright() as p:
            browser = await p.chromium.launch(
                headless=True,
                args=[
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-dev-shm-usage',
                    '--disable-gpu',
                ]
            )
            # Ensure the browser is closed even if context/page creation or
            # any of the evaluation steps raise (previously it leaked until
            # playwright itself was stopped).
            try:
                context = await browser.new_context(
                    viewport=self.viewport,
                    user_agent=(
                        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                        '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
                    )
                )
                page = await context.new_page()

                # Set up event listeners (handlers append to the lists above).
                page.on("console", lambda msg: console_messages.append({
                    'type': msg.type,
                    'text': msg.text,
                    'location': str(msg.location) if msg.location else None,
                }))

                page.on("request", lambda req: network_requests.append({
                    'url': req.url,
                    'method': req.method,
                    'resource_type': req.resource_type,
                    'timestamp': time.time(),
                }))

                page.on("requestfailed", lambda req: failed_requests.append({
                    'url': req.url,
                    'failure': req.failure,
                    'resource_type': req.resource_type,
                }))

                # Navigate and measure
                start_time = time.time()
                try:
                    response = await page.goto(
                        url,
                        wait_until='networkidle',
                        timeout=self.timeout
                    )
                    load_time = (time.time() - start_time) * 1000  # Convert to ms

                    # goto() may return None (e.g. navigation to about:blank).
                    status_code = response.status if response else 0

                    # Wait a bit more for any delayed scripts
                    await page.wait_for_timeout(2000)

                    # Get performance timing. firstPaint now comes from the
                    # Paint Timing API entry rather than (incorrectly) from
                    # navigation.domComplete.
                    perf_timing = await page.evaluate('''() => {
                        const timing = performance.timing;
                        const navigation = performance.getEntriesByType("navigation")[0];
                        const paint = performance.getEntriesByType("paint")
                            .find(e => e.name === "first-paint");
                        return {
                            domContentLoaded: timing.domContentLoadedEventEnd - timing.navigationStart,
                            domComplete: timing.domComplete - timing.navigationStart,
                            loadEvent: timing.loadEventEnd - timing.navigationStart,
                            firstPaint: paint ? paint.startTime : null,
                            transferSize: navigation ? navigation.transferSize : null,
                        };
                    }''')

                    # Get memory info (performance.memory is Chromium-only).
                    memory_info = await page.evaluate('''() => {
                        if (performance.memory) {
                            return {
                                usedJSHeapSize: performance.memory.usedJSHeapSize,
                                totalJSHeapSize: performance.memory.totalJSHeapSize,
                                jsHeapSizeLimit: performance.memory.jsHeapSizeLimit,
                            };
                        }
                        return null;
                    }''')

                    # Get resource sizes
                    resources = await page.evaluate('''() => {
                        const entries = performance.getEntriesByType("resource");
                        return entries.map(e => ({
                            name: e.name,
                            type: e.initiatorType,
                            transferSize: e.transferSize,
                            duration: e.duration,
                        }));
                    }''')

                except Exception as e:
                    # Best-effort: keep whatever the listeners captured and
                    # fall back to the defaults initialised above.
                    self.logger.warning(f"Page navigation error: {e}")
            finally:
                await browser.close()

        # Process collected data
        raw_data['console_messages'] = console_messages
        raw_data['network_requests'] = network_requests[:100]  # Limit stored
        raw_data['failed_requests'] = failed_requests
        raw_data['performance_timing'] = perf_timing
        raw_data['memory_info'] = memory_info
        raw_data['status_code'] = status_code

        # Create metrics
        metrics.append(MetricData(
            name='page_load_time',
            display_name='Page Load Time',
            value=load_time,
            unit='ms',
            source='playwright'
        ))

        metrics.append(MetricData(
            name='total_network_requests',
            display_name='Total Network Requests',
            value=float(len(network_requests)),
            unit='count',
            source='playwright'
        ))

        # Calculate total transfer size
        total_transfer = sum(
            r.get('transferSize', 0) for r in resources if r.get('transferSize')
        )
        if total_transfer > 0:
            metrics.append(MetricData(
                name='total_transfer_size',
                display_name='Total Transfer Size',
                value=float(total_transfer),
                unit='bytes',
                source='playwright'
            ))

        if perf_timing.get('domContentLoaded'):
            metrics.append(MetricData(
                name='dom_content_loaded',
                display_name='DOM Content Loaded',
                value=float(perf_timing['domContentLoaded']),
                unit='ms',
                source='playwright'
            ))

        # Memory metrics
        if memory_info:
            metrics.append(MetricData(
                name='js_heap_used',
                display_name='JS Heap Used',
                value=float(memory_info.get('usedJSHeapSize', 0)),
                unit='bytes',
                source='playwright'
            ))

            # Check for high memory usage (>50% of the reported heap limit).
            heap_used = memory_info.get('usedJSHeapSize', 0)
            heap_limit = memory_info.get('jsHeapSizeLimit', 1)
            heap_percent = (heap_used / heap_limit) * 100 if heap_limit > 0 else 0

            if heap_percent > 50:
                issues.append(IssueData(
                    category='resources',
                    severity='medium',
                    title='High JavaScript memory usage',
                    description=(
                        f'JavaScript is using {heap_used / (1024*1024):.1f} MB '
                        f'({heap_percent:.1f}% of available heap). '
                        'This may indicate memory-heavy operations or potential leaks.'
                    ),
                    tool='playwright',
                    affected_url=url,
                    remediation=(
                        'Review JavaScript for memory leaks, optimize data structures, '
                        'and ensure proper cleanup of event listeners and timers.'
                    ),
                    raw_data=memory_info
                ))

        # Analyze console messages for errors
        errors = [m for m in console_messages if m['type'] == 'error']
        warnings = [m for m in console_messages if m['type'] == 'warning']

        metrics.append(MetricData(
            name='console_errors_count',
            display_name='Console Errors',
            value=float(len(errors)),
            unit='count',
            source='playwright'
        ))

        metrics.append(MetricData(
            name='console_warnings_count',
            display_name='Console Warnings',
            value=float(len(warnings)),
            unit='count',
            source='playwright'
        ))

        # Create issues for console errors
        if errors:
            # Group similar errors (truncated to 200 chars for dedup); sorted
            # so the choice of the 10 reported errors is deterministic.
            error_texts = set(e['text'][:200] for e in errors)
            for error_text in sorted(error_texts)[:10]:  # Limit to 10 unique errors
                issues.append(IssueData(
                    category='content',
                    severity='medium',
                    title='JavaScript console error',
                    description=f'JavaScript error logged to console: {error_text}',
                    tool='playwright',
                    affected_url=url,
                    remediation='Review and fix the JavaScript error in your code.',
                    raw_data={'error': error_text}
                ))

        # Check for failed network requests
        if failed_requests:
            for req in failed_requests[:5]:  # Limit reported
                issues.append(IssueData(
                    category='content',
                    severity='low',
                    title='Failed network request',
                    description=(
                        f"Request to {req['url'][:100]} failed: {req.get('failure', 'Unknown error')}"
                    ),
                    tool='playwright',
                    affected_url=req['url'],
                    remediation='Ensure the resource is available and CORS is configured correctly.',
                    raw_data=req
                ))

        # Find large resources. getattr keeps the documented 1 MiB default
        # even when SCANNER_CONFIG is absent from Django settings.
        large_threshold = getattr(settings, 'SCANNER_CONFIG', {}).get(
            'LARGE_IMAGE_THRESHOLD_BYTES', 1024 * 1024
        )
        large_resources = [
            r for r in resources
            if r.get('transferSize', 0) > large_threshold
        ]

        for resource in large_resources[:5]:  # Limit reported
            size_mb = resource['transferSize'] / (1024 * 1024)
            issues.append(IssueData(
                category='resources',
                severity='medium' if size_mb > 2 else 'low',
                title=f"Large resource detected ({size_mb:.1f} MB)",
                description=(
                    f"The resource '{resource['name'][-80:]}' is {size_mb:.2f} MB. "
                    "Large resources increase page load time and bandwidth usage."
                ),
                tool='playwright',
                affected_url=resource['name'],
                remediation=(
                    'Optimize images using compression, use appropriate formats (WebP, AVIF), '
                    'implement lazy loading, or consider a CDN.'
                ),
                raw_data=resource
            ))

        raw_data['large_resources'] = large_resources

        # Count resources by type
        resource_counts: Dict[str, int] = {}
        for req in network_requests:
            rtype = req.get('resource_type', 'other')
            resource_counts[rtype] = resource_counts.get(rtype, 0) + 1
        raw_data['resource_counts'] = resource_counts

        # Check for excessive requests
        if len(network_requests) > 100:
            issues.append(IssueData(
                category='performance',
                severity='medium',
                title='High number of network requests',
                description=(
                    f'Page made {len(network_requests)} network requests. '
                    'Excessive requests increase page load time and server load.'
                ),
                tool='playwright',
                affected_url=url,
                remediation=(
                    'Consolidate resources, use HTTP/2 multiplexing, implement '
                    'resource bundling, and lazy load non-critical resources.'
                ),
                raw_data=resource_counts
            ))

        self.logger.info(
            f"Playwright scan complete: {len(issues)} issues, {len(metrics)} metrics"
        )

        return ScannerResult(
            scanner_name=self.name,
            status=ScannerStatus.SUCCESS,
            issues=issues,
            metrics=metrics,
            raw_data=raw_data
        )