""" Playwright Browser Scanner. This module uses Playwright to perform browser-based analysis, including console error capture, resource loading, and basic memory usage indicators. """ import time import logging import asyncio from typing import Dict, List, Optional, Tuple from django.conf import settings from .base import BaseScanner, ScannerResult, ScannerStatus logger = logging.getLogger('scanner') class PlaywrightScanner(BaseScanner): """ Browser-based scanner using Playwright. Captures: - Console errors and warnings - Network request metrics - Large images and resources - JavaScript errors - Memory usage indicators - Page load timing """ name = "playwright" def __init__(self, config: dict = None): super().__init__(config) self.timeout = self.config.get( 'timeout', settings.SCANNER_CONFIG.get('PLAYWRIGHT_TIMEOUT', 30000) ) self.viewport = self.config.get( 'viewport', settings.SCANNER_CONFIG.get('PLAYWRIGHT_VIEWPORT', {'width': 1920, 'height': 1080}) ) self.large_image_threshold = settings.SCANNER_CONFIG.get( 'LARGE_IMAGE_THRESHOLD_BYTES', 1024 * 1024 ) def is_available(self) -> bool: """Check if Playwright is available.""" try: from playwright.sync_api import sync_playwright return True except ImportError: self.logger.warning("Playwright not installed") return False def run(self, url: str) -> ScannerResult: """ Run browser-based analysis using Playwright. Args: url: The URL to analyze Returns: ScannerResult with browser analysis data """ start_time = time.time() if not self.is_available(): return ScannerResult( status=ScannerStatus.FAILED, scanner_name=self.name, error_message="Playwright is not available", execution_time_seconds=time.time() - start_time ) try: from playwright.sync_api import sync_playwright with sync_playwright() as p: # Launch browser browser = p.chromium.launch( headless=True, args=[ '--no-sandbox', '--disable-dev-shm-usage', '--disable-gpu', '--disable-extensions', ] ) context = browser.new_context( viewport=self.viewport, user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' ) page = context.new_page() # Collect data console_messages = [] network_requests = [] failed_requests = [] js_errors = [] # Console message handler def handle_console(msg): console_messages.append({ 'type': msg.type, 'text': msg.text[:500], # Truncate long messages 'location': str(msg.location) if hasattr(msg, 'location') else None }) # Request handler def handle_request(request): network_requests.append({ 'url': request.url[:200], 'method': request.method, 'resource_type': request.resource_type, }) # Response handler def handle_response(response): # Find the corresponding request for req in network_requests: if req['url'] == response.url[:200]: req['status'] = response.status try: headers = response.headers content_length = headers.get('content-length', '0') req['size'] = int(content_length) if content_length else 0 except: req['size'] = 0 break # Request failed handler def handle_request_failed(request): failed_requests.append({ 'url': request.url[:200], 'failure': request.failure, 'resource_type': request.resource_type, }) # Page error handler def handle_page_error(error): js_errors.append({ 'message': str(error)[:500], }) # Attach handlers page.on('console', handle_console) page.on('request', handle_request) page.on('response', handle_response) page.on('requestfailed', handle_request_failed) page.on('pageerror', handle_page_error) # Navigate to page load_start = time.time() try: page.goto(url, timeout=self.timeout, wait_until='networkidle') except Exception as e: # Try with less strict wait condition self.logger.warning(f"Network idle timeout, trying load: {e}") page.goto(url, timeout=self.timeout, wait_until='load') load_time = (time.time() - load_start) * 1000 # ms # Wait a bit more for any async content page.wait_for_timeout(2000) # Get performance metrics if available performance_data = page.evaluate('''() => { const timing = performance.timing; const memory = performance.memory || {}; return { domContentLoaded: timing.domContentLoadedEventEnd - timing.navigationStart, loadComplete: timing.loadEventEnd - timing.navigationStart, domInteractive: timing.domInteractive - timing.navigationStart, firstPaint: performance.getEntriesByType('paint').find(p => p.name === 'first-paint')?.startTime || null, firstContentfulPaint: performance.getEntriesByType('paint').find(p => p.name === 'first-contentful-paint')?.startTime || null, jsHeapSizeLimit: memory.jsHeapSizeLimit || null, totalJSHeapSize: memory.totalJSHeapSize || null, usedJSHeapSize: memory.usedJSHeapSize || null, }; }''') # Close browser browser.close() # Process results metrics = self._extract_metrics( load_time, performance_data, network_requests ) issues = self._extract_issues( console_messages, network_requests, failed_requests, js_errors, performance_data ) raw_data = { 'console_messages': console_messages[:50], # Limit size 'network_requests': network_requests[:100], 'failed_requests': failed_requests, 'js_errors': js_errors, 'performance': performance_data, 'load_time_ms': load_time, } execution_time = time.time() - start_time return ScannerResult( status=ScannerStatus.SUCCESS, scanner_name=self.name, metrics=metrics, issues=issues, raw_data=raw_data, execution_time_seconds=execution_time ) except Exception as e: logger.exception(f"Playwright scan failed for {url}") return ScannerResult( status=ScannerStatus.FAILED, scanner_name=self.name, error_message=f"Browser scan failed: {e}", execution_time_seconds=time.time() - start_time ) def _extract_metrics( self, load_time: float, performance_data: dict, network_requests: list ) -> list: """Extract metrics from browser data.""" metrics = [] # Page load time metrics.append(self._create_metric( name='page_load_time', display_name='Page Load Time', value=load_time, unit='ms' )) # DOM Content Loaded if performance_data.get('domContentLoaded'): metrics.append(self._create_metric( name='dom_content_loaded', display_name='DOM Content Loaded', value=performance_data['domContentLoaded'], unit='ms' )) # DOM Interactive if performance_data.get('domInteractive'): metrics.append(self._create_metric( name='dom_interactive', display_name='DOM Interactive', value=performance_data['domInteractive'], unit='ms' )) # Network metrics total_requests = len(network_requests) total_size = sum(r.get('size', 0) for r in network_requests) metrics.append(self._create_metric( name='total_requests_playwright', display_name='Total Network Requests', value=total_requests, unit='count' )) metrics.append(self._create_metric( name='total_download_size', display_name='Total Downloaded', value=total_size, unit='bytes' )) # Request type breakdown scripts = [r for r in network_requests if r.get('resource_type') == 'script'] stylesheets = [r for r in network_requests if r.get('resource_type') == 'stylesheet'] images = [r for r in network_requests if r.get('resource_type') == 'image'] fonts = [r for r in network_requests if r.get('resource_type') == 'font'] metrics.append(self._create_metric( name='script_requests', display_name='Script Requests', value=len(scripts), unit='count' )) metrics.append(self._create_metric( name='image_requests', display_name='Image Requests', value=len(images), unit='count' )) # Memory metrics if performance_data.get('usedJSHeapSize'): metrics.append(self._create_metric( name='js_heap_used', display_name='JS Heap Used', value=performance_data['usedJSHeapSize'], unit='bytes' )) if performance_data.get('totalJSHeapSize'): metrics.append(self._create_metric( name='js_heap_total', display_name='JS Heap Total', value=performance_data['totalJSHeapSize'], unit='bytes' )) return metrics def _extract_issues( self, console_messages: list, network_requests: list, failed_requests: list, js_errors: list, performance_data: dict ) -> list: """Extract issues from browser data.""" issues = [] # Console errors errors = [m for m in console_messages if m.get('type') == 'error'] if errors: issues.append(self._create_issue( category='content', severity='medium', title=f'{len(errors)} console error(s) detected', description='JavaScript console errors were detected on the page.', remediation='Review and fix JavaScript errors to improve user experience.', raw_data={'errors': errors[:10]} )) # Console warnings warnings = [m for m in console_messages if m.get('type') == 'warning'] if len(warnings) > 5: issues.append(self._create_issue( category='content', severity='low', title=f'{len(warnings)} console warning(s) detected', description='Multiple JavaScript warnings were detected on the page.', remediation='Review console warnings for potential issues.', raw_data={'warnings': warnings[:10]} )) # JavaScript page errors if js_errors: issues.append(self._create_issue( category='content', severity='high', title=f'{len(js_errors)} JavaScript error(s) detected', description='Uncaught JavaScript exceptions were detected.', remediation='Fix JavaScript errors that could break page functionality.', raw_data={'errors': js_errors} )) # Failed network requests if failed_requests: issues.append(self._create_issue( category='content', severity='medium', title=f'{len(failed_requests)} failed network request(s)', description='Some resources failed to load.', remediation='Ensure all resources are available and URLs are correct.', raw_data={'failed': failed_requests} )) # Large images large_images = [ r for r in network_requests if r.get('resource_type') == 'image' and r.get('size', 0) > self.large_image_threshold ] if large_images: issues.append(self._create_issue( category='resources', severity='medium', title=f'{len(large_images)} large image(s) detected (>1MB)', description='Large images slow down page load and increase bandwidth usage.', remediation='Compress images and use modern formats like WebP or AVIF.', raw_data={'images': [{'url': i['url'], 'size': i.get('size')} for i in large_images]} )) # Too many requests if len(network_requests) > 100: issues.append(self._create_issue( category='performance', severity='medium', title='High number of network requests', description=f'Page makes {len(network_requests)} network requests, which can slow loading.', remediation='Combine files, use sprites, and reduce third-party scripts.' )) # High memory usage (potential memory issues) used_heap = performance_data.get('usedJSHeapSize', 0) total_heap = performance_data.get('totalJSHeapSize', 0) if used_heap > 100 * 1024 * 1024: # > 100MB issues.append(self._create_issue( category='resources', severity='medium', title='High JavaScript memory usage', description=f'Page uses {used_heap / (1024*1024):.1f}MB of JavaScript heap memory.', remediation='Review for memory leaks and optimize JavaScript memory usage.' )) if total_heap > 0 and used_heap / total_heap > 0.9: issues.append(self._create_issue( category='resources', severity='high', title='JavaScript heap near capacity', description='JavaScript heap is using >90% of available memory, risking out-of-memory errors.', remediation='Investigate potential memory leaks and reduce memory consumption.' )) return issues