secure-web/backend/scanner/scanners/playwright_scanner.py

"""
Playwright Scanner Integration.
This module uses Playwright to perform browser-based analysis,
capturing console errors, network requests, and resource metrics.
"""
import asyncio
import logging
import time
from typing import Any, Dict, Optional

from django.conf import settings

from .base import (
    BaseScanner,
    ScannerResult,
    ScannerStatus,
    IssueData,
    MetricData,
)
logger = logging.getLogger(__name__)
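
# The base scanner interfaces are assumed (from their usage below) to look
# roughly like this; see .base for the authoritative definitions:
#   BaseScanner  -- provides self.config, self.logger, _create_error_result()
#   IssueData    -- category, severity, title, description, tool,
#                   affected_url, remediation, raw_data
#   MetricData   -- name, display_name, value, unit, source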


class PlaywrightScanner(BaseScanner):
    """
    Scanner using Playwright for browser-based analysis.

    Captures:
    - Console errors and warnings
    - Network request details
    - Page load timing
    - Large resources (images, scripts)
    - Memory usage indicators
    """

    name = "playwright"

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        super().__init__(config)
        self.timeout = self.config.get('timeout', 30000)  # milliseconds; default 30 s
        self.viewport = self.config.get('viewport', {'width': 1920, 'height': 1080})
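
    # Example config (illustrative; 'timeout' and 'viewport' are the only
    # keys this scanner reads):
    #
    #     PlaywrightScanner({
    #         'timeout': 15000,                           # ms; abort goto after 15 s
    #         'viewport': {'width': 375, 'height': 812},  # emulate a small screen
    #     })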

    def run(self, url: str) -> ScannerResult:
        """
        Run Playwright analysis on the URL.

        Args:
            url: The URL to analyze

        Returns:
            ScannerResult with browser analysis data
        """
        self.logger.info(f"Starting Playwright scan for {url}")
        try:
            # Playwright's async API needs an event loop; create a dedicated
            # one so the scanner can be called from synchronous code (e.g. a
            # Celery task). Equivalent to asyncio.run(), but explicit about
            # loop ownership.
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            try:
                result = loop.run_until_complete(self._async_scan(url))
            finally:
                loop.close()
            return result
        except Exception as e:
            return self._create_error_result(e)

    async def _async_scan(self, url: str) -> ScannerResult:
        """
        Async implementation of the scan.

        Args:
            url: The URL to analyze

        Returns:
            ScannerResult with findings
        """
        from playwright.async_api import async_playwright

        issues = []
        metrics = []
        raw_data = {
            'console_messages': [],
            'network_requests': [],
            'failed_requests': [],
            'large_resources': [],
        }

        async with async_playwright() as p:
            browser = await p.chromium.launch(
                headless=True,
                args=[
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-dev-shm-usage',
                    '--disable-gpu',
                ]
            )
            context = await browser.new_context(
                viewport=self.viewport,
                user_agent=(
                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                    '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
                )
            )
            page = await context.new_page()

            # Collect data
            console_messages = []
            network_requests = []
            failed_requests = []

            # Register event listeners before navigation so requests fired
            # during page load are captured
            page.on("console", lambda msg: console_messages.append({
                'type': msg.type,
                'text': msg.text,
                'location': str(msg.location) if msg.location else None,
            }))
            page.on("request", lambda req: network_requests.append({
                'url': req.url,
                'method': req.method,
                'resource_type': req.resource_type,
                'timestamp': time.time(),
            }))
            page.on("requestfailed", lambda req: failed_requests.append({
                'url': req.url,
                'failure': req.failure,
                'resource_type': req.resource_type,
            }))

            # Navigate and measure
            start_time = time.time()
            try:
                response = await page.goto(
                    url,
                    wait_until='networkidle',
                    timeout=self.timeout
                )
                load_time = (time.time() - start_time) * 1000  # convert to ms

                # Get response status
                status_code = response.status if response else 0

                # Wait a bit more for any delayed scripts
                await page.wait_for_timeout(2000)

                # Get performance timing (performance.timing is deprecated
                # but still populated in Chromium; first paint comes from the
                # Paint Timing API)
                perf_timing = await page.evaluate('''() => {
                    const timing = performance.timing;
                    const navigation = performance.getEntriesByType("navigation")[0];
                    const paint = performance.getEntriesByType("paint")
                        .find(p => p.name === "first-paint");
                    return {
                        domContentLoaded: timing.domContentLoadedEventEnd - timing.navigationStart,
                        domComplete: timing.domComplete - timing.navigationStart,
                        loadEvent: timing.loadEventEnd - timing.navigationStart,
                        firstPaint: paint ? paint.startTime : null,
                        transferSize: navigation ? navigation.transferSize : null,
                    };
                }''')

                # Get memory info (performance.memory is non-standard and
                # Chromium-only; null elsewhere)
                memory_info = await page.evaluate('''() => {
                    if (performance.memory) {
                        return {
                            usedJSHeapSize: performance.memory.usedJSHeapSize,
                            totalJSHeapSize: performance.memory.totalJSHeapSize,
                            jsHeapSizeLimit: performance.memory.jsHeapSizeLimit,
                        };
                    }
                    return null;
                }''')

                # Get resource sizes
                resources = await page.evaluate('''() => {
                    const entries = performance.getEntriesByType("resource");
                    return entries.map(e => ({
                        name: e.name,
                        type: e.initiatorType,
                        transferSize: e.transferSize,
                        duration: e.duration,
                    }));
                }''')
            except Exception as e:
                self.logger.warning(f"Page navigation error: {e}")
                # Fall back to safe defaults so the reporting below still works
                load_time = float(self.timeout)
                status_code = 0
                perf_timing = {}
                memory_info = None
                resources = []

            await browser.close()

        # Process collected data (the try/except above binds these names on
        # both paths, so they are always defined here)
        raw_data['console_messages'] = console_messages
        raw_data['network_requests'] = network_requests[:100]  # limit stored entries
        raw_data['failed_requests'] = failed_requests
        raw_data['performance_timing'] = perf_timing
        raw_data['memory_info'] = memory_info
        raw_data['status_code'] = status_code

        # Create metrics
        metrics.append(MetricData(
            name='page_load_time',
            display_name='Page Load Time',
            value=float(load_time),
            unit='ms',
            source='playwright'
        ))
        metrics.append(MetricData(
            name='total_network_requests',
            display_name='Total Network Requests',
            value=float(len(network_requests)),
            unit='count',
            source='playwright'
        ))

        # Calculate total transfer size
        total_transfer = sum(
            r.get('transferSize', 0) for r in resources if r.get('transferSize')
        )
        if total_transfer > 0:
            metrics.append(MetricData(
                name='total_transfer_size',
                display_name='Total Transfer Size',
                value=float(total_transfer),
                unit='bytes',
                source='playwright'
            ))
        if perf_timing.get('domContentLoaded'):
            metrics.append(MetricData(
                name='dom_content_loaded',
                display_name='DOM Content Loaded',
                value=float(perf_timing['domContentLoaded']),
                unit='ms',
                source='playwright'
            ))

        # Memory metrics
        if memory_info:
            metrics.append(MetricData(
                name='js_heap_used',
                display_name='JS Heap Used',
                value=float(memory_info.get('usedJSHeapSize', 0)),
                unit='bytes',
                source='playwright'
            ))
            # Check for high memory usage
            heap_used = memory_info.get('usedJSHeapSize', 0)
            heap_limit = memory_info.get('jsHeapSizeLimit', 1)
            heap_percent = (heap_used / heap_limit) * 100 if heap_limit > 0 else 0
            if heap_percent > 50:
                issues.append(IssueData(
                    category='resources',
                    severity='medium',
                    title='High JavaScript memory usage',
                    description=(
                        f'JavaScript is using {heap_used / (1024 * 1024):.1f} MB '
                        f'({heap_percent:.1f}% of available heap). '
                        'This may indicate memory-heavy operations or potential leaks.'
                    ),
                    tool='playwright',
                    affected_url=url,
                    remediation=(
                        'Review JavaScript for memory leaks, optimize data structures, '
                        'and ensure proper cleanup of event listeners and timers.'
                    ),
                    raw_data=memory_info
                ))

        # Analyze console messages for errors
        errors = [m for m in console_messages if m['type'] == 'error']
        warnings = [m for m in console_messages if m['type'] == 'warning']
        metrics.append(MetricData(
            name='console_errors_count',
            display_name='Console Errors',
            value=float(len(errors)),
            unit='count',
            source='playwright'
        ))
        metrics.append(MetricData(
            name='console_warnings_count',
            display_name='Console Warnings',
            value=float(len(warnings)),
            unit='count',
            source='playwright'
        ))

        # Create issues for console errors
        if errors:
            # Deduplicate by the first 200 characters of the message, then
            # report at most 10 distinct errors
            error_texts = set(e['text'][:200] for e in errors)
            for error_text in list(error_texts)[:10]:
                issues.append(IssueData(
                    category='content',
                    severity='medium',
                    title='JavaScript console error',
                    description=f'JavaScript error logged to console: {error_text}',
                    tool='playwright',
                    affected_url=url,
                    remediation='Review and fix the JavaScript error in your code.',
                    raw_data={'error': error_text}
                ))

        # Check for failed network requests
        if failed_requests:
            for req in failed_requests[:5]:  # limit reported
                issues.append(IssueData(
                    category='content',
                    severity='low',
                    title='Failed network request',
                    description=(
                        f"Request to {req['url'][:100]} failed: "
                        # req['failure'] may be present but None, so .get()
                        # with a default is not enough here
                        f"{req.get('failure') or 'Unknown error'}"
                    ),
                    tool='playwright',
                    affected_url=req['url'],
                    remediation='Ensure the resource is available and CORS is configured correctly.',
                    raw_data=req
                ))

        # Find large resources
        large_threshold = settings.SCANNER_CONFIG.get('LARGE_IMAGE_THRESHOLD_BYTES', 1024 * 1024)
        large_resources = [
            r for r in resources
            if r.get('transferSize', 0) > large_threshold
        ]
        for resource in large_resources[:5]:  # limit reported
            size_mb = resource['transferSize'] / (1024 * 1024)
            issues.append(IssueData(
                category='resources',
                severity='medium' if size_mb > 2 else 'low',
                title=f"Large resource detected ({size_mb:.1f} MB)",
                description=(
                    f"The resource '{resource['name'][-80:]}' is {size_mb:.2f} MB. "
                    "Large resources increase page load time and bandwidth usage."
                ),
                tool='playwright',
                affected_url=resource['name'],
                remediation=(
                    'Optimize images using compression, use appropriate formats (WebP, AVIF), '
                    'implement lazy loading, or consider a CDN.'
                ),
                raw_data=resource
            ))
        raw_data['large_resources'] = large_resources

        # Count resources by type
        resource_counts = {}
        for req in network_requests:
            rtype = req.get('resource_type', 'other')
            resource_counts[rtype] = resource_counts.get(rtype, 0) + 1
        raw_data['resource_counts'] = resource_counts

        # Check for excessive requests
        if len(network_requests) > 100:
            issues.append(IssueData(
                category='performance',
                severity='medium',
                title='High number of network requests',
                description=(
                    f'Page made {len(network_requests)} network requests. '
                    'Excessive requests increase page load time and server load.'
                ),
                tool='playwright',
                affected_url=url,
                remediation=(
                    'Consolidate resources, use HTTP/2 multiplexing, implement '
                    'resource bundling, and lazy load non-critical resources.'
                ),
                raw_data=resource_counts
            ))

        self.logger.info(
            f"Playwright scan complete: {len(issues)} issues, {len(metrics)} metrics"
        )
        return ScannerResult(
            scanner_name=self.name,
            status=ScannerStatus.SUCCESS,
            issues=issues,
            metrics=metrics,
            raw_data=raw_data
        )
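

# Minimal usage sketch (illustrative, not part of the module): assumes Django
# settings are configured -- settings.SCANNER_CONFIG is read above -- and that
# Playwright's Chromium build is installed (`playwright install chromium`):
#
#     scanner = PlaywrightScanner({'timeout': 15000})
#     result = scanner.run('https://example.com')
#     print(result.status, len(result.issues), len(result.metrics))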