secure-web/backend/websites/models.py

"""
Database models for Website Analyzer.
This module defines the core data models for storing websites, scans,
issues, and metrics from various scanning tools.
"""
import uuid
from django.db import models
from django.utils import timezone
from django.core.validators import URLValidator


class Website(models.Model):
    """
    Represents a website that has been scanned.

    Each unique URL gets one Website record, which can have multiple
    Scan records associated with it.
    """
    id = models.UUIDField(
        primary_key=True,
        default=uuid.uuid4,
        editable=False,
        help_text="Unique identifier for the website"
    )
    url = models.URLField(
        max_length=2048,
        unique=True,
        validators=[URLValidator(schemes=['http', 'https'])],
        help_text="The normalized URL of the website"
    )
    domain = models.CharField(
        max_length=255,
        db_index=True,
        help_text="The domain extracted from the URL"
    )
    created_at = models.DateTimeField(
        auto_now_add=True,
        help_text="When the website was first added"
    )
    last_scanned_at = models.DateTimeField(
        null=True,
        blank=True,
        help_text="When the website was last scanned"
    )

    class Meta:
        db_table = 'websites'
        ordering = ['-created_at']
        indexes = [
            models.Index(fields=['domain']),
            models.Index(fields=['-last_scanned_at']),
        ]

    def __str__(self):
        return self.url

    def save(self, *args, **kwargs):
        """Extract domain from URL before saving."""
        if self.url:
            from urllib.parse import urlparse
            parsed = urlparse(self.url)
            self.domain = parsed.netloc.lower()
        super().save(*args, **kwargs)
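

# Illustrative sketch (hypothetical values, not part of the model definitions):
# saving Website(url="https://Example.COM/some/path") would store
# domain == "example.com", since save() lowercases urlparse(url).netloc
# before the row is written.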


class ScanStatus(models.TextChoices):
    """Enumeration of possible scan statuses."""
    PENDING = 'pending', 'Pending'
    RUNNING = 'running', 'Running'
    DONE = 'done', 'Completed'
    FAILED = 'failed', 'Failed'
    PARTIAL = 'partial', 'Partially Completed'


class Scan(models.Model):
    """
    Represents a single scan of a website.

    Contains aggregated scores from various scanning tools and
    links to detailed issues and metrics.
    """
    id = models.UUIDField(
        primary_key=True,
        default=uuid.uuid4,
        editable=False,
        help_text="Unique identifier for the scan"
    )
    website = models.ForeignKey(
        Website,
        on_delete=models.CASCADE,
        related_name='scans',
        help_text="The website that was scanned"
    )
    status = models.CharField(
        max_length=20,
        choices=ScanStatus.choices,
        default=ScanStatus.PENDING,
        db_index=True,
        help_text="Current status of the scan"
    )

    # Celery task tracking
    celery_task_id = models.CharField(
        max_length=255,
        null=True,
        blank=True,
        help_text="Celery task ID for tracking"
    )

    # Timestamps
    created_at = models.DateTimeField(
        auto_now_add=True,
        help_text="When the scan was created"
    )
    started_at = models.DateTimeField(
        null=True,
        blank=True,
        help_text="When the scan started running"
    )
    completed_at = models.DateTimeField(
        null=True,
        blank=True,
        help_text="When the scan completed"
    )

    # Aggregated scores (0-100)
    performance_score = models.IntegerField(
        null=True,
        blank=True,
        help_text="Lighthouse performance score (0-100)"
    )
    accessibility_score = models.IntegerField(
        null=True,
        blank=True,
        help_text="Lighthouse accessibility score (0-100)"
    )
    seo_score = models.IntegerField(
        null=True,
        blank=True,
        help_text="Lighthouse SEO score (0-100)"
    )
    best_practices_score = models.IntegerField(
        null=True,
        blank=True,
        help_text="Lighthouse best practices score (0-100)"
    )
    security_score = models.IntegerField(
        null=True,
        blank=True,
        help_text="Computed security score based on issues (0-100)"
    )

    # Overall health score (computed average)
    overall_score = models.IntegerField(
        null=True,
        blank=True,
        help_text="Overall health score (0-100)"
    )

    # Error tracking
    error_message = models.TextField(
        null=True,
        blank=True,
        help_text="Error message if scan failed"
    )

    # Raw data from scanners
    raw_lighthouse_data = models.JSONField(
        null=True,
        blank=True,
        help_text="Raw Lighthouse report data"
    )
    raw_zap_data = models.JSONField(
        null=True,
        blank=True,
        help_text="Raw OWASP ZAP report data"
    )
    raw_playwright_data = models.JSONField(
        null=True,
        blank=True,
        help_text="Raw Playwright analysis data"
    )
    raw_headers_data = models.JSONField(
        null=True,
        blank=True,
        help_text="Raw HTTP headers analysis data"
    )

    class Meta:
        db_table = 'scans'
        ordering = ['-created_at']
        indexes = [
            models.Index(fields=['status']),
            models.Index(fields=['-created_at']),
            models.Index(fields=['website', '-created_at']),
        ]

    def __str__(self):
        return f"Scan {self.id} - {self.website.url} ({self.status})"

    def calculate_overall_score(self):
        """
        Calculate overall health score as weighted average of all scores.

        Weights:
        - Performance: 25%
        - Security: 30%
        - Accessibility: 15%
        - SEO: 15%
        - Best Practices: 15%
        """
        scores = [
            (self.performance_score, 0.25),
            (self.security_score, 0.30),
            (self.accessibility_score, 0.15),
            (self.seo_score, 0.15),
            (self.best_practices_score, 0.15),
        ]
        total_weight = 0
        weighted_sum = 0
        for score, weight in scores:
            if score is not None:
                weighted_sum += score * weight
                total_weight += weight
        if total_weight > 0:
            self.overall_score = round(weighted_sum / total_weight)
        else:
            self.overall_score = None
        return self.overall_score
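
    # Worked example (hypothetical scores, for illustration only): with
    # performance=80, security=60, accessibility=90, best_practices=70 and
    # seo=None, the missing SEO weight is dropped from the average, so
    #   weighted_sum = 80*0.25 + 60*0.30 + 90*0.15 + 70*0.15 = 62.0
    #   total_weight = 0.85
    #   overall_score = round(62.0 / 0.85) = 73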

    def calculate_security_score(self):
        """
        Calculate security score based on security issues found.

        Starts at 100 and deducts points based on issue severity:
        - Critical: -25 points each
        - High: -15 points each
        - Medium: -8 points each
        - Low: -3 points each
        - Info: -1 point each
        """
        deductions = {
            'critical': 25,
            'high': 15,
            'medium': 8,
            'low': 3,
            'info': 1,
        }
        score = 100
        security_issues = self.issues.filter(
            category__in=['security', 'headers', 'tls', 'cors']
        )
        for issue in security_issues:
            score -= deductions.get(issue.severity, 0)
        self.security_score = max(0, score)
        return self.security_score
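
    # Worked example (hypothetical issue counts, for illustration only): a scan
    # whose security/headers/tls/cors issues are 1 critical, 2 medium and 3 info
    # is deducted 25 + 2*8 + 3*1 = 44 points, giving security_score == 56.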


class IssueCategory(models.TextChoices):
    """Categories of issues that can be detected."""
    PERFORMANCE = 'performance', 'Performance'
    SECURITY = 'security', 'Security'
    HEADERS = 'headers', 'HTTP Headers'
    TLS = 'tls', 'TLS/SSL'
    CORS = 'cors', 'CORS'
    ACCESSIBILITY = 'accessibility', 'Accessibility'
    SEO = 'seo', 'SEO'
    BEST_PRACTICES = 'best_practices', 'Best Practices'
    CONTENT = 'content', 'Content'
    RESOURCES = 'resources', 'Resources'


class IssueSeverity(models.TextChoices):
    """Severity levels for issues."""
    CRITICAL = 'critical', 'Critical'
    HIGH = 'high', 'High'
    MEDIUM = 'medium', 'Medium'
    LOW = 'low', 'Low'
    INFO = 'info', 'Informational'


class ScannerTool(models.TextChoices):
    """Scanner tools that can detect issues."""
    LIGHTHOUSE = 'lighthouse', 'Google Lighthouse'
    ZAP = 'owasp_zap', 'OWASP ZAP'
    PLAYWRIGHT = 'playwright', 'Playwright'
    HEADER_CHECK = 'header_check', 'HTTP Header Check'
    TLS_CHECK = 'tls_check', 'TLS/SSL Check'


class Issue(models.Model):
    """
    Represents a specific issue found during a scan.

    Issues are categorized by type, severity, and the tool that detected them.
    Each issue includes a description and suggested remediation.
    """
    id = models.UUIDField(
        primary_key=True,
        default=uuid.uuid4,
        editable=False
    )
    scan = models.ForeignKey(
        Scan,
        on_delete=models.CASCADE,
        related_name='issues',
        help_text="The scan that found this issue"
    )

    # Classification
    category = models.CharField(
        max_length=30,
        choices=IssueCategory.choices,
        db_index=True,
        help_text="Category of the issue"
    )
    severity = models.CharField(
        max_length=20,
        choices=IssueSeverity.choices,
        db_index=True,
        help_text="Severity level of the issue"
    )
    tool = models.CharField(
        max_length=30,
        choices=ScannerTool.choices,
        help_text="Tool that detected this issue"
    )

    # Issue details
    title = models.CharField(
        max_length=500,
        help_text="Brief title of the issue"
    )
    description = models.TextField(
        help_text="Detailed description of the issue"
    )
    affected_url = models.URLField(
        max_length=2048,
        null=True,
        blank=True,
        help_text="Specific URL affected by this issue"
    )
    remediation = models.TextField(
        null=True,
        blank=True,
        help_text="Suggested fix or remediation"
    )

    # Additional data from scanner
    raw_data = models.JSONField(
        null=True,
        blank=True,
        help_text="Raw data from the scanner for this issue"
    )

    # Timestamps
    created_at = models.DateTimeField(
        auto_now_add=True
    )

    class Meta:
        db_table = 'issues'
        ordering = ['severity', '-created_at']
        indexes = [
            models.Index(fields=['scan', 'category']),
            models.Index(fields=['scan', 'severity']),
            models.Index(fields=['tool']),
        ]

    def __str__(self):
        return f"[{self.severity}] {self.title}"


class MetricUnit(models.TextChoices):
    """Units of measurement for metrics."""
    MILLISECONDS = 'ms', 'Milliseconds'
    SECONDS = 's', 'Seconds'
    BYTES = 'bytes', 'Bytes'
    KILOBYTES = 'kb', 'Kilobytes'
    MEGABYTES = 'mb', 'Megabytes'
    SCORE = 'score', 'Score (0-1)'
    PERCENT = 'percent', 'Percentage'
    COUNT = 'count', 'Count'


class Metric(models.Model):
    """
    Represents a specific metric measured during a scan.

    Metrics are numerical values with units, such as page load time,
    total byte weight, number of requests, etc.
    """
    id = models.UUIDField(
        primary_key=True,
        default=uuid.uuid4,
        editable=False
    )
    scan = models.ForeignKey(
        Scan,
        on_delete=models.CASCADE,
        related_name='metrics',
        help_text="The scan that measured this metric"
    )

    # Metric identification
    name = models.CharField(
        max_length=100,
        db_index=True,
        help_text="Name of the metric (e.g., 'first_contentful_paint_ms')"
    )
    display_name = models.CharField(
        max_length=200,
        help_text="Human-readable name for display"
    )

    # Value
    value = models.FloatField(
        help_text="Numeric value of the metric"
    )
    unit = models.CharField(
        max_length=20,
        choices=MetricUnit.choices,
        help_text="Unit of measurement"
    )

    # Source
    source = models.CharField(
        max_length=30,
        choices=ScannerTool.choices,
        help_text="Tool that provided this metric"
    )

    # Score (if applicable)
    score = models.FloatField(
        null=True,
        blank=True,
        help_text="Lighthouse score for this metric (0-1)"
    )

    # Timestamp
    created_at = models.DateTimeField(
        auto_now_add=True
    )

    class Meta:
        db_table = 'metrics'
        ordering = ['name']
        indexes = [
            models.Index(fields=['scan', 'name']),
            models.Index(fields=['source']),
        ]
        # Ensure unique metric names per scan
        constraints = [
            models.UniqueConstraint(
                fields=['scan', 'name'],
                name='unique_metric_per_scan'
            )
        ]

    def __str__(self):
        return f"{self.display_name}: {self.value} {self.unit}"

    def get_formatted_value(self):
        """Return a formatted string representation of the value."""
        if self.unit == MetricUnit.MILLISECONDS:
            if self.value >= 1000:
                return f"{self.value / 1000:.2f}s"
            return f"{self.value:.0f}ms"
        elif self.unit == MetricUnit.BYTES:
            if self.value >= 1024 * 1024:
                return f"{self.value / (1024 * 1024):.2f} MB"
            elif self.value >= 1024:
                return f"{self.value / 1024:.1f} KB"
            return f"{self.value:.0f} bytes"
        elif self.unit == MetricUnit.PERCENT:
            return f"{self.value:.1f}%"
        elif self.unit == MetricUnit.SCORE:
            return f"{self.value:.3f}"
        else:
            return f"{self.value:.2f} {self.get_unit_display()}"