"""
URL validation and safety utilities.

This module provides functions to validate URLs and ensure they don't
target internal/private networks (SSRF protection).
"""

import ipaddress
import logging
import socket
from typing import List, Optional, Tuple
from urllib.parse import urlparse

from django.conf import settings

logger = logging.getLogger('scanner')


def get_blocked_ip_ranges() -> list:
    """Return the blocked CIDR ranges.

    Reads ``BLOCKED_IP_RANGES`` from ``settings.SCANNER_CONFIG``; when the
    key is absent, a built-in list of private, loopback, and link-local
    ranges (IPv4 and IPv6) is returned instead.
    """
    return settings.SCANNER_CONFIG.get('BLOCKED_IP_RANGES', [
        '10.0.0.0/8',
        '172.16.0.0/12',
        '192.168.0.0/16',
        '127.0.0.0/8',
        '169.254.0.0/16',
        '::1/128',
        'fc00::/7',
        'fe80::/10',
    ])


def get_blocked_hosts() -> list:
    """Return the blocked hostnames.

    Reads ``BLOCKED_HOSTS`` from ``settings.SCANNER_CONFIG``; when the key
    is absent, the localhost aliases below are returned instead.
    """
    return settings.SCANNER_CONFIG.get('BLOCKED_HOSTS', [
        'localhost',
        'localhost.localdomain',
    ])


def _has_non_public_flag(ip) -> bool:
    """Return True if *ip* is flagged private/loopback/link-local/reserved/multicast."""
    return (
        ip.is_private
        or ip.is_loopback
        or ip.is_link_local
        or ip.is_reserved
        or ip.is_multicast
    )


def is_private_ip(ip_str: str) -> bool:
    """
    Check if an IP address is private, loopback, or otherwise blocked.

    An IPv4-mapped IPv6 address (``::ffff:a.b.c.d``) is checked both as
    written and as its embedded IPv4 address, so the configured IPv4 ranges
    cannot be side-stepped by wrapping the address in IPv6 notation.

    Args:
        ip_str: IP address string

    Returns:
        True if the IP should be blocked. Strings that cannot be parsed as
        an IP address are treated as blocked.
    """
    try:
        ip = ipaddress.ip_address(ip_str)
    except ValueError:
        # Invalid IP format: fail closed.
        return True

    candidates = [ip]
    # Only IPv6 addresses have ``ipv4_mapped``; it is None unless the
    # address actually is an IPv4-mapped one.
    mapped = getattr(ip, 'ipv4_mapped', None)
    if mapped is not None:
        candidates.append(mapped)

    # Standard private/reserved classification first, so the settings
    # lookup is skipped for the common blocked cases.
    if any(_has_non_public_flag(candidate) for candidate in candidates):
        return True

    # Custom blocked ranges.
    for cidr in get_blocked_ip_ranges():
        try:
            network = ipaddress.ip_network(cidr, strict=False)
        except ValueError:
            # A bad entry must not disable the remaining ranges, but it
            # should not go unnoticed either.
            logger.warning(
                "Ignoring invalid entry in BLOCKED_IP_RANGES: %r", cidr
            )
            continue
        if any(candidate in network for candidate in candidates):
            return True

    return False


def _resolve_all_addresses(hostname: str) -> List[str]:
    """Return every distinct address *hostname* resolves to ([] on failure)."""
    try:
        results = socket.getaddrinfo(hostname, None, socket.AF_UNSPEC)
    except (OSError, UnicodeError):
        # socket.gaierror is an OSError; UnicodeError is raised for names
        # that cannot be IDNA-encoded (empty or over-long labels).
        return []
    # getaddrinfo yields one entry per socket type, so the same address
    # repeats; dict.fromkeys de-duplicates while keeping resolver order.
    return list(dict.fromkeys(info[4][0] for info in results))


def resolve_hostname(hostname: str) -> Optional[str]:
    """
    Resolve a hostname to its IP address.

    Args:
        hostname: The hostname to resolve

    Returns:
        The first address returned by the resolver, or None if resolution
        fails or yields no addresses.
    """
    addresses = _resolve_all_addresses(hostname)
    return addresses[0] if addresses else None


def _lowercase_host(netloc: str) -> str:
    """Lower-case the host[:port] part of *netloc*, leaving any userinfo intact."""
    userinfo, separator, host_port = netloc.rpartition('@')
    return f"{userinfo}{separator}{host_port.lower()}"


def validate_url(url: str) -> Tuple[bool, str, Optional[str]]:
    """
    Validate a URL for scanning safety.

    Checks:
    1. URL format and scheme (must be http or https)
    2. Hostname is not in blocked list
    3. No address the hostname resolves to is private/internal

    A hostname that cannot be resolved is accepted; the lookup failure is
    left for the scanner to report.

    NOTE(review): DNS is resolved here at validation time only. If the
    component that later fetches the URL resolves the name again, it may
    get a different answer (DNS rebinding) - confirm the fetcher re-checks
    or pins the address.

    Args:
        url: The URL to validate

    Returns:
        Tuple of (is_valid, normalized_url, error_message). On failure the
        original ``url`` is returned unchanged in the second slot; on
        success ``error_message`` is None.
    """
    if not url:
        return False, url, "URL is required"

    # Parse the URL
    try:
        parsed = urlparse(url)
    except (ValueError, TypeError, AttributeError) as exc:
        return False, url, f"Invalid URL format: {exc}"

    # Check scheme
    if parsed.scheme not in ('http', 'https'):
        return False, url, "URL must use http or https scheme"

    # Check hostname exists
    if not parsed.netloc:
        return False, url, "URL must have a valid hostname"

    # Extract hostname (without port or userinfo)
    raw_hostname = parsed.hostname
    if not raw_hostname:
        return False, url, "Could not extract hostname from URL"

    lookup_name = raw_hostname.lower()
    # "localhost." and "localhost" name the same host; drop the root dot so
    # it cannot be used to dodge the blocklist or the IP-literal check.
    hostname = lookup_name.rstrip('.')
    if not hostname:
        return False, url, "Could not extract hostname from URL"

    # Check against blocked hostnames (case-insensitively)
    blocked_hosts = {
        str(entry).lower().rstrip('.') for entry in get_blocked_hosts()
    }
    if hostname in blocked_hosts:
        return False, url, f"Scanning {hostname} is not allowed"

    # Check if hostname is an IP address
    try:
        ipaddress.ip_address(hostname)
    except ValueError:
        is_ip_literal = False
    else:
        is_ip_literal = True

    if is_ip_literal:
        if is_private_ip(hostname):
            return False, url, "Scanning private/internal IP addresses is not allowed"
    else:
        # Resolve the name exactly as given and check every address: a
        # name may carry both public and private records.
        for resolved_ip in _resolve_all_addresses(lookup_name):
            if is_private_ip(resolved_ip):
                return (
                    False,
                    url,
                    f"URL resolves to private IP ({resolved_ip}), scanning not allowed",
                )
        # If we can't resolve, we'll let the scanner handle the error

    # Normalize the URL: lowercase host, drop trailing slash from the path,
    # keep the query string. (urlparse has already lower-cased the scheme.)
    normalized = f"{parsed.scheme}://{_lowercase_host(parsed.netloc)}"
    if parsed.path and parsed.path != '/':
        normalized += parsed.path.rstrip('/')
    if parsed.query:
        normalized += f"?{parsed.query}"

    return True, normalized, None


def normalize_url(url: str) -> str:
    """
    Normalize a URL to a canonical form.

    Args:
        url: The URL to normalize

    Returns:
        The normalized URL produced by ``validate_url`` when the URL is
        valid; otherwise the input is returned unchanged.
    """
    is_valid, normalized, _ = validate_url(url)
    return normalized if is_valid else url


def extract_domain(url: str) -> str:
    """
    Extract the domain from a URL.

    Args:
        url: The URL to extract domain from

    Returns:
        The lower-cased network-location part of the URL (this includes any
        port or userinfo present), or an empty string if the URL cannot be
        parsed.
    """
    try:
        return urlparse(url).netloc.lower()
    except (ValueError, TypeError, AttributeError):
        return ""