from enum import Enum


class CacheMode(Enum):
    """
    Cache policy selected for a single crawl operation.

    Members (as interpreted by ``CacheContext.should_read`` /
    ``CacheContext.should_write``):
        - ENABLED: cache is both read and written.
        - DISABLED: cache is neither read nor written.
        - READ_ONLY: cache is read but never written.
        - WRITE_ONLY: cache is written but never read.
        - BYPASS: cache is neither read nor written for this operation.
    """

    ENABLED = "enabled"
    DISABLED = "disabled"
    READ_ONLY = "read_only"
    WRITE_ONLY = "write_only"
    BYPASS = "bypass"


class CacheContext:
    """
    Bundles a URL with the cache policy that applies to it.

    The URL is classified once, at construction time, by its prefix; the
    read/write decisions are then answered from those flags plus the
    cache mode.

    Attributes:
        url (str): The URL being processed, stored unchanged.
        cache_mode (CacheMode): The cache mode for the current operation.
        always_bypass (bool): When truthy, both reads and writes are refused.
        is_cacheable (bool): True when the URL starts with ``http://``,
            ``https://`` or ``file://``. ``raw:`` inputs are never cacheable.
        is_web_url (bool): True when the URL starts with ``http://`` or
            ``https://``.
        is_local_file (bool): True when the URL starts with ``file://``.
        is_raw_html (bool): True when the URL starts with ``raw:``.
        _url_display (str): ``"Raw HTML"`` for raw input, otherwise the URL.
    """

    def __init__(self, url: str, cache_mode: CacheMode, always_bypass: bool = False):
        """
        Store the inputs and classify the URL by its prefix.

        Args:
            url (str): The URL being processed.
            cache_mode (CacheMode): The cache mode for the current operation.
            always_bypass (bool): If True, bypasses caching for this operation.
        """
        self.url = url
        self.cache_mode = cache_mode
        self.always_bypass = always_bypass

        # Prefix matching is case-sensitive: only lowercase schemes match.
        self.is_web_url = url.startswith(('http://', 'https://'))
        self.is_local_file = url.startswith("file://")
        self.is_raw_html = url.startswith("raw:")

        # Cacheable inputs are exactly the web URLs plus local files.
        self.is_cacheable = self.is_web_url or self.is_local_file

        # Raw HTML payloads are not echoed back; a fixed label is shown instead.
        self._url_display = "Raw HTML" if self.is_raw_html else url

    def should_read(self) -> bool:
        """
        Decide whether the cache may be consulted.

        Returns:
            bool: False when ``always_bypass`` is set or the URL is not
            cacheable; otherwise True only for ENABLED and READ_ONLY modes.
        """
        if not self.always_bypass and self.is_cacheable:
            return self.cache_mode in (CacheMode.ENABLED, CacheMode.READ_ONLY)
        return False

    def should_write(self) -> bool:
        """
        Decide whether the result may be stored in the cache.

        Returns:
            bool: False when ``always_bypass`` is set or the URL is not
            cacheable; otherwise True only for ENABLED and WRITE_ONLY modes.
        """
        if not self.always_bypass and self.is_cacheable:
            return self.cache_mode in (CacheMode.ENABLED, CacheMode.WRITE_ONLY)
        return False

    @property
    def display_url(self) -> str:
        """Return the display form of the URL (``"Raw HTML"`` for raw input)."""
        return self._url_display


def _legacy_to_cache_mode(
    disable_cache: bool = False,
    bypass_cache: bool = False,
    no_cache_read: bool = False,
    no_cache_write: bool = False
) -> CacheMode:
    """
    Map the legacy boolean cache flags onto a ``CacheMode`` member.

    Precedence, highest first: ``disable_cache`` (DISABLED), then
    ``bypass_cache`` (BYPASS), then the ``no_cache_read`` /
    ``no_cache_write`` pair.

    Args:
        disable_cache (bool): Legacy flag; when truthy the result is DISABLED.
        bypass_cache (bool): Legacy flag; when truthy (and ``disable_cache``
            is not) the result is BYPASS.
        no_cache_read (bool): Legacy flag suppressing cache reads.
        no_cache_write (bool): Legacy flag suppressing cache writes.

    Returns:
        CacheMode: The mode equivalent to the given flag combination;
        ENABLED when no flag is set.
    """
    if disable_cache:
        return CacheMode.DISABLED
    if bypass_cache:
        return CacheMode.BYPASS

    # Keyed by (reads suppressed, writes suppressed).
    mode_by_suppression = {
        (True, True): CacheMode.DISABLED,
        (True, False): CacheMode.WRITE_ONLY,
        (False, True): CacheMode.READ_ONLY,
        (False, False): CacheMode.ENABLED,
    }
    return mode_by_suppression[(bool(no_cache_read), bool(no_cache_write))]