Skip to content

Cache Key Generation

beautyspot.cachekey モジュールは、関数の引数(入力)から一意で安定した SHA-256 ハッシュ値を生成する役割を担います。

beautyspot.cachekey

KeyGen

Generates stable cache keys (SHA-256) for function inputs (Identity Layer).

Source code in src/beautyspot/cachekey.py
class KeyGen:
    """
    Generates stable cache keys (SHA-256) for function inputs (Identity Layer).

    The static helpers hash a single input (file metadata, file content, or
    an argument structure); the class-method factories build `KeyGenPolicy`
    objects that select a `Strategy` per argument name.
    """

    # Constants for convenience usage in KeyGen.map()
    HASH = Strategy.DEFAULT
    IGNORE = Strategy.IGNORE
    FILE_CONTENT = Strategy.FILE_CONTENT
    PATH_STAT = Strategy.PATH_STAT

    @staticmethod
    def from_path_stat(filepath: str) -> str:
        """
        Fast: path + size + mtime (SHA-256).

        Returns the hex digest of "<path>_<size>_<mtime>". When the path
        cannot be stat'ed, the plain (un-hashed) marker "MISSING_<path>" is
        returned instead.
        """
        # Stat exactly once instead of exists() followed by stat(): a file
        # removed between the two calls would otherwise raise here.
        # (OSError, ValueError) is what os.path.exists() itself suppresses.
        try:
            stat = os.stat(filepath)
        except (OSError, ValueError):
            return f"MISSING_{filepath}"
        identifier = f"{filepath}_{stat.st_size}_{stat.st_mtime}"
        return hashlib.sha256(identifier.encode()).hexdigest()

    @staticmethod
    def from_file_content(filepath: str) -> str:
        """
        Strict: file content hash (SHA-256).

        Returns the hex digest of the lower-cased extension followed by the
        file bytes, "MISSING_<path>" when the path does not exist, or
        "ERROR_<path>" when reading fails with an OSError.
        """
        if not os.path.exists(filepath):
            return f"MISSING_{filepath}"

        hasher = hashlib.sha256()
        # Include extension to distinguish format changes
        hasher.update(os.path.splitext(filepath)[1].lower().encode())

        try:
            with open(filepath, "rb") as f:
                # Read in 64 KiB chunks so large files are never fully loaded.
                while chunk := f.read(65536):
                    hasher.update(chunk)
        except OSError:
            return f"ERROR_{filepath}"
        return hasher.hexdigest()

    @staticmethod
    def _default(args: tuple, kwargs: dict) -> str:
        """
        Generates a stable SHA-256 hash from function arguments using recursive canonicalization.
        This is the default legacy behavior sensitive to args/kwargs structure.

        If canonicalization or packing fails, a warning is logged and the
        hash of `str((args, kwargs))` is returned instead.
        """
        try:
            # 1. Normalize structure
            normalized = [canonicalize(args), canonicalize(kwargs)]

            # 2. Serialize to bytes
            packed = msgpack.packb(normalized)

            if packed is None:
                raise ValueError("msgpack.packb returned None")

            # 3. Hash (SHA-256)
            return hashlib.sha256(packed).hexdigest()

        except RecursionError:
            # Handled separately only to emit a more specific warning.
            logger.warning(
                "Circular reference detected in arguments; falling back to str-based hash. "
                "This may cause unexpected cache misses if argument repr is not stable."
            )
            return hashlib.sha256(str((args, kwargs)).encode()).hexdigest()
        except Exception:
            logger.warning(
                "Failed to canonicalize or pack arguments; falling back to str-based hash. "
                "This may cause unexpected cache misses if argument repr is not stable."
            )
            return hashlib.sha256(str((args, kwargs)).encode()).hexdigest()

    @staticmethod
    def hash_items(items: list) -> str:
        """
        Helper to hash a list of canonicalized items.

        Packs `items` with msgpack and returns the SHA-256 hex digest; when
        packing fails, logs a warning and hashes `str(items)` instead.
        """
        try:
            packed = msgpack.packb(items)
            if packed is None:
                raise ValueError("msgpack.packb returned None")
            return hashlib.sha256(packed).hexdigest()
        except Exception:
            logger.warning(
                "Failed to pack canonicalized items; falling back to str-based hash. "
                "This may cause unexpected cache misses if argument repr is not stable."
            )
            return hashlib.sha256(str(items).encode()).hexdigest()

    # --- Factory Methods for Policies ---

    @classmethod
    def ignore(cls, *arg_names: str) -> KeyGenPolicy:
        """
        Creates a policy that ignores specific arguments (e.g., 'verbose', 'logger').
        """
        strategies = {name: Strategy.IGNORE for name in arg_names}
        return KeyGenPolicy(strategies, default_strategy=Strategy.DEFAULT)

    @classmethod
    def map(cls, **arg_strategies: Strategy) -> KeyGenPolicy:
        """
        Creates a policy with explicit strategies for specific arguments.
        """
        return KeyGenPolicy(arg_strategies, default_strategy=Strategy.DEFAULT)

    @classmethod
    def file_content(cls, *arg_names: str) -> KeyGenPolicy:
        """
        Creates a policy that treats specified arguments as file paths and hashes their content.
        """
        strategies = {name: Strategy.FILE_CONTENT for name in arg_names}
        return KeyGenPolicy(strategies, default_strategy=Strategy.DEFAULT)

    @classmethod
    def path_stat(cls, *arg_names: str) -> KeyGenPolicy:
        """
        Creates a policy that treats specified arguments as file paths and hashes their metadata (stat).
        """
        strategies = {name: Strategy.PATH_STAT for name in arg_names}
        return KeyGenPolicy(strategies, default_strategy=Strategy.DEFAULT)

file_content(*arg_names) classmethod

Creates a policy that treats specified arguments as file paths and hashes their content.

Source code in src/beautyspot/cachekey.py
@classmethod
def file_content(cls, *arg_names: str) -> KeyGenPolicy:
    """
    Build a policy that hashes the file content behind each named argument.

    Every name in `arg_names` is mapped to Strategy.FILE_CONTENT; arguments
    not listed use Strategy.DEFAULT.
    """
    content_hashed = dict.fromkeys(arg_names, Strategy.FILE_CONTENT)
    return KeyGenPolicy(content_hashed, default_strategy=Strategy.DEFAULT)

from_file_content(filepath) staticmethod

Strict: file content hash (SHA-256)

Source code in src/beautyspot/cachekey.py
@staticmethod
def from_file_content(filepath: str) -> str:
    """Strict: file content hash (SHA-256)"""
    if not os.path.exists(filepath):
        return f"MISSING_{filepath}"

    hasher = hashlib.sha256()
    # Include extension to distinguish format changes
    hasher.update(os.path.splitext(filepath)[1].lower().encode())

    try:
        with open(filepath, "rb") as f:
            while chunk := f.read(65536):
                hasher.update(chunk)
    except OSError:
        return f"ERROR_{filepath}"
    return hasher.hexdigest()

from_path_stat(filepath) staticmethod

Fast: path + size + mtime (SHA-256)

Source code in src/beautyspot/cachekey.py
@staticmethod
def from_path_stat(filepath: str) -> str:
    """Fast: path + size + mtime (SHA-256)"""
    if not os.path.exists(filepath):
        return f"MISSING_{filepath}"
    stat = os.stat(filepath)
    identifier = f"{filepath}_{stat.st_size}_{stat.st_mtime}"
    return hashlib.sha256(identifier.encode()).hexdigest()

hash_items(items) staticmethod

Helper to hash a list of canonicalized items.

Source code in src/beautyspot/cachekey.py
@staticmethod
def hash_items(items: list) -> str:
    """
    Hash a list of canonicalized items into a SHA-256 hex digest.

    The list is packed with msgpack; if packing fails (or yields None), a
    warning is logged and `str(items)` is hashed instead.
    """
    try:
        payload = msgpack.packb(items)
        if payload is None:
            raise ValueError("msgpack.packb returned None")
        digest = hashlib.sha256(payload)
    except Exception:
        logger.warning(
            "Failed to pack canonicalized items; falling back to str-based hash. "
            "This may cause unexpected cache misses if argument repr is not stable."
        )
        digest = hashlib.sha256(str(items).encode())
    return digest.hexdigest()

ignore(*arg_names) classmethod

Creates a policy that ignores specific arguments (e.g., 'verbose', 'logger').

Source code in src/beautyspot/cachekey.py
@classmethod
def ignore(cls, *arg_names: str) -> KeyGenPolicy:
    """
    Build a policy that leaves the named arguments out of the cache key
    (e.g., 'verbose', 'logger').

    Every name in `arg_names` is mapped to Strategy.IGNORE; arguments not
    listed use Strategy.DEFAULT.
    """
    excluded = dict.fromkeys(arg_names, Strategy.IGNORE)
    return KeyGenPolicy(excluded, default_strategy=Strategy.DEFAULT)

map(**arg_strategies) classmethod

Creates a policy with explicit strategies for specific arguments.

Source code in src/beautyspot/cachekey.py
@classmethod
def map(cls, **arg_strategies: Strategy) -> KeyGenPolicy:
    """
    Build a policy from an explicit argument-name -> Strategy mapping.

    Arguments that are not named in `arg_strategies` use Strategy.DEFAULT.
    """
    policy = KeyGenPolicy(arg_strategies, default_strategy=Strategy.DEFAULT)
    return policy

path_stat(*arg_names) classmethod

Creates a policy that treats specified arguments as file paths and hashes their metadata (stat).

Source code in src/beautyspot/cachekey.py
@classmethod
def path_stat(cls, *arg_names: str) -> KeyGenPolicy:
    """
    Build a policy that hashes the file metadata (stat) behind each named
    argument.

    Every name in `arg_names` is mapped to Strategy.PATH_STAT; arguments not
    listed use Strategy.DEFAULT.
    """
    stat_hashed = dict.fromkeys(arg_names, Strategy.PATH_STAT)
    return KeyGenPolicy(stat_hashed, default_strategy=Strategy.DEFAULT)

KeyGenPolicy

A policy object that binds to a function signature to generate cache keys based on argument-specific strategies.

Source code in src/beautyspot/cachekey.py
class KeyGenPolicy:
    """
    Per-argument key generation policy.

    Holds a mapping from argument name to `Strategy` plus a default strategy,
    and can be bound to a function signature to produce a key function.
    """

    def __init__(
        self,
        strategies: Dict[str, Strategy],
        default_strategy: Strategy = Strategy.DEFAULT,
    ):
        self.default_strategy = default_strategy
        self.strategies = strategies

    def bind(self, func: Callable[P, Any]) -> Callable[P, str]:
        """
        Return a key function that accepts the same arguments as `func`.

        The returned callable binds its arguments to `func`'s signature
        (raising TypeError from `Signature.bind` on a mismatch), applies
        defaults, converts each argument according to its strategy and
        returns the hash of the resulting list.
        """
        signature = inspect.signature(func)

        def _bound_keygen(*args: P.args, **kwargs: P.kwargs) -> str:
            # Resolve positional/keyword arguments to parameter names and
            # fill in any defaults the caller omitted.
            call = signature.bind(*args, **kwargs)
            call.apply_defaults()

            canonical_items = []

            # bound arguments iterate in parameter definition order
            for name, val in call.arguments.items():
                strategy = self.strategies.get(name, self.default_strategy)

                if strategy == Strategy.IGNORE:
                    continue

                if strategy == Strategy.FILE_CONTENT:
                    # The value is coerced to str and treated as a file path.
                    canonical_items.append(KeyGen.from_file_content(str(val)))
                    continue

                if strategy == Strategy.PATH_STAT:
                    canonical_items.append(KeyGen.from_path_stat(str(val)))
                    continue

                # Anything else is handled as DEFAULT.
                try:
                    item = canonicalize(val)
                except RecursionError:
                    logger.warning(
                        f"Circular reference detected in argument '{name}'; "
                        "falling back to str-based representation for this argument."
                    )
                    item = str(val)
                canonical_items.append(item)

            # One hash over all per-argument items.
            return KeyGen.hash_items(canonical_items)

        return _bound_keygen

bind(func)

Creates a key generation function bound to the specific signature of func.

Source code in src/beautyspot/cachekey.py
def bind(self, func: Callable[P, Any]) -> Callable[P, str]:
    """
    Return a key function that accepts the same arguments as `func`.

    The returned callable binds its arguments to `func`'s signature (raising
    TypeError from `Signature.bind` on a mismatch), applies defaults,
    converts each argument according to its strategy and returns the hash of
    the resulting list.
    """
    signature = inspect.signature(func)

    def _bound_keygen(*args: P.args, **kwargs: P.kwargs) -> str:
        # Resolve positional/keyword arguments to parameter names and fill in
        # any defaults the caller omitted.
        call = signature.bind(*args, **kwargs)
        call.apply_defaults()

        canonical_items = []

        # bound arguments iterate in parameter definition order
        for name, val in call.arguments.items():
            strategy = self.strategies.get(name, self.default_strategy)

            if strategy == Strategy.IGNORE:
                continue

            if strategy == Strategy.FILE_CONTENT:
                # The value is coerced to str and treated as a file path.
                canonical_items.append(KeyGen.from_file_content(str(val)))
                continue

            if strategy == Strategy.PATH_STAT:
                canonical_items.append(KeyGen.from_path_stat(str(val)))
                continue

            # Anything else is handled as DEFAULT.
            try:
                item = canonicalize(val)
            except RecursionError:
                logger.warning(
                    f"Circular reference detected in argument '{name}'; "
                    "falling back to str-based representation for this argument."
                )
                item = str(val)
            canonical_items.append(item)

        # One hash over all per-argument items.
        return KeyGen.hash_items(canonical_items)

    return _bound_keygen

Strategy

Bases: Enum

Defines the strategy for hashing a specific argument.

Source code in src/beautyspot/cachekey.py
class Strategy(Enum):
    """
    Enumerates how a single argument contributes to the cache key.
    """

    # Recursively canonicalize the value and hash it (default behavior).
    DEFAULT = auto()
    # Leave the argument out of the hash calculation entirely.
    IGNORE = auto()
    # Treat the value as a file path and hash the file's content (strict).
    FILE_CONTENT = auto()
    # Treat the value as a file path and hash its metadata
    # (fast: path + size + mtime).
    PATH_STAT = auto()

canonicalize(obj)

Recursively converts an object into a canonical form suitable for stable Msgpack serialization.

Dispatch order for unregistered types:

1. Primitives → returned as-is (bool is tagged as ("__bool__", value))
2. Numpy-like arrays → tagged tuple via duck typing
3. Object instances → via `__dict__` / `__slots__`
4. Fallback → str()

Source code in src/beautyspot/cachekey.py
@singledispatch
def canonicalize(obj: Any) -> Any:
    """
    Recursively converts an object into a canonical form suitable for stable
    Msgpack serialization.

    Dispatch order for unregistered types:
    1. Primitives        → None/int/float/str/bytes as-is; bool is tagged
                           as ("__bool__", value)
    2. Numpy-like arrays → tagged tuple via duck typing
    3. Object instances  → via __dict__ / __slots__
    4. Fallback          → str() (logs a warning)
    """
    if obj is None:
        return obj
    # bool is a subclass of int, so it is checked first and given a type tag.
    # This makes f(True) and f(1) produce different cache keys.
    if isinstance(obj, bool):
        return ("__bool__", obj)
    if isinstance(obj, (int, float, str, bytes)):
        return obj

    if _is_ndarray_like(obj):
        try:
            return _canonicalize_ndarray(obj)
        except Exception:
            # Best effort: an array-like object that cannot be canonicalized
            # falls through to the generic handlers below. Record the reason
            # at debug level instead of discarding it silently.
            logger.debug(
                "ndarray-like canonicalization failed for %s; "
                "falling through to generic handling.",
                type(obj),
                exc_info=True,
            )

    if hasattr(obj, "__dict__") or hasattr(obj, "__slots__"):
        return _canonicalize_instance(obj)

    logger.warning(
        f"Using str() fallback for unhandled type {type(obj)}. "
        "This may cause unstable cache keys across processes. "
        "Consider explicit type registration."
    )
    return str(obj)

設計思想

キャッシュキーの生成において、beautyspot は以下の 3 つを重視しています。

  1. 安定性 (Stability): Python のデフォルトの __repr__ に含まれるメモリアドレス(例: <Object at 0x...>)に依存せず、オブジェクトの内容に基づいたハッシュを生成します。
  2. 正規化 (Canonicalization): 辞書のキー順序や集合(Set)の順序を固定し、論理的に同じ入力からは必ず同じハッシュが生成されるようにします。
  3. 効率性: バイナリデータ(Numpy 配列等)を扱う際、テキスト変換のオーバーヘッドを避けるため msgpack を利用したバイナリシリアライズを採用しています。

正規化の戦略 (canonicalize)

canonicalize 関数は、あらゆる Python オブジェクトをシリアライズ可能な安定した形式に再帰的に変換します。

  • プリミティブ型: int, float, str, bytes, None はそのまま保持されます。bool は int のサブクラスであるため、("__bool__", 値) という型タグ付きの形式に変換され、f(True) と f(1) は異なるキャッシュキーになります。
  • コレクション: dict はキーでソートされたリストに、set はソートされたリストに変換されます。
  • Numpy 配列: numpy への依存を避けつつ、Duck Typing(shape, dtype, tobytes の確認)によって検知し、バイナリ情報を保持したままハッシュ化されます。これにより、巨大な配列の省略表示によるハッシュ衝突を防ぎます。
  • カスタムオブジェクト: __dict__ または __slots__ をスキャンし、オブジェクトの構造を反映します。 Pydantic (v1/v2) モデルのスキーマ抽出もサポートしています。

キー生成ポリシー (KeyGenPolicy)

特定の引数に対して、ハッシュ計算の方法をカスタマイズできます。

| 戦略 | 内容 |
| --- | --- |
| DEFAULT | オブジェクトを正規化してハッシュ化します。 |
| IGNORE | その引数をハッシュ計算から除外します(例: verbose フラグや logger)。 |
| FILE_CONTENT | 引数をファイルパスとみなし、ファイルの中身のハッシュを使用します。 |
| PATH_STAT | 引数をファイルパスとみなし、メタデータ(パス、サイズ、更新時刻)のハッシュを使用します(高速)。 |

使用例: ポリシーの適用

from beautyspot.cachekey import KeyGen

# Ignore the 'verbose' argument and hash 'input_path' by the file's content.
policy = KeyGen.map(
    input_path=KeyGen.FILE_CONTENT,
    verbose=KeyGen.IGNORE,
)

# NOTE(review): `spot` is not defined in this snippet; presumably an existing
# beautyspot Spot instance created elsewhere — confirm.
@spot.mark(keygen=policy)
def process_file(input_path, verbose=False):
    ...

技術的な詳細

  • シリアライズ: 正規化されたデータは msgpack を用いてバイト列に変換されます。
  • ハッシュアルゴリズム: セキュリティ基準と衝突耐性を考慮し、SHA-256 を採用しています(v1.x の MD5 から刷新されました)。