Cache Key Generation
beautyspot.cachekey モジュールは、関数の引数(入力)から一意で安定した SHA-256 ハッシュ値を生成する役割を担います。
beautyspot.cachekey
KeyGen
Generates stable cache keys (SHA-256) for function inputs (Identity Layer).
Source code in src/beautyspot/cachekey.py
class KeyGen:
    """
    Generates stable cache keys (SHA-256) for function inputs (Identity Layer).

    The static helpers hash a single file path or a list of items; the
    class-method factories build ``KeyGenPolicy`` objects that apply a
    per-argument ``Strategy``.
    """

    # Constants for convenience usage in KeyGen.map()
    HASH = Strategy.DEFAULT
    IGNORE = Strategy.IGNORE
    FILE_CONTENT = Strategy.FILE_CONTENT
    PATH_STAT = Strategy.PATH_STAT

    @staticmethod
    def from_path_stat(filepath: str) -> str:
        """Fast: path + size + mtime (SHA-256).

        Args:
            filepath: Path of the file to identify.

        Returns:
            The SHA-256 hex digest of ``"{filepath}_{size}_{mtime}"``, or the
            unhashed sentinel ``"MISSING_{filepath}"`` when the path cannot
            be stat'ed.
        """
        # A single os.stat() call replaces the exists()-then-stat() pair so a
        # file deleted in between cannot raise. The caught exception types
        # mirror the ones os.path.exists() treats as "does not exist".
        try:
            stat = os.stat(filepath)
        except (OSError, ValueError):
            return f"MISSING_{filepath}"
        identifier = f"{filepath}_{stat.st_size}_{stat.st_mtime}"
        return hashlib.sha256(identifier.encode()).hexdigest()

    @staticmethod
    def from_file_content(filepath: str) -> str:
        """Strict: file content hash (SHA-256).

        Args:
            filepath: Path of the file whose bytes are hashed.

        Returns:
            The SHA-256 hex digest of the lower-cased extension followed by
            the file bytes; ``"MISSING_{filepath}"`` when the file does not
            exist; ``"ERROR_{filepath}"`` when it exists but cannot be read.
        """
        if not os.path.exists(filepath):
            return f"MISSING_{filepath}"
        hasher = hashlib.sha256()
        # Include extension to distinguish format changes
        hasher.update(os.path.splitext(filepath)[1].lower().encode())
        try:
            with open(filepath, "rb") as f:
                # Read in 64 KiB chunks so large files are never fully loaded.
                while chunk := f.read(65536):
                    hasher.update(chunk)
        except FileNotFoundError:
            # The file vanished between the existence check and open():
            # report it as missing, exactly like the early return above.
            return f"MISSING_{filepath}"
        except OSError:
            return f"ERROR_{filepath}"
        return hasher.hexdigest()

    @staticmethod
    def _default(args: tuple, kwargs: dict) -> str:
        """
        Generates a stable SHA-256 hash from function arguments using recursive canonicalization.
        This is the default legacy behavior sensitive to args/kwargs structure.

        Falls back to hashing ``str((args, kwargs))`` (with a warning) when
        canonicalization or msgpack packing fails.
        """
        try:
            # 1. Normalize structure
            normalized = [canonicalize(args), canonicalize(kwargs)]
            # 2. Serialize to bytes
            packed = msgpack.packb(normalized)
            if packed is None:
                raise ValueError("msgpack.packb returned None")
            # 3. Hash (SHA-256)
            return hashlib.sha256(packed).hexdigest()
        except RecursionError:
            logger.warning(
                "Circular reference detected in arguments; falling back to str-based hash. "
                "This may cause unexpected cache misses if argument repr is not stable."
            )
            return hashlib.sha256(str((args, kwargs)).encode()).hexdigest()
        except Exception:
            logger.warning(
                "Failed to canonicalize or pack arguments; falling back to str-based hash. "
                "This may cause unexpected cache misses if argument repr is not stable."
            )
            return hashlib.sha256(str((args, kwargs)).encode()).hexdigest()

    @staticmethod
    def hash_items(items: list) -> str:
        """Helper to hash a list of canonicalized items.

        Packs ``items`` with msgpack and returns the SHA-256 hex digest; on
        any packing failure it warns and hashes ``str(items)`` instead.
        """
        try:
            packed = msgpack.packb(items)
            if packed is None:
                raise ValueError("msgpack.packb returned None")
            return hashlib.sha256(packed).hexdigest()
        except Exception:
            logger.warning(
                "Failed to pack canonicalized items; falling back to str-based hash. "
                "This may cause unexpected cache misses if argument repr is not stable."
            )
            return hashlib.sha256(str(items).encode()).hexdigest()

    # --- Factory Methods for Policies ---

    @classmethod
    def ignore(cls, *arg_names: str) -> KeyGenPolicy:
        """
        Creates a policy that ignores specific arguments (e.g., 'verbose', 'logger').
        """
        strategies = {name: Strategy.IGNORE for name in arg_names}
        return KeyGenPolicy(strategies, default_strategy=Strategy.DEFAULT)

    @classmethod
    def map(cls, **arg_strategies: Strategy) -> KeyGenPolicy:
        """
        Creates a policy with explicit strategies for specific arguments.
        """
        return KeyGenPolicy(arg_strategies, default_strategy=Strategy.DEFAULT)

    @classmethod
    def file_content(cls, *arg_names: str) -> KeyGenPolicy:
        """
        Creates a policy that treats specified arguments as file paths and hashes their content.
        """
        strategies = {name: Strategy.FILE_CONTENT for name in arg_names}
        return KeyGenPolicy(strategies, default_strategy=Strategy.DEFAULT)

    @classmethod
    def path_stat(cls, *arg_names: str) -> KeyGenPolicy:
        """
        Creates a policy that treats specified arguments as file paths and hashes their metadata (stat).
        """
        strategies = {name: Strategy.PATH_STAT for name in arg_names}
        return KeyGenPolicy(strategies, default_strategy=Strategy.DEFAULT)
|
file_content(*arg_names)
classmethod
Creates a policy that treats specified arguments as file paths and hashes their content.
Source code in src/beautyspot/cachekey.py
@classmethod
def file_content(cls, *arg_names: str) -> KeyGenPolicy:
    """
    Build a policy that hashes the named arguments by file content.

    Every name in ``arg_names`` is mapped to ``Strategy.FILE_CONTENT``; all
    other arguments use ``Strategy.DEFAULT``.
    """
    per_argument = dict.fromkeys(arg_names, Strategy.FILE_CONTENT)
    return KeyGenPolicy(per_argument, default_strategy=Strategy.DEFAULT)
|
from_file_content(filepath)
staticmethod
Strict: file content hash (SHA-256)
Source code in src/beautyspot/cachekey.py
| @staticmethod
def from_file_content(filepath: str) -> str:
"""Strict: file content hash (SHA-256)"""
if not os.path.exists(filepath):
return f"MISSING_{filepath}"
hasher = hashlib.sha256()
# Include extension to distinguish format changes
hasher.update(os.path.splitext(filepath)[1].lower().encode())
try:
with open(filepath, "rb") as f:
while chunk := f.read(65536):
hasher.update(chunk)
except OSError:
return f"ERROR_{filepath}"
return hasher.hexdigest()
|
from_path_stat(filepath)
staticmethod
Fast: path + size + mtime (SHA-256)
Source code in src/beautyspot/cachekey.py
| @staticmethod
def from_path_stat(filepath: str) -> str:
"""Fast: path + size + mtime (SHA-256)"""
if not os.path.exists(filepath):
return f"MISSING_{filepath}"
stat = os.stat(filepath)
identifier = f"{filepath}_{stat.st_size}_{stat.st_mtime}"
return hashlib.sha256(identifier.encode()).hexdigest()
|
hash_items(items)
staticmethod
Helper to hash a list of canonicalized items.
Source code in src/beautyspot/cachekey.py
@staticmethod
def hash_items(items: list) -> str:
    """
    Return the SHA-256 hex digest of a list of canonicalized items.

    The list is packed with msgpack first. If packing raises, or yields
    ``None``, a warning is logged and ``str(items)`` is hashed instead.
    """
    try:
        payload = msgpack.packb(items)
        if payload is None:
            raise ValueError("msgpack.packb returned None")
        digest = hashlib.sha256(payload)
    except Exception:
        logger.warning(
            "Failed to pack canonicalized items; falling back to str-based hash. "
            "This may cause unexpected cache misses if argument repr is not stable."
        )
        digest = hashlib.sha256(str(items).encode())
    return digest.hexdigest()
|
ignore(*arg_names)
classmethod
Creates a policy that ignores specific arguments (e.g., 'verbose', 'logger').
Source code in src/beautyspot/cachekey.py
@classmethod
def ignore(cls, *arg_names: str) -> KeyGenPolicy:
    """
    Build a policy that leaves the named arguments out of the cache key
    (e.g., 'verbose', 'logger').

    Every name in ``arg_names`` is mapped to ``Strategy.IGNORE``; all other
    arguments use ``Strategy.DEFAULT``.
    """
    per_argument = dict.fromkeys(arg_names, Strategy.IGNORE)
    return KeyGenPolicy(per_argument, default_strategy=Strategy.DEFAULT)
|
map(**arg_strategies)
classmethod
Creates a policy with explicit strategies for specific arguments.
Source code in src/beautyspot/cachekey.py
@classmethod
def map(cls, **arg_strategies: Strategy) -> KeyGenPolicy:
    """
    Build a policy from explicit ``argument_name=Strategy`` pairs.

    Arguments that are not named fall back to ``Strategy.DEFAULT``.
    """
    policy = KeyGenPolicy(arg_strategies, default_strategy=Strategy.DEFAULT)
    return policy
|
path_stat(*arg_names)
classmethod
Creates a policy that treats specified arguments as file paths and hashes their metadata (stat).
Source code in src/beautyspot/cachekey.py
@classmethod
def path_stat(cls, *arg_names: str) -> KeyGenPolicy:
    """
    Build a policy that hashes the named arguments by file metadata (stat).

    Every name in ``arg_names`` is mapped to ``Strategy.PATH_STAT``; all
    other arguments use ``Strategy.DEFAULT``.
    """
    per_argument = dict.fromkeys(arg_names, Strategy.PATH_STAT)
    return KeyGenPolicy(per_argument, default_strategy=Strategy.DEFAULT)
|
KeyGenPolicy
A policy object that binds to a function signature to generate cache keys
based on argument-specific strategies.
Source code in src/beautyspot/cachekey.py
class KeyGenPolicy:
    """
    Per-argument key generation policy.

    Holds a mapping from argument name to ``Strategy`` plus a default
    strategy, and can be bound to a function signature to produce a
    cache-key function for that function's calls.
    """

    def __init__(
        self,
        strategies: Dict[str, Strategy],
        default_strategy: Strategy = Strategy.DEFAULT,
    ):
        # Argument name -> strategy; names absent here use default_strategy.
        self.strategies = strategies
        self.default_strategy = default_strategy

    def bind(self, func: Callable[P, Any]) -> Callable[P, str]:
        """
        Return a key generator that accepts the same arguments as `func`.

        The generator binds its arguments to parameter names (defaults
        applied), converts each one according to its strategy, and hashes
        the resulting list with ``KeyGen.hash_items``.
        """
        signature = inspect.signature(func)

        def _bound_keygen(*args: P.args, **kwargs: P.kwargs) -> str:
            # Resolve positional/keyword arguments to names, filling defaults
            call = signature.bind(*args, **kwargs)
            call.apply_defaults()

            parts = []
            # Arguments are visited in definition order
            for arg_name, arg_value in call.arguments.items():
                mode = self.strategies.get(arg_name, self.default_strategy)

                if mode == Strategy.IGNORE:
                    continue
                if mode == Strategy.FILE_CONTENT:
                    # The value is expected to be a path-like string
                    parts.append(KeyGen.from_file_content(str(arg_value)))
                    continue
                if mode == Strategy.PATH_STAT:
                    parts.append(KeyGen.from_path_stat(str(arg_value)))
                    continue

                # Anything else is handled as DEFAULT
                try:
                    canonical = canonicalize(arg_value)
                except RecursionError:
                    logger.warning(
                        f"Circular reference detected in argument '{arg_name}'; "
                        "falling back to str-based representation for this argument."
                    )
                    canonical = str(arg_value)
                parts.append(canonical)

            # One digest over every collected item
            return KeyGen.hash_items(parts)

        return _bound_keygen
|
bind(func)
Creates a key generation function bound to the specific signature of func.
Source code in src/beautyspot/cachekey.py
def bind(self, func: Callable[P, Any]) -> Callable[P, str]:
    """
    Return a key generator that accepts the same arguments as `func`.

    The generator binds its arguments to parameter names (defaults applied),
    converts each one according to its strategy, and hashes the resulting
    list with ``KeyGen.hash_items``.
    """
    signature = inspect.signature(func)

    def _bound_keygen(*args: P.args, **kwargs: P.kwargs) -> str:
        # Resolve positional/keyword arguments to names, filling defaults
        call = signature.bind(*args, **kwargs)
        call.apply_defaults()

        parts = []
        # Arguments are visited in definition order
        for arg_name, arg_value in call.arguments.items():
            mode = self.strategies.get(arg_name, self.default_strategy)

            if mode == Strategy.IGNORE:
                continue
            if mode == Strategy.FILE_CONTENT:
                # The value is expected to be a path-like string
                parts.append(KeyGen.from_file_content(str(arg_value)))
                continue
            if mode == Strategy.PATH_STAT:
                parts.append(KeyGen.from_path_stat(str(arg_value)))
                continue

            # Anything else is handled as DEFAULT
            try:
                canonical = canonicalize(arg_value)
            except RecursionError:
                logger.warning(
                    f"Circular reference detected in argument '{arg_name}'; "
                    "falling back to str-based representation for this argument."
                )
                canonical = str(arg_value)
            parts.append(canonical)

        # One digest over every collected item
        return KeyGen.hash_items(parts)

    return _bound_keygen
|
Strategy
Bases: Enum
Defines the strategy for hashing a specific argument.
Source code in src/beautyspot/cachekey.py
class Strategy(Enum):
    """
    Per-argument hashing strategy used when building a cache key.
    """

    # Recursively canonicalize and hash (Default behavior)
    DEFAULT = auto()
    # Exclude from hash calculation completely
    IGNORE = auto()
    # Treat as file path and hash its content (Strict)
    FILE_CONTENT = auto()
    # Treat as file path and hash its metadata (Fast: path+size+mtime)
    PATH_STAT = auto()
|
canonicalize(obj)
Recursively converts an object into a canonical form suitable for stable
Msgpack serialization.
Dispatch order for unregistered types:
1. Primitives → return as-is (bool is tagged as `("__bool__", value)` so it stays distinct from int)
2. Numpy-like arrays → tagged tuple via duck typing
3. Object instances → via `__dict__` / `__slots__`
4. Fallback → str()
Source code in src/beautyspot/cachekey.py
@singledispatch
def canonicalize(obj: Any) -> Any:
    """
    Recursively converts an object into a canonical form suitable for stable
    Msgpack serialization.

    Dispatch order for unregistered types:
    1. None, int, float, str, bytes → return as-is;
       bool → tagged tuple ``("__bool__", value)``
    2. Numpy-like arrays → tagged tuple via duck typing
    3. Object instances → via __dict__ / __slots__
    4. Fallback → str() (logs a warning)
    """
    if obj is None:
        return obj
    # bool is a subclass of int, so it is checked first and given a type tag.
    # This makes f(True) and f(1) produce different cache keys.
    if isinstance(obj, bool):
        return ("__bool__", obj)
    if isinstance(obj, (int, float, str, bytes)):
        return obj
    if _is_ndarray_like(obj):
        try:
            return _canonicalize_ndarray(obj)
        except Exception:
            # Best effort: keep falling through to the generic handlers, but
            # leave a trace instead of swallowing the failure silently.
            logger.debug(
                "ndarray-like canonicalization failed for %s; "
                "falling back to generic handling.",
                type(obj),
                exc_info=True,
            )
    if hasattr(obj, "__dict__") or hasattr(obj, "__slots__"):
        return _canonicalize_instance(obj)
    logger.warning(
        f"Using str() fallback for unhandled type {type(obj)}. "
        "This may cause unstable cache keys across processes. "
        "Consider explicit type registration."
    )
    return str(obj)
|
設計思想
キャッシュキーの生成において、beautyspot は以下の 3 つを重視しています。
- 安定性 (Stability): Python のデフォルトの `__repr__` に含まれるメモリアドレス(例: `<Object at 0x...>`)に依存せず、オブジェクトの内容に基づいたハッシュを生成します。
- 正規化 (Canonicalization): 辞書のキー順序や集合(Set)の順序を固定し、論理的に同じ入力からは必ず同じハッシュが生成されるようにします。
- 効率性: バイナリデータ(Numpy 配列等)を扱う際、テキスト変換のオーバーヘッドを避けるため `msgpack` を利用したバイナリシリアライズを採用しています。
正規化の戦略 (canonicalize)
canonicalize 関数は、あらゆる Python オブジェクトをシリアライズ可能な安定した形式に再帰的に変換します。
- プリミティブ型:
int, float, str, bytes, None はそのまま保持されます。bool は int と区別するため、`("__bool__", 値)` というタグ付きタプルに変換されます(これにより `f(True)` と `f(1)` は異なるキャッシュキーになります)。
- コレクション:
dict はキーでソートされたリストに、set はソートされたリストに変換されます。
- Numpy 配列:
numpy への依存を避けつつ、Duck Typing(shape, dtype, tobytes の確認)によって検知し、バイナリ情報を保持したままハッシュ化されます。これにより、巨大な配列の省略表示によるハッシュ衝突を防ぎます。
- カスタムオブジェクト:
__dict__ または __slots__ をスキャンし、オブジェクトの構造を反映します。 Pydantic (v1/v2) モデルのスキーマ抽出もサポートしています。
キー生成ポリシー (KeyGenPolicy)
特定の引数に対して、ハッシュ計算の方法をカスタマイズできます。
| 戦略 | 内容 |
| --- | --- |
| `DEFAULT` | オブジェクトを正規化してハッシュ化します。 |
| `IGNORE` | その引数をハッシュ計算から除外します(例: verbose フラグや logger)。 |
| `FILE_CONTENT` | 引数をファイルパスとみなし、ファイルの中身のハッシュを使用します。 |
| `PATH_STAT` | 引数をファイルパスとみなし、メタデータ(パス、サイズ、更新時刻)のハッシュを使用します(高速)。 |
使用例: ポリシーの適用
from beautyspot.cachekey import KeyGen

# Ignore the 'verbose' argument and hash 'input_path' by the file's content
policy = KeyGen.map(
    input_path=KeyGen.FILE_CONTENT,
    verbose=KeyGen.IGNORE,
)


@spot.mark(keygen=policy)
def process_file(input_path, verbose=False):
    ...
技術的な詳細
- シリアライズ: 正規化されたデータは `msgpack` を用いてバイト列に変換されます。
- ハッシュアルゴリズム: セキュリティ基準と衝突耐性を考慮し、SHA-256 を採用しています(v1.x の MD5 から刷新されました)。