commit 627941f7673222f72eb5c654bcffe08504a2c798
parent 5ce8b7bc6309bf3c1ee0071fb239e79fcc043800
Author: gracefu <81774659+gracefuu@users.noreply.github.com>
Date: Tue, 15 Apr 2025 22:07:32 +0800
Some docstrings, slots, refactored a bit how cache functions work
Diffstat:
 M examples.py |  14 +++++++-------
 M make.py     | 283 +++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------
 2 files changed, 191 insertions(+), 106 deletions(-)
diff --git a/examples.py b/examples.py
@@ -1,4 +1,4 @@
-from make import cache, rule, detach, Fetch, Task, Build
+from make import hash_cache, rule, detach, Fetch, Task, Build
import asyncio
# Example rules
@@ -6,7 +6,7 @@ import asyncio
# res = await fetch(rule(task_args...))
-@cache
+@hash_cache
@rule
async def _eg_six(fetch: Fetch):
_ = fetch
@@ -35,9 +35,9 @@ async def _eg_multiply_add(fetch: Fetch, taskA: Task, taskB: Task, num: int):
# When interfacing with inputs or in general anything outside the build system,
-# Do NOT add @ctRebuilder, as it makes the task only rerun if a dependency was known to be modified.
+# Do NOT add @hash_cache, as it makes the task rerun only when a dependency is known to have changed.
# In this case, we have no real dependencies, and our output depends on the filesystem.
-# So we leave out @ctRebuilder to ensure we always check that the file has not changed.
+# So we leave out @hash_cache to ensure we always check that the file has not changed.
@rule
async def _eg_file(fetch: Fetch, filename: str):
_ = fetch
@@ -52,7 +52,7 @@ async def _eg_file(fetch: Fetch, filename: str):
_sem = asyncio.Semaphore(4)
-@cache
+@hash_cache
@rule
async def _eg_rec(fetch: Fetch, i: int):
if i // 3 - 1 >= 0:
@@ -70,7 +70,7 @@ async def _eg_rec(fetch: Fetch, i: int):
print("- rec", i)
-async def run_examples():
+async def main():
# To actually run the build system,
# 1) Create the store
# Use context manager to ensure the store is saved automatically when exiting
@@ -90,4 +90,4 @@ async def run_examples():
if __name__ == "__main__":
- asyncio.run(run_examples())
+ asyncio.run(main())
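For orientation, a minimal end-to-end program in the renamed API, assembled from the patterns examples.py itself shows. The exact Build surface (context-manager protocol, name of the fetch method) is not visible in this diff, so treat those details as assumptions:

    from make import hash_cache, rule, Fetch, Build
    import asyncio

    @hash_cache   # replay from the persistent hash-keyed cache when possible
    @rule         # register the function as a rule; calling it builds a Task
    async def double(fetch: Fetch, n: int):
        _ = fetch                     # no dependencies to fetch
        return n * 2

    async def main():
        # Assumption: Build manages the store as a context manager and exposes
        # the suspending fetch; only its __init__ appears in this diff.
        with Build("store.db") as build:
            print(await build.fetch(double(21)))  # -> 42

    if __name__ == "__main__":
        asyncio.run(main())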
diff --git a/make.py b/make.py
@@ -1,27 +1,24 @@
"""
-pymake
-------
+make.py
+-------
Design inspired by the paper "Build Systems à la Carte"
-
- https://github.com/snowleopard/build
- https://www.microsoft.com/en-us/research/wp-content/uploads/2018/03/build-systems.pdf
-As such, we will adopt mostly the same vocabulary:
-
-- The goal of any build system is to bring up to date a *store* that implements a mapping from *keys* to *values*.
-- Keys/values provided by the user are *inputs*, keys/values produced by the build system are *outputs*, and all other keys/values are *intermediate*.
-- *Persistent build information* are additional information kept across invocations, in addition to the store itself.
-- *Task descriptions* specify how the new value for a key should be computed based on the (current) values of the dependencies.
-- A *build system* takes a set of task descriptions, a *target key*, a store, and updates the store such that the target key is up to date.
+Key concepts:
+- The goal is to maintain an up-to-date *store* mapping *tasks* to *values*.
+- Tasks are described by *rules*: functions from parameters to tasks.
+- Each rule can choose its own caching policy, the default being a persistent cache keyed by hashes.
+- The scheduler can also be adjusted, but currently the only scheduler is a top-down suspending scheduler.
-In our system, we make some slight adjustments
+make.py improves upon the paper's design in a few ways:
+- Task keys (used for book-keeping) are derived automatically from the rule functions
+- Cache policies are chosen per task rather than fixed by a global rebuilder
+- Concurrent execution is built on modern Python async features
-- For convenience, we automatically derive the task key from the task function, see fn_to_key.
-- As with the paper, we don't handle dependency cycles, since it's unclear which key to "seed" and with what "seed value".
-- While `rebuilder` is a global that's fixed for the whole build system, we reinterpret it as a cache policy, and so it really makes more sense to let each task choose its cache policy as opposed to having it be global.
-- This means that there is no such thing as an "input" or "output"/"intermediate" key, an input key is simply a key that hasn't been wrapped by a caching layer.
-- We focus on implementing the suspending scheduler and constructive traces cache policy.
+TODO:
+- Make files on the filesystem a core concept, rather than merely something a rule may happen to touch.
"""
import asyncio
@@ -31,28 +28,75 @@ import inspect
import collections
import hashlib
-from typing import Awaitable, Callable, Any, Concatenate, Optional
+from typing import (
+ Any,
+ Optional,
+ Callable,
+ Protocol,
+ Tuple,
+ List,
+ TypedDict,
+)
+
+
+class Fetch(Protocol):
+ """Protocol defining the fetch operation used by tasks."""
+
+ async def __call__(self, task: "Task") -> Any: ...
-Fetch = Callable[["Task"], Awaitable[Any]]
-TaskKey = tuple
RuleKey = bytes
+TaskKey = tuple
+ValueHash = bytes
+TaskInputs = List[Tuple[TaskKey, ValueHash]]
+
+
+class RuleFn(Protocol):
+ """Protocol for rule functions that produce task values."""
-RuleFn = Callable[Concatenate[Fetch, TaskKey, "Store", ...], Awaitable[Any]]
+ async def __call__(
+ self,
+ fetch: Fetch,
+ task_key: TaskKey,
+ store: "Store",
+ *args: Any,
+ ) -> Any: ...
-def _make_hash(o: Any) -> bytes:
- h = hashlib.sha256()
+class NiceRuleFn(Protocol):
+ """Protocol for simplified rule functions that produce task values."""
+
+ async def __call__(
+ self,
+ fetch: Fetch,
+ *args: Any,
+ ) -> Any: ...
+
+
+class CacheFn(Protocol):
+ """Protocol for cache functions that call rule functions."""
+
+ async def __call__(
+ self,
+ fetch: Fetch,
+ task_key: TaskKey,
+ store: "Store",
+ rule_fn: RuleFn,
+ *args: Any,
+ ) -> Any: ...
+
+
+def make_hash(o: Any) -> bytes:
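+    """Hash a value, tagging bytes and non-bytes inputs so they cannot collide."""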
if isinstance(o, bytes):
- h.update(b"s")
+ h = hashlib.sha256(b"s")
h.update(o)
else:
- h.update(b"r")
+ h = hashlib.sha256(b"r")
h.update(repr(o).encode("utf-8"))
return h.digest()
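The b"s"/b"r" prefixes are domain tags: without them, a bytes value could collide with a non-bytes value whose repr produces the same byte string. A quick illustration:

    # repr("abc") == "'abc'", so both calls hash the payload b"'abc'",
    # but the differing tags keep the digests distinct:
    assert make_hash(b"'abc'") != make_hash("abc")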
-def _rule_fn_to_key(fn) -> RuleKey:
+def rule_fn_to_key(fn: Callable) -> RuleKey:
name = fn.__name__
source = inspect.getsource(fn)
h = hashlib.sha256(source.encode("utf-8")).hexdigest()[:16]
@@ -61,49 +105,51 @@ def _rule_fn_to_key(fn) -> RuleKey:
class Task:
+ """A computation of a value."""
+
+ __slots__ = "task_key", "rule_fn", "args", "hash"
+
task_key: TaskKey
rule_fn: RuleFn
args: tuple
hash: int
- @staticmethod
- def new(rule, *args):
- return Task(
- (
- rule.rule_key,
- *(arg.task_key if isinstance(arg, Task) else arg for arg in args),
- ),
- rule.rule_fn,
- *args,
- )
-
- def __init__(self, task_key, rule_fn, *args):
+ def __init__(self, task_key: TaskKey, rule_fn: RuleFn, *args):
self.task_key = task_key
self.rule_fn = rule_fn
self.args = args
self.hash = hash(self.task_key)
- def __call__(self, fetch: "Fetch", store: "Store"):
+ def __call__(self, fetch: Fetch, store: "Store"):
return self.rule_fn(fetch, self.task_key, store, *self.args)
- def __repr__(self):
+ def __repr__(self) -> str:
return repr(self.task_key)
- def __eq__(self, other):
+ def __eq__(self, other: object) -> bool:
+ if not isinstance(other, Task):
+ return NotImplemented
return self.task_key == other.task_key
- def __hash__(self):
+ def __hash__(self) -> int:
return self.hash
class Rule:
+ """A function that returns tasks."""
+
+ __slots__ = "rule_key", "rule_fn", "hash"
+
rule_key: RuleKey
rule_fn: RuleFn
hash: int
@staticmethod
def new(rule_fn: RuleFn):
- return Rule(_rule_fn_to_key(rule_fn), rule_fn)
+ return Rule(
+ rule_fn_to_key(rule_fn),
+ rule_fn,
+ )
def __init__(self, rule_key: RuleKey, rule_fn: RuleFn):
self.rule_key = rule_key
@@ -111,9 +157,18 @@ class Rule:
self.hash = hash(self.rule_key)
def __call__(self, *args):
- return Task.new(self, *args)
+ return Task(
+ (
+ self.rule_key,
+ *(arg.task_key if isinstance(arg, Task) else arg for arg in args),
+ ),
+ self.rule_fn,
+ *args,
+ )
def __eq__(self, other):
+ if not isinstance(other, Rule):
+ return NotImplemented
return self.rule_key == other.rule_key
def __hash__(self):
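Rule.__call__ above means task keys nest: a Task passed as an argument contributes its own task_key, so keys form a tree mirroring the dependency structure (rule names hypothetical):

    inner = add(1, 2)         # task_key == (add.rule_key, 1, 2)
    outer = scale(inner, 10)  # task_key == (scale.rule_key, (add.rule_key, 1, 2), 10)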
@@ -121,12 +176,29 @@ class Rule:
class Rules:
+ """The registry of all rules created."""
+
+ __slots__ = "rules"
+
rules: dict[RuleKey, Rule]
def __init__(self):
self.rules = dict()
- def rule(self, rule_fn):
+    def eval_task_key(self, task_key: TaskKey) -> Optional[Task]:
+        """Rebuild a Task from its key, or None if any referenced rule is gone."""
+        rule_key, *arg_keys = task_key
+        if rule_key not in self.rules:
+            return None
+        rule = self.rules[rule_key]
+        args = []
+        for arg in arg_keys:
+            if isinstance(arg, tuple):
+                # Recurse into nested task keys and propagate failure from any
+                # depth, not just from the top-level rule key.
+                arg = self.eval_task_key(arg)
+                if arg is None:
+                    return None
+            args.append(arg)
+        return rule(*args)
+
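eval_task_key inverts the key derivation: it rebuilds a Task from a key recorded in the persistent store, so cached traces can be replayed across process restarts. Continuing the hypothetical rules above:

    task = scale(add(1, 2), 10)
    assert _rules.eval_task_key(task.task_key) == task   # Tasks compare by key
    assert _rules.eval_task_key((b"stale-rule-key",)) is None  # rule no longer registered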
+ def rule(self, rule_fn: NiceRuleFn) -> Rule:
+ @self.hash_cache
@self.rawrule
@functools.wraps(rule_fn)
def wrapped(fetch, task_key, store, *args):
@@ -134,74 +206,81 @@ class Rules:
return wrapped
- def rawrule(self, rule_fn):
+ def rawrule(self, rule_fn: RuleFn) -> Rule:
rule = Rule.new(rule_fn)
self.rules[rule.rule_key] = rule
return rule
- def eval_task_key(self, task_key) -> Optional[Task]:
- rule_key, *arg_keys = task_key
- if rule_key not in self.rules:
- return None
- rule = self.rules[rule_key]
- args = []
- for arg in arg_keys:
- if isinstance(arg, tuple) and arg[0] not in self.rules:
- return None
- args.append(self.eval_task_key(arg) if isinstance(arg, tuple) else arg)
- return rule(*args)
+ def hash_cache(self, rule: Rule) -> Rule:
+ """Adds hash based caching to a rule
- # Wraps a rule so it only gets rebuilt if the constructive traces don't match
- def cache(self):
- def decorator(rule: Rule):
- @functools.wraps(rule.rule_fn)
- async def new_rule_fn(fetch: Fetch, task_key: TaskKey, store: Store, *args):
- past_runs = store.key_info[task_key]
- output_value = store.key_value[task_key]
- possible_values = []
- for past_inputs, past_value in past_runs:
- for past_input_key, past_input_hash in past_inputs:
- input_task = self.eval_task_key(past_input_key)
- if not input_task:
- break
- current_input_value = await fetch(input_task)
- if _make_hash(current_input_value) != past_input_hash:
- break
- else:
- if output_value == past_value:
- return past_value
- possible_values.append(past_value)
-
- if possible_values:
- store.key_value[task_key] = possible_values[0]
- return possible_values[0]
-
- new_inputs = []
-
- async def track_fetch(task: Task):
- result = await fetch(task)
- new_inputs.append((task.task_key, _make_hash(result)))
- return result
-
- new_value = await rule.rule_fn(track_fetch, task_key, store, *args)
- store.key_value[task_key] = new_value
- store.key_info[task_key].append((new_inputs, new_value))
- return new_value
-
- wrapped_rule = Rule(rule.rule_key, new_rule_fn)
- self.rules[rule.rule_key] = wrapped_rule
- return wrapped_rule
-
- return decorator
+        Replay succeeds when the hash of each recorded input, re-fetched now,
+        matches the hash recorded for it on a previous run; the recorded value
+        is then reused without rerunning the rule body.
+
+        Currently there is no cache eviction policy (all previous runs are
+        stored forever).
+
+        TODO: Implement some cache eviction.
+        """
+ rule.rule_fn = functools.update_wrapper(
+ functools.partial(Rules.hash_cache_fn, self, rule.rule_fn),
+ rule.rule_fn,
+ )
+ return rule
+
+ async def hash_cache_fn(
+ self,
+ inner_rule_fn: RuleFn,
+ fetch: Fetch,
+ task_key: TaskKey,
+ store: "Store",
+ *args,
+ ):
+ """Actual implementation of hash_cache"""
+ if task_key in store.key_info:
+ past_runs = store.key_info[task_key]
+ output_value = store.key_value[task_key]
+ possible_values = []
+ for past_inputs, past_value in past_runs:
+ for past_input_key, past_input_hash in past_inputs:
+ input_task = self.eval_task_key(past_input_key)
+ if not input_task:
+ break
+ current_input_value = await fetch(input_task)
+ if make_hash(current_input_value) != past_input_hash:
+ break
+ else:
+ if output_value == past_value:
+ return past_value
+ possible_values.append(past_value)
+
+ if possible_values:
+ store.key_value[task_key] = possible_values[0]
+ return possible_values[0]
+
+ new_inputs = []
+
+ async def track_fetch(task: Task):
+ result = await fetch(task)
+ new_inputs.append((task.task_key, make_hash(result)))
+ return result
+
+ new_value = await inner_rule_fn(track_fetch, task_key, store, *args)
+ store.key_value[task_key] = new_value
+ store.key_info[task_key].append((new_inputs, new_value))
+ return new_value
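The persistent book-keeping this leaves behind is one constructive trace per run, appended under the task's key; schematically:

    # store.key_info[task_key] : list of (inputs, value) pairs, where
    #   inputs = [(input_task_key, make_hash(input_value)), ...]   # cf. TaskInputs
    # A trace whose inputs all re-hash to their recorded values lets its value
    # be reused without rerunning the rule body.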
_rules = Rules()
rule = _rules.rule
rawrule = _rules.rawrule
-cache = _rules.cache()
+hash_cache = _rules.hash_cache
class Store:
+ """Stores a mapping from tasks to their values."""
+
+ __slots__ = "filename", "rules", "key_value", "key_info"
+
@staticmethod
def _fNone():
return None
@@ -231,6 +310,8 @@ class Store:
class Detach:
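+    """Tracks detached background tasks, keeping references so they are not garbage-collected."""
+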
+ __slots__ = "_background_tasks"
+
def __init__(self):
self._background_tasks = set()
@@ -244,6 +325,8 @@ detach = Detach()
class SuspendingFetch:
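+    """Fetch implementation for the top-down suspending scheduler."""
+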
+ __slots__ = "store", "done", "waits"
+
def __init__(self, store: Store):
self.store = store
self.done = dict()
@@ -275,6 +358,8 @@ class SuspendingFetch:
class Build:
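+    """Convenience entry point that ties a Store to a SuspendingFetch."""
+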
+ __slots__ = "_store", "_fetch"
+
def __init__(self, filename, rules=_rules):
self._store = Store(filename, rules)
self._fetch = SuspendingFetch(self._store)