commit 627941f7673222f72eb5c654bcffe08504a2c798
parent 5ce8b7bc6309bf3c1ee0071fb239e79fcc043800
Author: gracefu <81774659+gracefuu@users.noreply.github.com>
Date: Tue, 15 Apr 2025 22:07:32 +0800
Some docstrings, slots, refactored a bit how cache functions work
Diffstat:
 M examples.py |  14 +++++++-------
 M make.py     | 283 +++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------
 2 files changed, 191 insertions(+), 106 deletions(-)
diff --git a/examples.py b/examples.py
@@ -1,4 +1,4 @@
-from make import cache, rule, detach, Fetch, Task, Build
+from make import hash_cache, rule, detach, Fetch, Task, Build
import asyncio
# Example rules
@@ -6,7 +6,7 @@ import asyncio
# res = await fetch(rule(task_args...))
-@cache
+@hash_cache
@rule
async def _eg_six(fetch: Fetch):
_ = fetch
@@ -35,9 +35,9 @@ async def _eg_multiply_add(fetch: Fetch, taskA: Task, taskB: Task, num: int):
# When interfacing with inputs or in general anything outside the build system,
-# Do NOT add @ctRebuilder, as it makes the task only rerun if a dependency was known to be modified.
+# Do NOT add @hash_cache, as it makes the task rerun only when a dependency is known to have changed.
# In this case, we have no real dependencies, and our output depends on the filesystem.
-# So we leave out @ctRebuilder to ensure we always check that the file has not changed.
+# So we leave out @hash_cache to ensure we always check that the file has not changed.
@rule
async def _eg_file(fetch: Fetch, filename: str):
_ = fetch
@@ -52,7 +52,7 @@ async def _eg_file(fetch: Fetch, filename: str):
_sem = asyncio.Semaphore(4)
-@cache
+@hash_cache
@rule
async def _eg_rec(fetch: Fetch, i: int):
if i // 3 - 1 >= 0:
@@ -70,7 +70,7 @@ async def _eg_rec(fetch: Fetch, i: int):
print("- rec", i)
-async def run_examples():
+async def main():
# To actually run the build system,
# 1) Create the store
# Use context manager to ensure the store is saved automatically when exiting
@@ -90,4 +90,4 @@ async def run_examples():
if __name__ == "__main__":
- asyncio.run(run_examples())
+ asyncio.run(main())
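For orientation, a minimal end-to-end program in the renamed API, assembled from the patterns examples.py itself shows. The exact Build surface (context-manager protocol, name of the fetch method) is not visible in this diff, so treat those details as assumptions:

    from make import hash_cache, rule, Fetch, Build
    import asyncio

    @hash_cache   # replay from the persistent hash-keyed cache when possible
    @rule         # register the function as a rule; calling it builds a Task
    async def double(fetch: Fetch, n: int):
        _ = fetch                     # no dependencies to fetch
        return n * 2

    async def main():
        # Assumption: Build manages the store as a context manager and exposes
        # the suspending fetch; only its __init__ appears in this diff.
        with Build("store.db") as build:
            print(await build.fetch(double(21)))  # -> 42

    if __name__ == "__main__":
        asyncio.run(main())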
diff --git a/make.py b/make.py
@@ -1,27 +1,24 @@
"""
-pymake
-------
+make.py
+-------
Design inspired by the paper "Build Systems à la Carte"
-
- https://github.com/snowleopard/build
- https://www.microsoft.com/en-us/research/wp-content/uploads/2018/03/build-systems.pdf
-As such, we will adopt mostly the same vocabulary:
-
-- The goal of any build system is to bring up to date a *store* that implements a mapping from *keys* to *values*.
-- Keys/values provided by the user are *inputs*, keys/values produced by the build system are *outputs*, and all other keys/values are *intermediate*.
-- *Persistent build information* are additional information kept across invocations, in addition to the store itself.
-- *Task descriptions* specify how the new value for a key should be computed based on the (current) values of the dependencies.
-- A *build system* takes a set of task descriptions, a *target key*, a store, and updates the store such that the target key is up to date.
+Key concepts:
+- The goal is to maintain an up-to-date *store* mapping *tasks* to *values*.
+- Tasks are described by *rules*: functions from parameters to tasks.
+- Each rule can choose its own caching policy, the default being a persistent cache keyed by hashes.
+- The scheduler can also be adjusted, but currently the only scheduler is a top-down suspending scheduler.
-In our system, we make some slight adjustments
+make.py improves upon the paper's design in a few ways:
+- Task keys (used for book-keeping) are derived automatically from the rule functions
+- Cache policies are chosen per task rather than fixed by a global rebuilder
+- Concurrent execution is built on modern Python async features
-- For convenience, we automatically derive the task key from the task function, see fn_to_key.
-- As with the paper, we don't handle dependency cycles, since it's unclear which key to "seed" and with what "seed value".
-- While `rebuilder` is a global that's fixed for the whole build system, we reinterpret it as a cache policy, and so it really makes more sense to let each task choose its cache policy as opposed to having it be global.
-- This means that there is no such thing as an "input" or "output"/"intermediate" key, an input key is simply a key that hasn't been wrapped by a caching layer.
-- We focus on implementing the suspending scheduler and constructive traces cache policy.
+TODO:
+- Make files on the filesystem a core concept, rather than merely something a rule may happen to touch.
"""
import asyncio
@@ -31,28 +28,75 @@ import inspect
import collections
import hashlib
-from typing import Awaitable, Callable, Any, Concatenate, Optional
+from typing import (
+ Any,
+ Optional,
+ Callable,
+ Protocol,
+ Tuple,
+ List,
+ TypedDict,
+)
+
+
+class Fetch(Protocol):
+ """Protocol defining the fetch operation used by tasks."""
+
+ async def __call__(self, task: "Task") -> Any: ...
-Fetch = Callable[["Task"], Awaitable[Any]]
-TaskKey = tuple
RuleKey = bytes
+TaskKey = tuple
+ValueHash = bytes
+TaskInputs = List[Tuple[TaskKey, ValueHash]]
+
+
+class RuleFn(Protocol):
+ """Protocol for rule functions that produce task values."""
-RuleFn = Callable[Concatenate[Fetch, TaskKey, "Store", ...], Awaitable[Any]]
+ async def __call__(
+ self,
+ fetch: Fetch,
+ task_key: TaskKey,
+ store: "Store",
+ *args: Any,
+ ) -> Any: ...
-def _make_hash(o: Any) -> bytes:
- h = hashlib.sha256()
+class NiceRuleFn(Protocol):
+ """Protocol for simplified rule functions that produce task values."""
+
+ async def __call__(
+ self,
+ fetch: Fetch,
+ *args: Any,
+ ) -> Any: ...
+
+
+class CacheFn(Protocol):
+ """Protocol for cache functions that call rule functions."""
+
+ async def __call__(
+ self,
+ fetch: Fetch,
+ task_key: TaskKey,
+ store: "Store",
+ rule_fn: RuleFn,
+ *args: Any,
+ ) -> Any: ...
+
+
+def make_hash(o: Any) -> bytes:
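+    """Hash a value, tagging bytes and non-bytes inputs so they cannot collide."""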
if isinstance(o, bytes):
- h.update(b"s")
+ h = hashlib.sha256(b"s")
h.update(o)
else:
- h.update(b"r")
+ h = hashlib.sha256(b"r")
h.update(repr(o).encode("utf-8"))
return h.digest()
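The b"s"/b"r" prefixes are domain tags: without them, a bytes value could collide with a non-bytes value whose repr produces the same byte string. A quick illustration:

    # repr("abc") == "'abc'", so both calls hash the payload b"'abc'",
    # but the differing tags keep the digests distinct:
    assert make_hash(b"'abc'") != make_hash("abc")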
-def _rule_fn_to_key(fn) -> RuleKey:
+def rule_fn_to_key(fn: Callable) -> RuleKey:
name = fn.__name__
source = inspect.getsource(fn)
h = hashlib.sha256(source.encode("utf-8")).hexdigest()[:16]
@@ -61,49 +105,51 @@ def _rule_fn_to_key(fn) -> RuleKey:
class Task:
+ """A computation of a value."""
+
+ __slots__ = "task_key", "rule_fn", "args", "hash"
+
task_key: TaskKey
rule_fn: RuleFn
args: tuple
hash: int
- @staticmethod
- def new(rule, *args):
- return Task(
- (
- rule.rule_key,
- *(arg.task_key if isinstance(arg, Task) else arg for arg in args),
- ),
- rule.rule_fn,
- *args,
- )
-
- def __init__(self, task_key, rule_fn, *args):
+ def __init__(self, task_key: TaskKey, rule_fn: RuleFn, *args):
self.task_key = task_key
self.rule_fn = rule_fn
self.args = args
self.hash = hash(self.task_key)
- def __call__(self, fetch: "Fetch", store: "Store"):
+ def __call__(self, fetch: Fetch, store: "Store"):
return self.rule_fn(fetch, self.task_key, store, *self.args)
- def __repr__(self):
+ def __repr__(self) -> str:
return repr(self.task_key)
- def __eq__(self, other):
+ def __eq__(self, other: object) -> bool:
+ if not isinstance(other, Task):
+ return NotImplemented
return self.task_key == other.task_key
- def __hash__(self):
+ def __hash__(self) -> int:
return self.hash
class Rule:
+ """A function that returns tasks."""
+
+ __slots__ = "rule_key", "rule_fn", "hash"
+
rule_key: RuleKey
rule_fn: RuleFn
hash: int
@staticmethod
def new(rule_fn: RuleFn):
- return Rule(_rule_fn_to_key(rule_fn), rule_fn)
+ return Rule(
+ rule_fn_to_key(rule_fn),
+ rule_fn,
+ )
def __init__(self, rule_key: RuleKey, rule_fn: RuleFn):
self.rule_key = rule_key
@@ -111,9 +157,18 @@ class Rule:
self.hash = hash(self.rule_key)
def __call__(self, *args):
- return Task.new(self, *args)
+ return Task(
+ (
+ self.rule_key,
+ *(arg.task_key if isinstance(arg, Task) else arg for arg in args),
+ ),
+ self.rule_fn,
+ *args,
+ )
def __eq__(self, other):
+ if not isinstance(other, Rule):
+ return NotImplemented
return self.rule_key == other.rule_key
def __hash__(self):
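Rule.__call__ above means task keys nest: a Task passed as an argument contributes its own task_key, so keys form a tree mirroring the dependency structure (rule names hypothetical):

    inner = add(1, 2)         # task_key == (add.rule_key, 1, 2)
    outer = scale(inner, 10)  # task_key == (scale.rule_key, (add.rule_key, 1, 2), 10)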
@@ -121,12 +176,29 @@ class Rule:
class Rules:
+ """The registry of all rules created."""
+
+ __slots__ = "rules"
+
rules: dict[RuleKey, Rule]
def __init__(self):
self.rules = dict()
- def rule(self, rule_fn):
+    def eval_task_key(self, task_key: TaskKey) -> Optional[Task]:
+        """Rebuild a Task from its key, or None if any referenced rule is gone."""
+        rule_key, *arg_keys = task_key
+        if rule_key not in self.rules:
+            return None
+        rule = self.rules[rule_key]
+        args = []
+        for arg in arg_keys:
+            if isinstance(arg, tuple):
+                # Recurse into nested task keys and propagate failure from any
+                # depth, not just from the top-level rule key.
+                arg = self.eval_task_key(arg)
+                if arg is None:
+                    return None
+            args.append(arg)
+        return rule(*args)
+
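eval_task_key inverts the key derivation: it rebuilds a Task from a key recorded in the persistent store, so cached traces can be replayed across process restarts. Continuing the hypothetical rules above:

    task = scale(add(1, 2), 10)
    assert _rules.eval_task_key(task.task_key) == task   # Tasks compare by key
    assert _rules.eval_task_key((b"stale-rule-key",)) is None  # rule no longer registered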
+ def rule(self, rule_fn: NiceRuleFn) -> Rule:
+ @self.hash_cache
@self.rawrule
@functools.wraps(rule_fn)
def wrapped(fetch, task_key, store, *args):
@@ -134,74 +206,81 @@ class Rules:
return wrapped
- def rawrule(self, rule_fn):
+ def rawrule(self, rule_fn: RuleFn) -> Rule:
rule = Rule.new(rule_fn)
self.rules[rule.rule_key] = rule
return rule
- def eval_task_key(self, task_key) -> Optional[Task]:
- rule_key, *arg_keys = task_key
- if rule_key not in self.rules:
- return None
- rule = self.rules[rule_key]
- args = []
- for arg in arg_keys:
- if isinstance(arg, tuple) and arg[0] not in self.rules:
- return None
- args.append(self.eval_task_key(arg) if isinstance(arg, tuple) else arg)
- return rule(*args)
+ def hash_cache(self, rule: Rule) -> Rule:
+ """Adds hash based caching to a rule
- # Wraps a rule so it only gets rebuilt if the constructive traces don't match
- def cache(self):
- def decorator(rule: Rule):
- @functools.wraps(rule.rule_fn)
- async def new_rule_fn(fetch: Fetch, task_key: TaskKey, store: Store, *args):
- past_runs = store.key_info[task_key]
- output_value = store.key_value[task_key]
- possible_values = []
- for past_inputs, past_value in past_runs:
- for past_input_key, past_input_hash in past_inputs:
- input_task = self.eval_task_key(past_input_key)
- if not input_task:
- break
- current_input_value = await fetch(input_task)
- if _make_hash(current_input_value) != past_input_hash:
- break
- else:
- if output_value == past_value:
- return past_value
- possible_values.append(past_value)
-
- if possible_values:
- store.key_value[task_key] = possible_values[0]
- return possible_values[0]
-
- new_inputs = []
-
- async def track_fetch(task: Task):
- result = await fetch(task)
- new_inputs.append((task.task_key, _make_hash(result)))
- return result
-
- new_value = await rule.rule_fn(track_fetch, task_key, store, *args)
- store.key_value[task_key] = new_value
- store.key_info[task_key].append((new_inputs, new_value))
- return new_value
-
- wrapped_rule = Rule(rule.rule_key, new_rule_fn)
- self.rules[rule.rule_key] = wrapped_rule
- return wrapped_rule
-
- return decorator
+        Replay succeeds when the hash of each recorded input, re-fetched now,
+        matches the hash recorded for it on a previous run; the recorded value
+        is then reused without rerunning the rule body.
+
+        Currently there is no cache eviction policy (all previous runs are
+        stored forever).
+
+        TODO: Implement some cache eviction.
+        """
+ rule.rule_fn = functools.update_wrapper(
+ functools.partial(Rules.hash_cache_fn, self, rule.rule_fn),
+ rule.rule_fn,
+ )
+ return rule
+
+ async def hash_cache_fn(
+ self,
+ inner_rule_fn: RuleFn,
+ fetch: Fetch,
+ task_key: TaskKey,
+ store: "Store",
+ *args,
+ ):
+ """Actual implementation of hash_cache"""
+ if task_key in store.key_info:
+ past_runs = store.key_info[task_key]
+ output_value = store.key_value[task_key]
+ possible_values = []
+ for past_inputs, past_value in past_runs:
+ for past_input_key, past_input_hash in past_inputs:
+ input_task = self.eval_task_key(past_input_key)
+ if not input_task:
+ break
+ current_input_value = await fetch(input_task)
+ if make_hash(current_input_value) != past_input_hash:
+ break
+ else:
+ if output_value == past_value:
+ return past_value
+ possible_values.append(past_value)
+
+ if possible_values:
+ store.key_value[task_key] = possible_values[0]
+ return possible_values[0]
+
+ new_inputs = []
+
+ async def track_fetch(task: Task):
+ result = await fetch(task)
+ new_inputs.append((task.task_key, make_hash(result)))
+ return result
+
+ new_value = await inner_rule_fn(track_fetch, task_key, store, *args)
+ store.key_value[task_key] = new_value
+ store.key_info[task_key].append((new_inputs, new_value))
+ return new_value
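The persistent book-keeping this leaves behind is one constructive trace per run, appended under the task's key; schematically:

    # store.key_info[task_key] : list of (inputs, value) pairs, where
    #   inputs = [(input_task_key, make_hash(input_value)), ...]   # cf. TaskInputs
    # A trace whose inputs all re-hash to their recorded values lets its value
    # be reused without rerunning the rule body.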
_rules = Rules()
rule = _rules.rule
rawrule = _rules.rawrule
-cache = _rules.cache()
+hash_cache = _rules.hash_cache
class Store:
+ """Stores a mapping from tasks to their values."""
+
+ __slots__ = "filename", "rules", "key_value", "key_info"
+
@staticmethod
def _fNone():
return None
@@ -231,6 +310,8 @@ class Store:
class Detach:
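+    """Tracks detached background tasks, keeping references so they are not garbage-collected."""
+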
+ __slots__ = "_background_tasks"
+
def __init__(self):
self._background_tasks = set()
@@ -244,6 +325,8 @@ detach = Detach()
class SuspendingFetch:
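+    """Fetch implementation for the top-down suspending scheduler."""
+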
+ __slots__ = "store", "done", "waits"
+
def __init__(self, store: Store):
self.store = store
self.done = dict()
@@ -275,6 +358,8 @@ class SuspendingFetch:
class Build:
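+    """Convenience entry point that ties a Store to a SuspendingFetch."""
+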
+ __slots__ = "_store", "_fetch"
+
def __init__(self, filename, rules=_rules):
self._store = Store(filename, rules)
self._fetch = SuspendingFetch(self._store)