pymake

A build system based on "Build Systems à la Carte"
git clone https://git.grace.moe/pymake

commit 627941f7673222f72eb5c654bcffe08504a2c798
parent 5ce8b7bc6309bf3c1ee0071fb239e79fcc043800
Author: gracefu <81774659+gracefuu@users.noreply.github.com>
Date:   Tue, 15 Apr 2025 22:07:32 +0800

Add some docstrings and __slots__, and refactor a bit how the cache functions work

Diffstat:
M examples.py | 14 +++++++-------
M make.py | 283 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------
2 files changed, 191 insertions(+), 106 deletions(-)

diff --git a/examples.py b/examples.py
@@ -1,4 +1,4 @@
-from make import cache, rule, detach, Fetch, Task, Build
+from make import hash_cache, rule, detach, Fetch, Task, Build
 import asyncio

 # Example rules
@@ -6,7 +6,7 @@ import asyncio
 #   res = await fetch(rule(task_args...))


-@cache
+@hash_cache
 @rule
 async def _eg_six(fetch: Fetch):
     _ = fetch
@@ -35,9 +35,9 @@ async def _eg_multiply_add(fetch: Fetch, taskA: Task, taskB: Task, num: int):


 # When interfacing with inputs or in general anything outside the build system,
-# Do NOT add @ctRebuilder, as it makes the task only rerun if a dependency was known to be modified.
+# Do NOT add @hash_cache, as it makes the task only rerun if a dependency was known to be modified.
 # In this case, we have no real dependencies, and our output depends on the filesystem.
-# So we leave out @ctRebuilder to ensure we always check that the file has not changed.
+# So we leave out @hash_cache to ensure we always check that the file has not changed.
 @rule
 async def _eg_file(fetch: Fetch, filename: str):
     _ = fetch
@@ -52,7 +52,7 @@ async def _eg_file(fetch: Fetch, filename: str):
 _sem = asyncio.Semaphore(4)


-@cache
+@hash_cache
 @rule
 async def _eg_rec(fetch: Fetch, i: int):
     if i // 3 - 1 >= 0:
@@ -70,7 +70,7 @@ async def _eg_rec(fetch: Fetch, i: int):
     print("- rec", i)


-async def run_examples():
+async def main():
     # To actually run the build system,
     # 1) Create the store
     #    Use context manager to ensure the store is saved automatically when exiting
@@ -90,4 +90,4 @@


 if __name__ == "__main__":
-    asyncio.run(run_examples())
+    asyncio.run(main())
diff --git a/make.py b/make.py
@@ -1,27 +1,24 @@
 """
-pymake
-------
+make.py
+-------

 Design inspired by the paper "Build Systems à la Carte"
-
 - https://github.com/snowleopard/build
 - https://www.microsoft.com/en-us/research/wp-content/uploads/2018/03/build-systems.pdf

-As such, we will adopt mostly the same vocabulary:
-
-- The goal of any build system is to bring up to date a *store* that implements a mapping from *keys* to *values*.
-- Keys/values provided by the user are *inputs*, keys/values produced by the build system are *outputs*, and all other keys/values are *intermediate*.
-- *Persistent build information* are additional information kept across invocations, in addition to the store itself.
-- *Task descriptions* specify how the new value for a key should be computed based on the (current) values of the dependencies.
-- A *build system* takes a set of task descriptions, a *target key*, a store, and updates the store such that the target key is up to date.
+Key concepts:
+- The goal is to maintain an up-to-date *store* mapping *tasks* to *values*.
+- Tasks are described using rules, functions from parameters to tasks.
+- Each rule can choose its own caching policy, the default being a persistent cache keyed by hashes.
+- The scheduler can also be adjusted, but currently the only scheduler is a top-down suspending scheduler.

-In our system, we make some slight adjustments
+make.py improves upon the paper's design in a few ways:
+- Task keys (for book-keeping purposes) are automatically derived from the rule functions
+- Supporting per-task cache policies rather than global rebuilders
+- Using modern Python async features for concurrent execution

-- For convenience, we automatically derive the task key from the task function, see fn_to_key.
-- As with the paper, we don't handle dependency cycles, since it's unclear which key to "seed" and with what "seed value".
-- While `rebuilder` is a global that's fixed for the whole build system, we reinterpret it as a cache policy, and so it really makes more sense to let each task choose its cache policy as opposed to having it be global.
-- This means that there is no such thing as an "input" or "output"/"intermediate" key, an input key is simply a key that hasn't been wrapped by a caching layer.
-- We focus on implementing the suspending scheduler and constructive traces cache policy.
+TODO:
+- Make files on the filesystem a core concept as opposed to merely something you can do.
 """

 import asyncio
@@ -31,28 +28,75 @@
 import inspect
 import collections
 import hashlib

-from typing import Awaitable, Callable, Any, Concatenate, Optional
+from typing import (
+    Any,
+    Optional,
+    Callable,
+    Protocol,
+    Tuple,
+    List,
+    TypedDict,
+)
+
+
+class Fetch(Protocol):
+    """Protocol defining the fetch operation used by tasks."""
+
+    async def __call__(self, task: "Task") -> Any: ...

-Fetch = Callable[["Task"], Awaitable[Any]]
-TaskKey = tuple
 RuleKey = bytes
+TaskKey = tuple
+ValueHash = bytes
+TaskInputs = List[Tuple[TaskKey, ValueHash]]
+
+
+class RuleFn(Protocol):
+    """Protocol for rule functions that produce task values."""

-RuleFn = Callable[Concatenate[Fetch, TaskKey, "Store", ...], Awaitable[Any]]
+    async def __call__(
+        self,
+        fetch: Fetch,
+        task_key: TaskKey,
+        store: "Store",
+        *args: Any,
+    ) -> Any: ...


-def _make_hash(o: Any) -> bytes:
-    h = hashlib.sha256()
+class NiceRuleFn(Protocol):
+    """Protocol for simplified rule functions that produce task values."""
+
+    async def __call__(
+        self,
+        fetch: Fetch,
+        *args: Any,
+    ) -> Any: ...
+
+
+class CacheFn(Protocol):
+    """Protocol for cache functions that call rule functions."""
+
+    async def __call__(
+        self,
+        fetch: Fetch,
+        task_key: TaskKey,
+        store: "Store",
+        rule_fn: RuleFn,
+        *args: Any,
+    ) -> Any: ...
+
+
+def make_hash(o: Any) -> bytes:
     if isinstance(o, bytes):
-        h.update(b"s")
+        h = hashlib.sha256(b"s")
         h.update(o)
     else:
-        h.update(b"r")
+        h = hashlib.sha256(b"r")
         h.update(repr(o).encode("utf-8"))
     return h.digest()


-def _rule_fn_to_key(fn) -> RuleKey:
+def rule_fn_to_key(fn: Callable) -> RuleKey:
     name = fn.__name__
     source = inspect.getsource(fn)
     h = hashlib.sha256(source.encode("utf-8")).hexdigest()[:16]
@@ -61,49 +105,51 @@


 class Task:
+    """A computation of a value."""
+
+    __slots__ = "task_key", "rule_fn", "args", "hash"
+
     task_key: TaskKey
     rule_fn: RuleFn
     args: tuple
     hash: int

-    @staticmethod
-    def new(rule, *args):
-        return Task(
-            (
-                rule.rule_key,
-                *(arg.task_key if isinstance(arg, Task) else arg for arg in args),
-            ),
-            rule.rule_fn,
-            *args,
-        )
-
-    def __init__(self, task_key, rule_fn, *args):
+    def __init__(self, task_key: TaskKey, rule_fn: RuleFn, *args):
         self.task_key = task_key
         self.rule_fn = rule_fn
         self.args = args
         self.hash = hash(self.task_key)

-    def __call__(self, fetch: "Fetch", store: "Store"):
+    def __call__(self, fetch: Fetch, store: "Store"):
         return self.rule_fn(fetch, self.task_key, store, *self.args)

-    def __repr__(self):
+    def __repr__(self) -> str:
         return repr(self.task_key)

-    def __eq__(self, other):
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, Task):
+            return NotImplemented
         return self.task_key == other.task_key

-    def __hash__(self):
+    def __hash__(self) -> int:
         return self.hash


 class Rule:
+    """A function that returns tasks."""
+
+    __slots__ = "rule_key", "rule_fn", "hash"
+
     rule_key: RuleKey
     rule_fn: RuleFn
     hash: int

     @staticmethod
     def new(rule_fn: RuleFn):
-        return Rule(_rule_fn_to_key(rule_fn), rule_fn)
+        return Rule(
+            rule_fn_to_key(rule_fn),
+            rule_fn,
+        )

     def __init__(self, rule_key: RuleKey, rule_fn: RuleFn):
         self.rule_key = rule_key
@@ -111,9 +157,18 @@ class Rule:
         self.hash = hash(self.rule_key)

     def __call__(self, *args):
-        return Task.new(self, *args)
+        return Task(
+            (
+                self.rule_key,
+                *(arg.task_key if isinstance(arg, Task) else arg for arg in args),
+            ),
+            self.rule_fn,
+            *args,
+        )

     def __eq__(self, other):
+        if not isinstance(other, Rule):
+            return NotImplemented
         return self.rule_key == other.rule_key

     def __hash__(self):
@@ -121,12 +176,29 @@


 class Rules:
+    """The registry of all rules created."""
+
+    __slots__ = "rules"
+
     rules: dict[RuleKey, Rule]

     def __init__(self):
         self.rules = dict()

-    def rule(self, rule_fn):
+    def eval_task_key(self, task_key: TaskKey) -> Optional[Task]:
+        rule_key, *arg_keys = task_key
+        if rule_key not in self.rules:
+            return None
+        rule = self.rules[rule_key]
+        args = []
+        for arg in arg_keys:
+            if isinstance(arg, tuple) and arg[0] not in self.rules:
+                return None
+            args.append(self.eval_task_key(arg) if isinstance(arg, tuple) else arg)
+        return rule(*args)
+
+    def rule(self, rule_fn: NiceRuleFn) -> Rule:
+        @self.hash_cache
         @self.rawrule
         @functools.wraps(rule_fn)
         def wrapped(fetch, task_key, store, *args):
@@ -134,74 +206,81 @@ class Rules:

         return wrapped

-    def rawrule(self, rule_fn):
+    def rawrule(self, rule_fn: RuleFn) -> Rule:
         rule = Rule.new(rule_fn)
         self.rules[rule.rule_key] = rule
         return rule

-    def eval_task_key(self, task_key) -> Optional[Task]:
-        rule_key, *arg_keys = task_key
-        if rule_key not in self.rules:
-            return None
-        rule = self.rules[rule_key]
-        args = []
-        for arg in arg_keys:
-            if isinstance(arg, tuple) and arg[0] not in self.rules:
-                return None
-            args.append(self.eval_task_key(arg) if isinstance(arg, tuple) else arg)
-        return rule(*args)
+    def hash_cache(self, rule: Rule) -> Rule:
+        """Adds hash based caching to a rule

-    # Wraps a rule so it only gets rebuilt if the constructive traces don't match
-    def cache(self):
-        def decorator(rule: Rule):
-            @functools.wraps(rule.rule_fn)
-            async def new_rule_fn(fetch: Fetch, task_key: TaskKey, store: Store, *args):
-                past_runs = store.key_info[task_key]
-                output_value = store.key_value[task_key]
-                possible_values = []
-                for past_inputs, past_value in past_runs:
-                    for past_input_key, past_input_hash in past_inputs:
-                        input_task = self.eval_task_key(past_input_key)
-                        if not input_task:
-                            break
-                        current_input_value = await fetch(input_task)
-                        if _make_hash(current_input_value) != past_input_hash:
-                            break
-                    else:
-                        if output_value == past_value:
-                            return past_value
-                        possible_values.append(past_value)
-
-                if possible_values:
-                    store.key_value[task_key] = possible_values[0]
-                    return possible_values[0]
-
-                new_inputs = []
-
-                async def track_fetch(task: Task):
-                    result = await fetch(task)
-                    new_inputs.append((task.task_key, _make_hash(result)))
-                    return result
-
-                new_value = await rule.rule_fn(track_fetch, task_key, store, *args)
-                store.key_value[task_key] = new_value
-                store.key_info[task_key].append((new_inputs, new_value))
-                return new_value
-
-            wrapped_rule = Rule(rule.rule_key, new_rule_fn)
-            self.rules[rule.rule_key] = wrapped_rule
-            return wrapped_rule
-
-        return decorator
+        Attempts to replay the rule by checking if the hashes of each input
+        it would have obtained if run now matches up with a previous run.
+
+        Currently, there is no cache eviction policy (all previous runs are stored forever).
+
+        TODO: Implement some cache eviction.
+        """
+        rule.rule_fn = functools.update_wrapper(
+            functools.partial(Rules.hash_cache_fn, self, rule.rule_fn),
+            rule.rule_fn,
+        )
+        return rule
+
+    async def hash_cache_fn(
+        self,
+        inner_rule_fn: RuleFn,
+        fetch: Fetch,
+        task_key: TaskKey,
+        store: "Store",
+        *args,
+    ):
+        """Actual implementation of hash_cache"""
+        if task_key in store.key_info:
+            past_runs = store.key_info[task_key]
+            output_value = store.key_value[task_key]
+            possible_values = []
+            for past_inputs, past_value in past_runs:
+                for past_input_key, past_input_hash in past_inputs:
+                    input_task = self.eval_task_key(past_input_key)
+                    if not input_task:
+                        break
+                    current_input_value = await fetch(input_task)
+                    if make_hash(current_input_value) != past_input_hash:
+                        break
+                else:
+                    if output_value == past_value:
+                        return past_value
+                    possible_values.append(past_value)
+
+            if possible_values:
+                store.key_value[task_key] = possible_values[0]
+                return possible_values[0]
+
+        new_inputs = []
+
+        async def track_fetch(task: Task):
+            result = await fetch(task)
+            new_inputs.append((task.task_key, make_hash(result)))
+            return result
+
+        new_value = await inner_rule_fn(track_fetch, task_key, store, *args)
+        store.key_value[task_key] = new_value
+        store.key_info[task_key].append((new_inputs, new_value))
+        return new_value


 _rules = Rules()
 rule = _rules.rule
 rawrule = _rules.rawrule
-cache = _rules.cache()
+hash_cache = _rules.hash_cache


 class Store:
+    """Stores a mapping from tasks to their values."""
+
+    __slots__ = "filename", "rules", "key_value", "key_info"
+
     @staticmethod
     def _fNone():
         return None
@@ -231,6 +310,8 @@ class Store:


 class Detach:
+    __slots__ = "_background_tasks"
+
     def __init__(self):
         self._background_tasks = set()

@@ -244,6 +325,8 @@ detach = Detach()


 class SuspendingFetch:
+    __slots__ = "store", "done", "waits"
+
     def __init__(self, store: Store):
         self.store = store
         self.done = dict()
@@ -275,6 +358,8 @@ class SuspendingFetch:


 class Build:
+    __slots__ = "_store", "_fetch"
+
     def __init__(self, filename, rules=_rules):
         self._store = Store(filename, rules)
         self._fetch = SuspendingFetch(self._store)
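
After this commit, a cached rule is declared by stacking @hash_cache on @rule, while a rule that talks to the outside world (for example the filesystem) uses @rule alone so it is always re-checked, as examples.py shows. A minimal sketch in that style follows; the rule names, the store filename, and the Build fetch/context-manager calls are illustrative assumptions, since this diff only shows Build.__init__:

from make import hash_cache, rule, Fetch, Build
import asyncio


@hash_cache  # replayed from the store when all recorded input hashes still match
@rule
async def _eg_double(fetch: Fetch, n: int):
    _ = fetch
    return n * 2


# No @hash_cache: the result depends on the filesystem, so always re-check it.
@rule
async def _eg_read(fetch: Fetch, filename: str):
    _ = fetch
    with open(filename) as f:
        return f.read()


async def main():
    # Assumed API: examples.py creates the store via a context manager so it is
    # saved automatically on exit; the exact fetch call may differ.
    with Build("store.db") as build:
        print(await build.fetch(_eg_double(21)))


if __name__ == "__main__":
    asyncio.run(main())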
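
The replay performed by hash_cache_fn can be pictured without the rest of the build system: each past run records (input task key, input value hash) pairs plus the value it produced, and a run is reused only if every recorded input, re-fetched now, still hashes to the same digest. A self-contained sketch of that check, with make_hash taken from the diff and the trace data made up:

import hashlib


def make_hash(o):
    # Same scheme as make.py: tag bytes vs repr() so the two cannot collide.
    if isinstance(o, bytes):
        h = hashlib.sha256(b"s")
        h.update(o)
    else:
        h = hashlib.sha256(b"r")
        h.update(repr(o).encode("utf-8"))
    return h.digest()


# A fabricated trace for one task: the inputs fetched last run, and the result.
past_inputs = [(("double", 21), make_hash(42))]  # (task key, hash of its value)
past_value = 84


def fetch_now(task_key):
    return 42  # stand-in for awaiting the dependency today


if all(make_hash(fetch_now(key)) == digest for key, digest in past_inputs):
    print("replay:", past_value)  # reuse the stored value, skip the rule body
else:
    print("rerun the rule and record a new trace")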
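
Rule keys themselves come from rule_fn_to_key (renamed from _rule_fn_to_key in this commit): a rule is identified by its function name plus the first 16 hex digits of the SHA-256 of its source, so editing a rule's body invalidates every task key derived from it. The hunk cuts off before the function's final line, so the combining step in this sketch is an assumption:

import hashlib
import inspect


def rule_fn_to_key(fn):
    name = fn.__name__
    source = inspect.getsource(fn)
    h = hashlib.sha256(source.encode("utf-8")).hexdigest()[:16]
    # Assumed combining step; the actual return statement is not shown in the diff.
    return f"{name}:{h}".encode("utf-8")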