pymake

A build system based on "Build Systems à la Carte"
git clone https://git.grace.moe/pymake

commit 09ea2700c79218ee15fb8263f0fdf65657619aa4
parent 8c4e9f385ac40871964ea51fe76fa97f41c13180
Author: gracefu <81774659+gracefuu@users.noreply.github.com>
Date:   Tue, 15 Apr 2025 03:33:06 +0800

Refactor, tweak, support concurrency

Diffstat:
M make.py | 324 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------
1 file changed, 199 insertions(+), 125 deletions(-)

diff --git a/make.py b/make.py
@@ -2,7 +2,7 @@
 pymake
 ------
-Design inspired by the paper `Build Systems à la Carte'
+Design inspired by the paper "Build Systems à la Carte"
 - https://github.com/snowleopard/build
 - https://www.microsoft.com/en-us/research/wp-content/uploads/2018/03/build-systems.pdf
 
 
@@ -17,13 +17,11 @@
 As such, we will adopt mostly the same vocabulary:
 
 In our system, we make some slight adjustments
-- In fact, we don't distinguish tasks and keys -- we pass around the tasks themselves.
-  - For storage purposes, we treat task equality based on string equality of the task function.
-- We focus on implementing the suspending scheduler and constructive traces rebuilder.
+- For convenience, we automatically derive the task key from the task function, see fn_to_key.
 - As with the paper, we don't handle dependency cycles, since it's unclear which key to "seed" and with what "seed value".
-- While `fetch` in the paper is a parameter that's passed around, we just have it be a global function in our case.
-- Similarly, while `rebuilder` is a global that's fixed for the whole build system, we interpret it really as just a fancy way to say @cache, and so it really makes more sense to let each task choose its rebuild strategy.
-- This means that there is no such thing as an "input" or "output"/"intermediate" key, an input key is simply a key that hasn't been wrapped by a rebuilder.
+- While `rebuilder` is a global that's fixed for the whole build system, we reinterpret it as a cache policy, and so it really makes more sense to let each task choose its cache policy as opposed to having it be global.
+- This means that there is no such thing as an "input" or "output"/"intermediate" key, an input key is simply a key that hasn't been wrapped by a caching layer.
+- We focus on implementing the suspending scheduler and constructive traces cache policy.
 """
 
 import asyncio
@@ -35,14 +33,13 @@
 import hashlib
 from typing import Awaitable, Callable, Any, Concatenate, Optional
 
-# Rules are functions that take in python primitives (bool, int, none, str) and tasks, and output a task.
-# Tasks are coroutine functions with a single argument `build`
-# All rules must be registered with the decorator @rule.
-#
-# For convenience, rules with no arguments can also be created by decorating @task on the coroutine function directly.
+FetchFn = Callable[["Task"], Awaitable[Any]]
+TaskKey = str
+RuleKey = str
 
 
-def make_hash(o: Any) -> bytes:
+
+def _make_hash(o: Any) -> bytes:
     h = hashlib.sha256()
     if isinstance(o, bytes):
         h.update(b"s")
@@ -53,7 +50,20 @@
     return h.digest()
 
 
+def _fn_to_key(fn) -> str:
+    name = fn.__name__
+    source = inspect.getsource(fn)
+    h = hashlib.sha256(source.encode("utf-8")).hexdigest()[:16]
+    key = f"{name}-{len(source)}-{h}"
+    return key
+
+
 class Task:
+    task_key: TaskKey
+    rule_fn: Callable[Concatenate[FetchFn, TaskKey, "Store", ...], Awaitable[Any]]
+    args: tuple
+    hash: int
+
     @staticmethod
     def new(rule, *args):
         return Task(
@@ -61,18 +71,18 @@
                 rule.rule_key,
                 *(arg.task_key if hasattr(arg, "task_key") else arg for arg in args),
             ),
-            rule,
+            rule.rule_fn,
             *args,
         )
 
-    def __init__(self, task_key, rule, *args):
+    def __init__(self, task_key, rule_fn, *args):
         self.task_key = task_key
-        self.rule = rule
+        self.rule_fn = rule_fn
         self.args = args
         self.hash = hash(self.task_key)
 
-    def __call__(self, fetch: "Fetch"):
-        return self.rule.rule_fn(fetch, *self.args)
+    def __call__(self, fetch: "FetchFn", store: "Store"):
+        return self.rule_fn(fetch, self.task_key, store, *self.args)
 
     def __repr__(self):
         return repr(self.task_key)
@@ -84,30 +94,14 @@
         return self.hash
 
 
-class Fetch:
-    fetch_fn: Callable[[Task], Awaitable[Any]]
-    task: Task
-    build: "Build"
-
-    def __init__(self, fetch_fn, task, build):
-        self.fetch_fn = fetch_fn
-        self.task = task
-        self.build = build
-
-    def __call__(self, dep: Task):
-        return self.fetch_fn(dep)
-
-
 class Rule:
-    rule_key: str
-    rule_fn: Callable[Concatenate[Fetch, ...], Awaitable[Any]]
+    rule_key: RuleKey
+    rule_fn: Callable[Concatenate[FetchFn, TaskKey, "Store", ...], Awaitable[Any]]
+    hash: int
 
     @staticmethod
     def new(rule_fn):
-        name = rule_fn.__name__
-        source = inspect.getsource(rule_fn)
-        h = hashlib.sha256(source.encode("utf-8")).hexdigest()[:16]
-        return Rule(f"{name}-{len(source)}-{h}", rule_fn)
+        return Rule(_fn_to_key(rule_fn), rule_fn)
 
     def __init__(self, rule_key, rule_fn):
         self.rule_key = rule_key
@@ -125,38 +119,43 @@
 class Rules:
+    rules: dict[RuleKey, Rule]
+
     def __init__(self):
         self.rules = dict()
 
-    def rule(self):
-        def decorator(rule_fn):
-            rule = Rule.new(rule_fn)
-            self.rules[rule.rule_key] = rule
-            return rule
+    def rule(self, rule_fn):
+        @self.rawrule
+        @functools.wraps(rule_fn)
+        def wrapped(fetch, task_key, store, *args):
+            return rule_fn(fetch, *args)
 
-        return decorator
+        return wrapped
+
+    def rawrule(self, rule_fn):
+        rule = Rule.new(rule_fn)
+        self.rules[rule.rule_key] = rule
+        return rule
 
     def eval_task_key(self, task_key) -> Optional[Task]:
         rule_key, *arg_keys = task_key
         if rule_key not in self.rules:
             return None
+        rule = self.rules[rule_key]
+
+        args = []
         for arg in arg_keys:
             if isinstance(arg, tuple) and arg[0] not in self.rules:
                 return None
-        rule = self.rules[rule_key]
-        args = (
-            self.eval_task_key(arg) if isinstance(arg, tuple) else arg
-            for arg in arg_keys
-        )
+            args.append(self.eval_task_key(arg) if isinstance(arg, tuple) else arg)
        return rule(*args)
 
     # Wraps a rule so it only gets rebuilt if the constructive traces don't match
-    def ctRebuilder(self):
+    def cache(self):
        def decorator(rule: Rule):
             @functools.wraps(rule.rule_fn)
-            async def new_rule_fn(fetch, *args):
-                past_runs = fetch.build.key_info[fetch.task.task_key]
-                output_value = fetch.build.key_value[fetch.task.task_key]
+            async def new_rule_fn(fetch: FetchFn, task_key: str, store: "Store", *args):
+                past_runs = store.key_info[task_key]
+                output_value = store.key_value[task_key]
                 possible_values = []
                 for past_inputs, past_value in past_runs:
                     for past_input_key, past_input_hash in past_inputs:
@@ -164,7 +163,7 @@
                         if not input_task:
                             break
                         current_input_value = await fetch(input_task)
-                        if make_hash(current_input_value) != past_input_hash:
+                        if _make_hash(current_input_value) != past_input_hash:
                             break
                     else:
                         if output_value == past_value:
@@ -172,22 +171,19 @@
                             possible_values.append(past_value)
 
                 if possible_values:
-                    fetch.build.key_value[fetch.task.task_key] = possible_values[0]
+                    store.key_value[task_key] = possible_values[0]
                     return possible_values[0]
 
                 new_inputs = []
 
-                async def track(task: Task):
+                async def track_fetch(task: Task):
                     result = await fetch(task)
-                    new_inputs.append((task.task_key, make_hash(result)))
+                    new_inputs.append((task.task_key, _make_hash(result)))
                     return result
 
-                task = Task.new(rule, *args)
-                new_value = await task(Fetch(track, task, fetch.build))
-                fetch.build.key_value[fetch.task.task_key] = new_value
-                fetch.build.key_info[fetch.task.task_key].append(
-                    (new_inputs, new_value)
-                )
+                new_value = await rule.rule_fn(track_fetch, task_key, store, *args)
+                store.key_value[task_key] = new_value
+                store.key_info[task_key].append((new_inputs, new_value))
                 return new_value
 
             wrapped_rule = Rule(rule.rule_key, new_rule_fn)
@@ -197,53 +193,23 @@
         return decorator
 
 
-rules = Rules()
-rule = rules.rule()
-ctRebuilder = rules.ctRebuilder()
-
-
-# Example rule
-@ctRebuilder
-@rule
-async def eg_six(fetch: Fetch):
-    _ = fetch
-    print(f"{6=}")
-    return 6
+_rules = Rules()
+rule = _rules.rule
+rawrule = _rules.rawrule
+cache = _rules.cache()
 
 
-# Example of a rule with a dependency
-@rule
-async def eg_thirtysix(fetch: Fetch):
-    # Rules should be called to get tasks
-    # In this case, the rule had 0 tasks
-    # task = eg_six()
-    # call fetch to mark a dependency,
-    # (and begins execution of it in parallel if possible.)
-    # `await` it get the result of the dependency
-    six1 = await fetch(eg_six())
-    six2 = await fetch(eg_six())
-    print(f"{six1 * six2=}")
-    return six1 * six2
-
-
-# Tasks can be parameterized based on other tasks or just normal values.
-@rule
-async def eg_multiply_add(fetch: Fetch, taskA: Task, taskB: Task, num: int):
-    a, b = await asyncio.gather(fetch(taskA), fetch(taskB))
-    print(f"{a * b + num=}")
-    return a * b + num
-
-
-def fNone():
+def _fNone():
     return None
 
 
-class Build:
+class Store:
     def __init__(self, filename, rules):
         self.filename = filename
         self.rules = rules
-        self.key_value = collections.defaultdict(fNone)
+        self.mutex = asyncio.Semaphore()
+        self.key_value = collections.defaultdict(_fNone)
         self.key_info = collections.defaultdict(list)
 
         try:
@@ -252,42 +218,150 @@
         except:
             pass
 
+    def save(self):
+        with open(self.filename, "wb") as f:
+            pickle.dump((self.key_value, self.key_info), f)
+
     def __enter__(self):
         return self
 
     def __exit__(self, exc_type, exc_val, exc_tb):
-        with open(self.filename, "wb") as f:
-            pickle.dump((self.key_value, self.key_info), f)
+        self.save()
+
+
+_background_tasks = set()
+
+
+def detach(*args, **kwargs):
+    task = asyncio.create_task(*args, **kwargs)
+    _background_tasks.add(task)
+    task.add_done_callback(_background_tasks.discard)
+
+
+class SuspendingFetch:
+    def __init__(self, store: Store):
+        self.store = store
+        self.done = dict()
+        self.waits = dict()
+
+    async def __call__(self, task: Task):
+        await self.fetch(task)
+        await self.wait()
+
+    async def wait(self):
+        while _background_tasks:
+            await asyncio.gather(*_background_tasks)
+
+    async def fetch(self, task: Task):
+        task_key = task.task_key
+        wait = None
+        event = None
+        if task_key in self.done:
+            return self.done[task_key]
+        if task_key in self.waits:
+            wait = self.waits[task_key]
+
+        if wait:
+            await wait.wait()
+            return self.done[task_key]
+
+        event = self.waits[task_key] = asyncio.Event()
+        result = await task(self.fetch, self.store)
+        self.done[task_key] = result
+        event.set()
+
+        return result
+
+
+# Example rules
+# Observe the general pattern that every rule is called to get a task, which can then be fetched.
+# res = await fetch(rule(task_args...))
+
+
+@cache
+@rule
+async def _eg_six(fetch: FetchFn):
+    _ = fetch
+    six = 6
+    print(f"{six=}")
+    return six
 
 
 @rule
-async def eg_file(fetch: Fetch, filename: str):
-    print("file", filename)
+async def _eg_thirtysix(fetch: FetchFn):
+    # Here we await the dependencies serially.
+    # The second dependency cannot start until the first finishes.
+    six1 = await fetch(_eg_six())
+    six2 = await fetch(_eg_six())
+    print(f"{six1*six2=}")
+    return six1 * six2
+
+
+@rule
+async def _eg_multiply_add(fetch: FetchFn, taskA: Task, taskB: Task, num: int):
+    # Here we await the dependencies in parallel.
+    a, b = await asyncio.gather(fetch(taskA), fetch(taskB))
+    await asyncio.sleep(0.1)
+    print(f"{a*b+num=}")
+    return a * b + num
+
+
+# When interfacing with inputs or in general anything outside the build system,
+# Do NOT add @ctRebuilder, as it makes the task only rerun if a dependency was known to be modified.
+# In this case, we have no real dependencies, and our output depends on the filesystem.
+# So we leave out @ctRebuilder to ensure we always check that the file has not changed.
+@rule
+async def _eg_file(fetch: FetchFn, filename: str):
+    _ = fetch
+    await asyncio.sleep(0.1)
     with open(filename, "r") as f:
-        return f.read()
+        contents = f.readlines()
+    print("file", filename, "\n" + "".join(contents[1:5]), end="")
+    return contents
+
+
 # Semaphores can be used to limit concurrency
 _sem = asyncio.Semaphore(4)
 
 
-@ctRebuilder
+@cache
 @rule
-async def eg_rec(fetch: Fetch, i: int):
-    print("rec", i)
-    j = len(await fetch(eg_file("make.py"))) % 2
-    if i > 0:
-        await fetch(eg_rec(i - 1 - j))
-        await fetch(eg_rec(i - 1 - j))
+async def _eg_rec(fetch: FetchFn, i: int):
+    if i // 3 - 1 >= 0:
+        # Instead of awaiting, dependencies can also be detached and run in the background.
+        detach(fetch(_eg_rec(i // 2 - 1)))
+        detach(fetch(_eg_rec(i // 3 - 1)))
     else:
-        print("\n".join((await fetch(eg_file("make.py"))).splitlines()[:9]))
-
+        detach(fetch(_eg_file("make.py")))
+
+    # Use semaphore to limit concurrency easily
+    async with _sem:
+        print("+ rec", i)
+        # Simulate some hard work
+        await asyncio.sleep(0.1)
+        print("- rec", i)
+
+
+async def run_examples():
+    # To actually run the build system,
+    # 1) Create the store
+    #    Use context manager to ensure the store is saved automatically when exiting
+    with Store("make.db", _rules) as store:
+        # 2) Create the fetch callable
+        fetch = SuspendingFetch(store)
+        # 3) Use it to await tasks
+        await fetch(_eg_rec(1234))
+        await asyncio.gather(
+            fetch(_eg_thirtysix()), fetch(_eg_multiply_add(_eg_six(), _eg_six(), 6))
+        )
 
-if __name__ == "__main__":
-    with Build("make.db", rules) as build:
-        done = dict()
+        # Note that `fetch(...)` will wait for all detached jobs to complete before returning.
+        # You may choose to use the lower level `fetch.fetch(...)` function instead, which does not wait for detached jobs.
+        # You must then ensure `fetch.wait()` is called later to wait for detached jobs to complete.
+        await fetch.fetch(_eg_rec(2345))
+        await fetch.fetch(_eg_rec(3456))
+        await fetch.wait()
 
-        async def fetch(task: Task):
-            if task.task_key in done:
-                return done[task.task_key]
-            result = await task(Fetch(fetch, task, build))
-            done[task.task_key] = result
-            return result
 
-        asyncio.run(fetch(eg_rec(10)))
+if __name__ == "__main__":
+    asyncio.run(run_examples())
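Usage sketch (not part of the commit above): the following illustrates the post-refactor API, assuming make.py is importable as the module `make` and exposes rule, cache, Store, SuspendingFetch, FetchFn, and the module-level _rules registry exactly as introduced in this diff. The `double` and `quadruple` rules and the `example.db` filename are hypothetical. Because SuspendingFetch.__call__ awaits a task but does not return its value, the sketch uses the lower-level fetch.fetch(...) together with fetch.wait(), the pattern run_examples demonstrates.

import asyncio

from make import FetchFn, Store, SuspendingFetch, _rules, cache, rule


@cache  # per-task cache policy: reuse a prior result when its constructive trace matches
@rule   # registers the coroutine; its key is derived from its source via _fn_to_key
async def double(fetch: FetchFn, n: int):
    _ = fetch
    return n * 2


@rule  # no @cache: this rule re-runs on every build
async def quadruple(fetch: FetchFn, n: int):
    # Calling a rule yields a Task; fetch records the dependency and runs it.
    two_n = await fetch(double(n))
    return await fetch(double(two_n))


async def main():
    # The Store context manager persists key_value/key_info via save() on exit.
    with Store("example.db", _rules) as store:
        fetch = SuspendingFetch(store)
        # fetch.fetch(...) returns the task's value; fetch.wait() then drains
        # any jobs started with detach().
        result = await fetch.fetch(quadruple(5))
        await fetch.wait()
        print(result)  # 20


if __name__ == "__main__":
    asyncio.run(main())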