From d73047e87fc4ad233da1d1466cb1560949606ec5 Mon Sep 17 00:00:00 2001 From: George Sittas Date: Fri, 15 May 2026 17:39:36 +0300 Subject: [PATCH 1/6] [mypyc] Fix missing cross-group header deps in incremental builds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In separate=True mode, each compiled group writes a set of C files and headers under the build directory. The C files for one group (the "consumer") reach into another group's export-table header via an angled include inside their own __native_internal_.h: __native_internal_caller.h: #include The consumer's .o file bakes in byte offsets from that struct at C compile time. If the other group's struct layout shifts (e.g. a new class is inserted earlier in the source), the consumer's .o must be recompiled to pick up the new layout — otherwise it silently resolves offsets to the wrong class or function at runtime. The old get_header_deps used a regex that collapsed both #include "..." and #include <...> into plain names, then resolved them all relative to target_dir. Two bugs combined to hide the cross-group dep: 1. The resolver only looked at the .c file's direct includes and never opened the resolved headers to follow their own #include directives. The angled cross-group include lives inside __native_internal_.h, not in the .c file itself, so it was never seen. 2. The deps were resolved in a single pass per group, before sibling groups had written their headers to disk, so the cross-group header file didn't yet exist even if the resolver had looked for it. The fix introduces: * _INCLUDE_RE / _extract_includes: a regex that distinguishes the two include forms and returns (is_angled, name) pairs, matching the C preprocessor's own search rules (#include "foo" searches the includer's directory first; #include searches -I paths only). * resolve_cfile_deps: a transitive walker that opens each resolved header and follows its own #include directives, bounded by a visited set. Only headers that exist under target_dir (or, for quoted form, the includer's directory) are included; lib-rt headers like are dropped because they never change between builds. * A two-pass loop in mypyc_build: the first pass writes all groups' C files to disk; the second pass resolves deps so that every group's headers are available when we look for cross-group includes. After the fix, editing a dep module in a way that shifts its export_table struct correctly triggers a recompile of the consumer's .o, preventing silent runtime corruption. --- mypyc/build.py | 136 +++++++++++++++++++++++++++++++----- mypyc/test/test_misc.py | 151 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 271 insertions(+), 16 deletions(-) diff --git a/mypyc/build.py b/mypyc/build.py index 84633086d2724..5b4f2e4de8fa4 100644 --- a/mypyc/build.py +++ b/mypyc/build.py @@ -566,22 +566,114 @@ def construct_groups( return groups -def get_header_deps(cfiles: list[tuple[str, str]]) -> list[str]: - """Find all the headers used by a group of cfiles. +# Single regex that captures both `#include "foo"` and `#include `. The +# alternation lets us tell the two forms apart: the quoted-form match populates +# group 1 and the angle-form match populates group 2. The C preprocessor +# applies different search rules to each kind (see `_extract_includes`), so we +# carry the kind through resolution rather than collapsing them up front. +_INCLUDE_RE = re.compile(r'#\s*include\s+(?:"([^"]+)"|<([^>]+)>)') + + +def _extract_includes(contents: str) -> list[tuple[bool, str]]: + """Return each `#include` directive's (is_angled, name) from `contents`. + + is_angled=False for `#include "foo"`, True for `#include `. + """ + out: list[tuple[bool, str]] = [] + for quoted, angled in _INCLUDE_RE.findall(contents): + if quoted: + out.append((False, quoted)) + else: + out.append((True, angled)) + return out + + +def get_header_deps(cfiles: list[tuple[str, str]]) -> list[tuple[bool, str]]: + """Find all the headers directly included by a group of cfiles. + + Returns a sorted, deduplicated list of `(is_angled, header_name)` pairs. + Callers that only need the names can ignore the bool, but it is needed by + `resolve_cfile_deps` to apply the correct preprocessor search order. We do this by just regexping the source, which is a bit simpler than - properly plumbing the data through. + properly plumbing the data through. Transitive header-to-header includes + are picked up by `resolve_cfile_deps` in `mypyc_build`, which can read + the on-disk headers after every group has written its files. Arguments: cfiles: A list of (file name, file contents) pairs. """ - headers: set[str] = set() + headers: set[tuple[bool, str]] = set() for _, contents in cfiles: - headers.update(re.findall(r'#include [<"]([^>"]+)[>"]', contents)) + headers.update(_extract_includes(contents)) return sorted(headers) +def resolve_cfile_deps( + cfile_dir: str, direct_includes: list[tuple[bool, str]], target_dir: str +) -> set[str]: + """Resolve a .c file's `#include` directives to on-disk paths, walking + transitively through resolved headers. + + The C preprocessor resolves `#include "foo"` against the includer's + directory first, then via -I, while `#include ` only uses -I. We + mirror that exactly: quoted includes are searched in (includer_dir, + target_dir) order, and angled includes are searched in target_dir only. + `target_dir` is the only -I path that holds files we generate; anything + we cannot resolve under it (or, for quoted form, the includer's dir) is + dropped — lib-rt headers like `` and `` live elsewhere + and do not change between builds, so they are not real deps for + incremental purposes. + + The walk is transitive: each resolved header is opened and scanned for + its own `#include` directives. Without this, cross-group export-table + headers reached via `__native_internal_.h` (which includes + ``) would be missed, and edits that shift + struct offsets in `other_group` would not trigger a recompile of the + consumer's .o file. Its baked-in offsets would then resolve to whatever + class/function now occupies that slot — silent runtime corruption. + + Returns a set of resolved paths suitable for use as an Extension.depends + list. + """ + resolved: set[str] = set() + # Worklist of (search_dir, is_angled, header_name). search_dir is the + # includer's directory — for the initial cfile it is the cfile's dir, for + # a transitively-included header it is that header's dir. It is only + # consulted for quoted-form includes. + worklist: list[tuple[str, bool, str]] = [ + (cfile_dir, is_angled, dep) for is_angled, dep in direct_includes + ] + while worklist: + search_dir, is_angled, dep = worklist.pop() + # Quoted form: includer's dir first, then -I (target_dir). + # Angled form: -I only (skips the includer's dir). + search_bases = (target_dir,) if is_angled else (search_dir, target_dir) + for base in search_bases: + candidate = os.path.normpath(os.path.join(base, dep)) + if not os.path.exists(candidate): + continue + if candidate in resolved: + break + resolved.add(candidate) + # Recurse only into headers. Some lib-rt sources are pulled in + # as `#include "init.c"` etc.; those do not resolve under + # target_dir so they get filtered out before we would try to scan + # them, but the .h guard is a cheap belt-and-braces. + if candidate.endswith(".h"): + try: + with open(candidate, encoding="utf-8", errors="replace") as f: + header_contents = f.read() + except OSError: + header_contents = "" + sub_dir = os.path.dirname(candidate) + for sub_angled, sub in _extract_includes(header_contents): + worklist.append((sub_dir, sub_angled, sub)) + break + return resolved + + def mypyc_build( paths: list[str], compiler_options: CompilerOptions, @@ -630,11 +722,17 @@ def mypyc_build( for (path, dirs, internal) in skip_cgen_input[1] ] - # Write out the generated C and collect the files for each group + # Write out the generated C and collect the files for each group. # Should this be here?? group_cfilenames: list[tuple[list[str], list[str]]] = [] + # Per-group list of (full_cfile_path, raw_include_targets). Resolution is + # deferred until every group has written its files: a header in one group + # may include a header generated by another group, so resolving immediately + # would miss cross-group deps for groups processed first. + pending: list[list[tuple[str, list[tuple[bool, str]]]]] = [] for cfiles in group_cfiles: cfilenames = [] + per_cfile_deps: list[tuple[str, list[tuple[bool, str]]]] = [] for cfile, ctext in cfiles: cfile = os.path.join(compiler_options.target_dir, cfile) # Empty contents marks a file the previous run already wrote @@ -643,16 +741,22 @@ def mypyc_build( write_file(cfile, ctext) if os.path.splitext(cfile)[1] == ".c": cfilenames.append(cfile) - - # The header regex matches both quote styles, so the result can - # include system headers like `` that don't live under - # target_dir. Joining those produces non-existent paths which - # would force a full rebuild on every run via Extension.depends. - candidate_deps = ( - os.path.join(compiler_options.target_dir, dep) for dep in get_header_deps(cfiles) - ) - deps = [d for d in candidate_deps if os.path.exists(d)] - group_cfilenames.append((cfilenames, deps)) + per_cfile_deps.append((cfile, get_header_deps([(cfile, ctext)]))) + pending.append(per_cfile_deps) + group_cfilenames.append((cfilenames, [])) + + # Second pass: resolve each .c file's deps transitively now that every + # group's headers are on disk. See resolve_cfile_deps for the rules. + for i, per_cfile in enumerate(pending): + deps_set: set[str] = set() + for cfile_full, dep_names in per_cfile: + deps_set.update( + resolve_cfile_deps( + os.path.dirname(cfile_full), dep_names, compiler_options.target_dir + ) + ) + cfilenames, _ = group_cfilenames[i] + group_cfilenames[i] = (cfilenames, sorted(deps_set)) return groups, group_cfilenames, source_deps diff --git a/mypyc/test/test_misc.py b/mypyc/test/test_misc.py index 4b0bbe5988afb..e52f80dae7355 100644 --- a/mypyc/test/test_misc.py +++ b/mypyc/test/test_misc.py @@ -1,7 +1,10 @@ from __future__ import annotations +import os +import tempfile import unittest +from mypyc.build import get_header_deps, resolve_cfile_deps from mypyc.ir.ops import BasicBlock from mypyc.ir.pprint import format_blocks, generate_names_for_ir from mypyc.irbuild.ll_builder import LowLevelIRBuilder @@ -20,3 +23,151 @@ def test_debug_op(self) -> None: names = generate_names_for_ir([], [block]) code = format_blocks([block], names, {}) assert code[:-1] == ["L0:", " r0 = 'foo'", " CPyDebug_PrintObject(r0)"] + + +class TestHeaderDeps(unittest.TestCase): + """Tests for the header-dependency tracking used to build + `Extension.depends`, which drives setuptools' `newer_group` decision + about whether to recompile a .o file on incremental builds. + + The critical case is cross-group export-table headers: each module's + `__native_internal_.h` does `#include `, + and the consumer's compiled .o file bakes in byte offsets into that + header's `export_table_` struct. If we miss this header in the + deps list, struct-layout changes in `other_group` won't trigger a + rebuild of the consumer, and its baked-in offsets will silently resolve + to whatever now occupies those slots. + """ + + def test_get_header_deps_quoted_includes(self) -> None: + # Quoted includes — the historical form. Used by the .c file to + # reach its own __native_.h / __native_internal_.h. The + # `False` in each tuple marks the include as non-angled, which + # `resolve_cfile_deps` uses to search the includer's directory. + cfile = '#include "__native_caller.h"\n#include "__native_internal_caller.h"\n' + assert get_header_deps([("caller.c", cfile)]) == [ + (False, "__native_caller.h"), + (False, "__native_internal_caller.h"), + ] + + def test_get_header_deps_angle_bracket_includes(self) -> None: + # Angle-bracket includes are also matched, and reported with + # is_angled=True so that the resolver skips the includer's dir + # for them (matching the C preprocessor). The cross-group export + # header is reached via `#include ` + # in __native_internal_.h. Before this was matched the dep + # was missed entirely and the consumer's .o was never invalidated + # when the other group's struct layout shifted. + cfile = "#include \n#include \n" + assert get_header_deps([("caller.c", cfile)]) == [ + (True, "Python.h"), + (True, "lib/__native_functions.h"), + ] + + def test_get_header_deps_mixed_and_whitespace(self) -> None: + # The preprocessor tolerates whitespace and the leading-hash form. + # `get_header_deps` returns sorted tuples — non-angled (False) sorts + # before angled (True), then alphabetical within each kind. + cfile = '# include "a.h"\n# include \n#include\t"c.h"\n' + assert get_header_deps([("x.c", cfile)]) == [(False, "a.h"), (False, "c.h"), (True, "b.h")] + + def test_resolve_walks_transitively_through_headers(self) -> None: + # Reproduces the bug scenario: caller's .c only directly includes + # caller's own headers, but caller's __native_internal_caller.h + # includes the cross-group export header. The resolver must follow + # that chain so setuptools sees the cross-group header as a dep. + with tempfile.TemporaryDirectory() as tmp: + build_dir = tmp + os.makedirs(os.path.join(build_dir, "lib")) + os.makedirs(os.path.join(build_dir, "other_group")) + + # caller.c's directly-included headers — both live alongside + # caller.c under build/ (resolved via target_dir). + internal_h = os.path.join(build_dir, "__native_internal_caller.h") + caller_h = os.path.join(build_dir, "__native_caller.h") + cross_group_h = os.path.join(build_dir, "lib", "__native_functions.h") + unrelated_h = os.path.join(build_dir, "other_group", "__native_other.h") + + with open(caller_h, "w") as f: + # lib-rt headers don't exist on disk under build/, so they + # get dropped during resolution and aren't recursed into. + f.write("#include \n#include \n") + with open(internal_h, "w") as f: + # The smoking gun: this header includes a header in another + # group via angle brackets. Pre-fix, this dep was invisible + # to setuptools. + f.write( + "#include \n" + '#include "__native_caller.h"\n' + "#include \n" + ) + with open(cross_group_h, "w") as f: + f.write("struct export_table_lib___functions { int x; };\n") + with open(unrelated_h, "w") as f: + # Sibling group not reached from caller's chain — must + # NOT appear in the resolved set. + f.write("struct unrelated { int x; };\n") + + # caller.c is in build_dir, so its includer-dir is build_dir. + # Both directly-included headers are quoted (`False`); the + # cross-group header that __native_internal_caller.h reaches + # via `` is found by the recursive + # walk re-reading the on-disk header. + deps = resolve_cfile_deps( + cfile_dir=build_dir, + direct_includes=[ + (False, "__native_caller.h"), + (False, "__native_internal_caller.h"), + ], + target_dir=build_dir, + ) + + assert deps == {caller_h, internal_h, cross_group_h}, ( + f"expected the cross-group header to be reached transitively; " + f"got {sorted(deps)!r}" + ) + + def test_resolve_drops_unresolvable_includes(self) -> None: + # ``, ``, etc. don't live under target_dir, so + # they're dropped from depends. They never change between builds, + # so this is the right behavior — and crucially it stops + # setuptools' `missing="newer"` from treating them as always-newer + # and force-rebuilding every translation unit. + with tempfile.TemporaryDirectory() as tmp: + cfile_dir = tmp + deps = resolve_cfile_deps( + cfile_dir=cfile_dir, + direct_includes=[(True, "Python.h"), (True, "CPy.h"), (False, "init.c")], + target_dir=cfile_dir, + ) + assert deps == set() + + def test_resolve_search_order_matches_preprocessor(self) -> None: + # When the same header name exists both next to the includer and + # under target_dir, the C preprocessor picks the includer-dir copy + # for `#include "shared.h"` and the target_dir copy for + # `#include `. The resolver must record the same path + # the compiler will actually consume, otherwise mtimes of the + # wrong file drive incremental rebuild decisions. + with tempfile.TemporaryDirectory() as tmp: + includer = os.path.join(tmp, "groupA") + target = os.path.join(tmp, "build") + os.makedirs(includer) + os.makedirs(target) + + local_h = os.path.join(includer, "shared.h") + global_h = os.path.join(target, "shared.h") + with open(local_h, "w") as f: + f.write("/* local */\n") + with open(global_h, "w") as f: + f.write("/* global */\n") + + # Quoted form: resolves to the includer-dir copy. + assert resolve_cfile_deps( + cfile_dir=includer, direct_includes=[(False, "shared.h")], target_dir=target + ) == {local_h} + + # Angled form: skips the includer-dir copy, resolves under -I. + assert resolve_cfile_deps( + cfile_dir=includer, direct_includes=[(True, "shared.h")], target_dir=target + ) == {global_h} From 7ed0284c416e4a9c97938534fd8733c31f64c48c Mon Sep 17 00:00:00 2001 From: George Sittas Date: Mon, 18 May 2026 15:53:02 +0300 Subject: [PATCH 2/6] [mypyc] Fix Extension.depends empty for fully-cached groups in incremental builds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In separate=True mode, when a group is fully cached (generate_c returns empty ctext for all its files), get_header_deps was called with empty content, returning no includes. Extension.depends therefore stayed empty for that group, so cross-group export-table header changes — caused by inserting a new class that shifts struct offsets — never triggered a recompile of the cached consumer's .o. The stale .o then baked in the old offsets, silently resolving them to wrong classes at runtime. Fix: when ctext is empty and the .c file exists on disk (written by the previous build), read it before calling get_header_deps so the dep resolver can walk the transitive header chain and include cross-group headers in Extension.depends. Also adds a test that directly demonstrates the before/after: calling get_header_deps with empty content produces no deps, while calling it with the on-disk content surfaces the cross-group header. --- mypyc/build.py | 11 +++++++ mypyc/test/test_misc.py | 65 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+) diff --git a/mypyc/build.py b/mypyc/build.py index 5b4f2e4de8fa4..abe3795965464 100644 --- a/mypyc/build.py +++ b/mypyc/build.py @@ -741,6 +741,17 @@ def mypyc_build( write_file(cfile, ctext) if os.path.splitext(cfile)[1] == ".c": cfilenames.append(cfile) + # For fully-cached groups ctext is empty; read the on-disk .c so + # the dep resolver can walk its transitive header chain and populate + # Extension.depends — otherwise cross-group export-table header + # changes (e.g. a new class shifting struct offsets) won't trigger + # a recompile of this cached consumer's .o. + if not ctext and os.path.exists(cfile): + try: + with open(cfile, encoding="utf-8") as _f: + ctext = _f.read() + except OSError: + pass per_cfile_deps.append((cfile, get_header_deps([(cfile, ctext)]))) pending.append(per_cfile_deps) group_cfilenames.append((cfilenames, [])) diff --git a/mypyc/test/test_misc.py b/mypyc/test/test_misc.py index e52f80dae7355..7af68726922b0 100644 --- a/mypyc/test/test_misc.py +++ b/mypyc/test/test_misc.py @@ -142,6 +142,71 @@ def test_resolve_drops_unresolvable_includes(self) -> None: ) assert deps == set() + def test_cached_group_deps_populated_from_disk_cfile(self) -> None: + # Reproduces the scenario where generate_c returns empty ctext for a + # group (the "fully-cached" path), but the .c file from the previous + # build is on disk. Before the fix, get_header_deps was called with + # empty content, returning no includes, so Extension.depends stayed + # empty and cross-group header changes didn't trigger a .o recompile. + # + # Layout: + # target_dir/consumer/__native_consumer.c <- cached group's .c + # target_dir/consumer/__native_internal_consumer.h + # └─ #include <- cross-group dep + # target_dir/provider/__native_provider.h <- layout changed here + with tempfile.TemporaryDirectory() as tmp: + consumer_dir = os.path.join(tmp, "consumer") + os.makedirs(consumer_dir) + provider_dir = os.path.join(tmp, "provider") + os.makedirs(provider_dir) + + consumer_c = os.path.join(consumer_dir, "__native_consumer.c") + consumer_h = os.path.join(consumer_dir, "__native_consumer.h") + internal_h = os.path.join(consumer_dir, "__native_internal_consumer.h") + cross_group_h = os.path.join(provider_dir, "__native_provider.h") + + with open(consumer_c, "w") as f: + f.write('#include "__native_consumer.h"\n#include "__native_internal_consumer.h"\n') + with open(consumer_h, "w") as f: + f.write("#include \n") + with open(internal_h, "w") as f: + f.write( + "#include \n" + '#include "__native_consumer.h"\n' + "#include \n" + ) + with open(cross_group_h, "w") as f: + f.write("struct export_table_provider { int x; };\n") + + # Without the fix: get_header_deps is called with empty ctext, + # so per_cfile_deps gets an empty include list and Extension.depends + # ends up empty — cross-group header changes go undetected. + deps_without_fix: set[str] = set() + for cfile_full, dep_names in [(consumer_c, get_header_deps([(consumer_c, "")]))]: + deps_without_fix.update( + resolve_cfile_deps(os.path.dirname(cfile_full), dep_names, tmp) + ) + assert deps_without_fix == set() + + # With the fix: the on-disk .c is read when ctext is empty, so + # get_header_deps sees the real includes and the resolver walks + # the transitive chain to find the cross-group header. + try: + with open(consumer_c, encoding="utf-8") as _f: + disk_text = _f.read() + except OSError: + disk_text = "" + deps_with_fix: set[str] = set() + for cfile_full, dep_names in [(consumer_c, get_header_deps([(consumer_c, disk_text)]))]: + deps_with_fix.update( + resolve_cfile_deps(os.path.dirname(cfile_full), dep_names, tmp) + ) + + assert cross_group_h in deps_with_fix, ( + f"cross-group header must be in deps so setuptools recompiles the " + f"stale .o when struct offsets shift; got {sorted(deps_with_fix)!r}" + ) + def test_resolve_search_order_matches_preprocessor(self) -> None: # When the same header name exists both next to the includer and # under target_dir, the C preprocessor picks the includer-dir copy From 69239d4d84429ac7d972bb4c0cc7b6b7f53b8814 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 18 May 2026 12:54:43 +0000 Subject: [PATCH 3/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- mypyc/test/test_misc.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mypyc/test/test_misc.py b/mypyc/test/test_misc.py index 7af68726922b0..418db22f0c0e2 100644 --- a/mypyc/test/test_misc.py +++ b/mypyc/test/test_misc.py @@ -166,7 +166,9 @@ def test_cached_group_deps_populated_from_disk_cfile(self) -> None: cross_group_h = os.path.join(provider_dir, "__native_provider.h") with open(consumer_c, "w") as f: - f.write('#include "__native_consumer.h"\n#include "__native_internal_consumer.h"\n') + f.write( + '#include "__native_consumer.h"\n#include "__native_internal_consumer.h"\n' + ) with open(consumer_h, "w") as f: f.write("#include \n") with open(internal_h, "w") as f: @@ -197,7 +199,9 @@ def test_cached_group_deps_populated_from_disk_cfile(self) -> None: except OSError: disk_text = "" deps_with_fix: set[str] = set() - for cfile_full, dep_names in [(consumer_c, get_header_deps([(consumer_c, disk_text)]))]: + for cfile_full, dep_names in [ + (consumer_c, get_header_deps([(consumer_c, disk_text)])) + ]: deps_with_fix.update( resolve_cfile_deps(os.path.dirname(cfile_full), dep_names, tmp) ) From 5e3b5c27c010bb056780996cd2ea645e288d536e Mon Sep 17 00:00:00 2001 From: George Sittas Date: Tue, 19 May 2026 18:01:43 +0300 Subject: [PATCH 4/6] Address feedback --- mypyc/build.py | 35 +++++++++++---------- mypyc/test/test_misc.py | 69 ----------------------------------------- 2 files changed, 19 insertions(+), 85 deletions(-) diff --git a/mypyc/build.py b/mypyc/build.py index abe3795965464..c906f90e12bab 100644 --- a/mypyc/build.py +++ b/mypyc/build.py @@ -601,8 +601,14 @@ def get_header_deps(cfiles: list[tuple[str, str]]) -> list[tuple[bool, str]]: the on-disk headers after every group has written its files. Arguments: - cfiles: A list of (file name, file contents) pairs. + cfiles: A list of (file name, file contents) pairs. Contents must be + non-empty; callers handling cached groups must re-read the .c + from disk before calling, otherwise direct includes are missed + and Extension.depends ends up empty. """ + assert all( + contents for _, contents in cfiles + ), "get_header_deps requires non-empty file contents" headers: set[tuple[bool, str]] = set() for _, contents in cfiles: headers.update(_extract_includes(contents)) @@ -622,7 +628,7 @@ def resolve_cfile_deps( target_dir) order, and angled includes are searched in target_dir only. `target_dir` is the only -I path that holds files we generate; anything we cannot resolve under it (or, for quoted form, the includer's dir) is - dropped — lib-rt headers like `` and `` live elsewhere + dropped — other headers like `` and `` live elsewhere and do not change between builds, so they are not real deps for incremental purposes. @@ -724,14 +730,12 @@ def mypyc_build( # Write out the generated C and collect the files for each group. # Should this be here?? - group_cfilenames: list[tuple[list[str], list[str]]] = [] - # Per-group list of (full_cfile_path, raw_include_targets). Resolution is - # deferred until every group has written its files: a header in one group - # may include a header generated by another group, so resolving immediately - # would miss cross-group deps for groups processed first. + # + # Header resolution is deferred to a second pass: a header in one group + # may include a header generated by another group, so resolving + # immediately would miss cross-group deps for groups processed first. pending: list[list[tuple[str, list[tuple[bool, str]]]]] = [] for cfiles in group_cfiles: - cfilenames = [] per_cfile_deps: list[tuple[str, list[tuple[bool, str]]]] = [] for cfile, ctext in cfiles: cfile = os.path.join(compiler_options.target_dir, cfile) @@ -739,8 +743,6 @@ def mypyc_build( # (fully-cached group): skip the rewrite and just reuse it. if ctext and not options.mypyc_skip_c_generation: write_file(cfile, ctext) - if os.path.splitext(cfile)[1] == ".c": - cfilenames.append(cfile) # For fully-cached groups ctext is empty; read the on-disk .c so # the dep resolver can walk its transitive header chain and populate # Extension.depends — otherwise cross-group export-table header @@ -754,11 +756,13 @@ def mypyc_build( pass per_cfile_deps.append((cfile, get_header_deps([(cfile, ctext)]))) pending.append(per_cfile_deps) - group_cfilenames.append((cfilenames, [])) - # Second pass: resolve each .c file's deps transitively now that every - # group's headers are on disk. See resolve_cfile_deps for the rules. - for i, per_cfile in enumerate(pending): + # Second pass: assemble each group's .c filenames and resolve transitive + # deps now that every group's headers are on disk. See resolve_cfile_deps + # for the rules. + group_cfilenames: list[tuple[list[str], list[str]]] = [] + for per_cfile in pending: + cfilenames = [cf for cf, _ in per_cfile if os.path.splitext(cf)[1] == ".c"] deps_set: set[str] = set() for cfile_full, dep_names in per_cfile: deps_set.update( @@ -766,8 +770,7 @@ def mypyc_build( os.path.dirname(cfile_full), dep_names, compiler_options.target_dir ) ) - cfilenames, _ = group_cfilenames[i] - group_cfilenames[i] = (cfilenames, sorted(deps_set)) + group_cfilenames.append((cfilenames, sorted(deps_set))) return groups, group_cfilenames, source_deps diff --git a/mypyc/test/test_misc.py b/mypyc/test/test_misc.py index 418db22f0c0e2..e52f80dae7355 100644 --- a/mypyc/test/test_misc.py +++ b/mypyc/test/test_misc.py @@ -142,75 +142,6 @@ def test_resolve_drops_unresolvable_includes(self) -> None: ) assert deps == set() - def test_cached_group_deps_populated_from_disk_cfile(self) -> None: - # Reproduces the scenario where generate_c returns empty ctext for a - # group (the "fully-cached" path), but the .c file from the previous - # build is on disk. Before the fix, get_header_deps was called with - # empty content, returning no includes, so Extension.depends stayed - # empty and cross-group header changes didn't trigger a .o recompile. - # - # Layout: - # target_dir/consumer/__native_consumer.c <- cached group's .c - # target_dir/consumer/__native_internal_consumer.h - # └─ #include <- cross-group dep - # target_dir/provider/__native_provider.h <- layout changed here - with tempfile.TemporaryDirectory() as tmp: - consumer_dir = os.path.join(tmp, "consumer") - os.makedirs(consumer_dir) - provider_dir = os.path.join(tmp, "provider") - os.makedirs(provider_dir) - - consumer_c = os.path.join(consumer_dir, "__native_consumer.c") - consumer_h = os.path.join(consumer_dir, "__native_consumer.h") - internal_h = os.path.join(consumer_dir, "__native_internal_consumer.h") - cross_group_h = os.path.join(provider_dir, "__native_provider.h") - - with open(consumer_c, "w") as f: - f.write( - '#include "__native_consumer.h"\n#include "__native_internal_consumer.h"\n' - ) - with open(consumer_h, "w") as f: - f.write("#include \n") - with open(internal_h, "w") as f: - f.write( - "#include \n" - '#include "__native_consumer.h"\n' - "#include \n" - ) - with open(cross_group_h, "w") as f: - f.write("struct export_table_provider { int x; };\n") - - # Without the fix: get_header_deps is called with empty ctext, - # so per_cfile_deps gets an empty include list and Extension.depends - # ends up empty — cross-group header changes go undetected. - deps_without_fix: set[str] = set() - for cfile_full, dep_names in [(consumer_c, get_header_deps([(consumer_c, "")]))]: - deps_without_fix.update( - resolve_cfile_deps(os.path.dirname(cfile_full), dep_names, tmp) - ) - assert deps_without_fix == set() - - # With the fix: the on-disk .c is read when ctext is empty, so - # get_header_deps sees the real includes and the resolver walks - # the transitive chain to find the cross-group header. - try: - with open(consumer_c, encoding="utf-8") as _f: - disk_text = _f.read() - except OSError: - disk_text = "" - deps_with_fix: set[str] = set() - for cfile_full, dep_names in [ - (consumer_c, get_header_deps([(consumer_c, disk_text)])) - ]: - deps_with_fix.update( - resolve_cfile_deps(os.path.dirname(cfile_full), dep_names, tmp) - ) - - assert cross_group_h in deps_with_fix, ( - f"cross-group header must be in deps so setuptools recompiles the " - f"stale .o when struct offsets shift; got {sorted(deps_with_fix)!r}" - ) - def test_resolve_search_order_matches_preprocessor(self) -> None: # When the same header name exists both next to the includer and # under target_dir, the C preprocessor picks the includer-dir copy From bfe6d8f7d737635cc9cd9ea7d3a251f9c797227b Mon Sep 17 00:00:00 2001 From: George Sittas Date: Wed, 20 May 2026 15:03:21 +0300 Subject: [PATCH 5/6] Add test, patch fudge_dir_mtimes to skip linter outputs This patches `fudge_dir_mtimes` to skip linker outputs (.so/.pyd/.o). The existing -1 fudge on every step pushed `.so` mtimes back, which combined with `write_file`'s +1 mtime bump on `.c` files made `newer_group` always trigger a rebuild regardless of `Extension.depends`, masking this class of bugs in the test framework. --- mypyc/test-data/run-multimodule.test | 54 ++++++++++++++++++++++++++++ mypyc/test/testutil.py | 6 ++++ 2 files changed, 60 insertions(+) diff --git a/mypyc/test-data/run-multimodule.test b/mypyc/test-data/run-multimodule.test index aa9dd7eb5ef1c..92b9fa6623fcc 100644 --- a/mypyc/test-data/run-multimodule.test +++ b/mypyc/test-data/run-multimodule.test @@ -1391,6 +1391,60 @@ def translate(b: bytes) -> bytes: import native assert native.translate(b'ABCD') == b'BBCD' +[case testIncrementalCrossGroupExportTableOffsets] +# Regression: under separate=True, each consumer module's IR is +# compiled against the positional layout of its deps' +# `exports_` struct. Reordering the dep's classes keeps the +# same set of public names, so mypy's interface hash for the dep is +# unchanged -- the consumer is not invalidated and stays fully +# cached, which causes `_load_cached_group_files` to return empty +# cfile content for the consumer's group. +# +# Before the fix, `get_header_deps` over empty content returned no +# includes, so `Extension.depends` for the consumer ended up empty +# and setuptools never recompiled the consumer's .o when the dep's +# `__native_.h` shifted struct offsets. The stale .o kept the +# old offsets and silently resolved cross-group calls to the wrong +# class. +from other_classes import Gamma, Delta + +def make_gamma() -> Gamma: + return Gamma() + +def make_delta() -> Delta: + return Delta() + +[file other_classes.py] +class Alpha: + a: int = 1 + +class Beta: + b: int = 2 + +class Gamma: + g: int = 3 + +class Delta: + d: int = 4 + +[file other_classes.py.2] +class Delta: + d: int = 4 + +class Alpha: + a: int = 1 + +class Beta: + b: int = 2 + +class Gamma: + g: int = 3 + +[file driver.py] +import native +assert type(native.make_gamma()).__name__ == "Gamma" +assert type(native.make_delta()).__name__ == "Delta" + [case testCrossModuleAttrDefaults] from other import Parent diff --git a/mypyc/test/testutil.py b/mypyc/test/testutil.py index 0a558d0d0b8ec..9d59993c03402 100644 --- a/mypyc/test/testutil.py +++ b/mypyc/test/testutil.py @@ -232,8 +232,14 @@ def show_c(cfiles: list[list[tuple[str, str]]]) -> None: def fudge_dir_mtimes(dir: str, delta: int) -> None: + # Skip linker outputs. Pushing them back combines with write_file's + # +1 sec bump on .c files to make .c always newer than .so, forcing + # an unconditional rebuild that would mask Extension.depends bugs. + # See setuptools/_distutils/command/build_ext.py:`build_extension`. for dirpath, _, filenames in os.walk(dir): for name in filenames: + if name.endswith((".so", ".pyd", ".o", ".obj")): + continue path = os.path.join(dirpath, name) new_mtime = os.stat(path).st_mtime + delta os.utime(path, times=(new_mtime, new_mtime)) From ca6b2913e1d13e5980f188b40a192a5aeb0f8342 Mon Sep 17 00:00:00 2001 From: George Sittas Date: Wed, 20 May 2026 15:15:01 +0300 Subject: [PATCH 6/6] Clean up comments --- mypyc/build.py | 78 ++++++++++++++++++--------------------- mypyc/test/test_misc.py | 81 +++++++++++++++-------------------------- 2 files changed, 66 insertions(+), 93 deletions(-) diff --git a/mypyc/build.py b/mypyc/build.py index c906f90e12bab..13bd50fef3b1a 100644 --- a/mypyc/build.py +++ b/mypyc/build.py @@ -619,38 +619,34 @@ def get_header_deps(cfiles: list[tuple[str, str]]) -> list[tuple[bool, str]]: def resolve_cfile_deps( cfile_dir: str, direct_includes: list[tuple[bool, str]], target_dir: str ) -> set[str]: - """Resolve a .c file's `#include` directives to on-disk paths, walking - transitively through resolved headers. - - The C preprocessor resolves `#include "foo"` against the includer's - directory first, then via -I, while `#include ` only uses -I. We - mirror that exactly: quoted includes are searched in (includer_dir, - target_dir) order, and angled includes are searched in target_dir only. - `target_dir` is the only -I path that holds files we generate; anything - we cannot resolve under it (or, for quoted form, the includer's dir) is - dropped — other headers like `` and `` live elsewhere - and do not change between builds, so they are not real deps for - incremental purposes. - - The walk is transitive: each resolved header is opened and scanned for - its own `#include` directives. Without this, cross-group export-table - headers reached via `__native_internal_.h` (which includes - ``) would be missed, and edits that shift - struct offsets in `other_group` would not trigger a recompile of the - consumer's .o file. Its baked-in offsets would then resolve to whatever - class/function now occupies that slot — silent runtime corruption. - - Returns a set of resolved paths suitable for use as an Extension.depends - list. + """ + Resolve a .c file's `#include`s to on-disk paths, walking transitively through resolved headers. + + The C preprocessor resolves `#include "foo"` against the includer's directory first, then via + -I, while `#include ` only uses -I. We mirror that exactly: quoted includes are searched + in (includer_dir, target_dir) order, and angled includes are searched in target_dir only. + `target_dir` is the only -I path that holds files we generate; anything we cannot resolve under + it (or, for quoted form, the includer's dir) is dropped. Other headers like `` and + `` live elsewhere and do not change between builds, so they are not real dependencies + for incremental purposes. + + The walk is transitive: each resolved header is opened and scanned for its own `#include` + directives. Without this, cross-group export-table headers reached via `__native_internal_.h` + (which includes ``) would be missed, and edits that shift struct + offsets in `other_group` would not trigger a recompile of the consumer's .o file. Its baked-in + offsets would then resolve to whatever class/function now occupies that slot => runtime corruption. + + Returns a set of resolved paths suitable for use as an Extension.depends list. """ resolved: set[str] = set() - # Worklist of (search_dir, is_angled, header_name). search_dir is the - # includer's directory — for the initial cfile it is the cfile's dir, for - # a transitively-included header it is that header's dir. It is only - # consulted for quoted-form includes. + + # Worklist of (search_dir, is_angled, header_name). search_dir is the includer's directory; for the + # initial cfile it is the cfile's dir, for a transitively-included header it is that header's dir. + # It is only consulted for quoted-form includes. worklist: list[tuple[str, bool, str]] = [ (cfile_dir, is_angled, dep) for is_angled, dep in direct_includes ] + while worklist: search_dir, is_angled, dep = worklist.pop() # Quoted form: includer's dir first, then -I (target_dir). @@ -663,13 +659,12 @@ def resolve_cfile_deps( if candidate in resolved: break resolved.add(candidate) - # Recurse only into headers. Some lib-rt sources are pulled in - # as `#include "init.c"` etc.; those do not resolve under - # target_dir so they get filtered out before we would try to scan + # Recurse only into headers. Some lib-rt sources are pulled in as `#include "init.c"` etc.; + # those do not resolve under target_dir so they get filtered out before we would try to scan # them, but the .h guard is a cheap belt-and-braces. if candidate.endswith(".h"): try: - with open(candidate, encoding="utf-8", errors="replace") as f: + with open(candidate, encoding="utf-8") as f: header_contents = f.read() except OSError: header_contents = "" @@ -731,23 +726,23 @@ def mypyc_build( # Write out the generated C and collect the files for each group. # Should this be here?? # - # Header resolution is deferred to a second pass: a header in one group - # may include a header generated by another group, so resolving - # immediately would miss cross-group deps for groups processed first. + # Header resolution is deferred to a second pass: a header in one group may include a header + # generated by another group, so resolving here misses cross-group deps for groups processed first. pending: list[list[tuple[str, list[tuple[bool, str]]]]] = [] for cfiles in group_cfiles: per_cfile_deps: list[tuple[str, list[tuple[bool, str]]]] = [] for cfile, ctext in cfiles: cfile = os.path.join(compiler_options.target_dir, cfile) + # Empty contents marks a file the previous run already wrote # (fully-cached group): skip the rewrite and just reuse it. if ctext and not options.mypyc_skip_c_generation: write_file(cfile, ctext) - # For fully-cached groups ctext is empty; read the on-disk .c so - # the dep resolver can walk its transitive header chain and populate - # Extension.depends — otherwise cross-group export-table header - # changes (e.g. a new class shifting struct offsets) won't trigger - # a recompile of this cached consumer's .o. + + # For fully-cached groups ctext is empty; read the on-disk .c so the dep resolver + # can walk its transitive header chain and populate Extension.depends. Otherwise, + # cross-group export-table header changes (e.g. a new class shifting struct offsets) + # won't trigger a recompile of this cached consumer's .o. if not ctext and os.path.exists(cfile): try: with open(cfile, encoding="utf-8") as _f: @@ -757,9 +752,8 @@ def mypyc_build( per_cfile_deps.append((cfile, get_header_deps([(cfile, ctext)]))) pending.append(per_cfile_deps) - # Second pass: assemble each group's .c filenames and resolve transitive - # deps now that every group's headers are on disk. See resolve_cfile_deps - # for the rules. + # Second pass: assemble each group's .c filenames and resolve transitive deps now that every group's + # headers are on disk. See resolve_cfile_deps for the rules. group_cfilenames: list[tuple[list[str], list[str]]] = [] for per_cfile in pending: cfilenames = [cf for cf, _ in per_cfile if os.path.splitext(cf)[1] == ".c"] diff --git a/mypyc/test/test_misc.py b/mypyc/test/test_misc.py index e52f80dae7355..816875fcc23db 100644 --- a/mypyc/test/test_misc.py +++ b/mypyc/test/test_misc.py @@ -26,23 +26,14 @@ def test_debug_op(self) -> None: class TestHeaderDeps(unittest.TestCase): - """Tests for the header-dependency tracking used to build - `Extension.depends`, which drives setuptools' `newer_group` decision - about whether to recompile a .o file on incremental builds. - - The critical case is cross-group export-table headers: each module's - `__native_internal_.h` does `#include `, - and the consumer's compiled .o file bakes in byte offsets into that - header's `export_table_` struct. If we miss this header in the - deps list, struct-layout changes in `other_group` won't trigger a - rebuild of the consumer, and its baked-in offsets will silently resolve - to whatever now occupies those slots. + """ + Tests for the header-dependency tracking used to build `Extension.depends`, which drives + setuptools' `newer_group` decision about whether to recompile a .o file on incremental builds. """ def test_get_header_deps_quoted_includes(self) -> None: - # Quoted includes — the historical form. Used by the .c file to - # reach its own __native_.h / __native_internal_.h. The - # `False` in each tuple marks the include as non-angled, which + # Quoted includes: the historical form. Used by the .c file to reach its own __native_.h / + # __native_internal_.h. The `False` in each tuple marks the include as non-angled, which # `resolve_cfile_deps` uses to search the includer's directory. cfile = '#include "__native_caller.h"\n#include "__native_internal_caller.h"\n' assert get_header_deps([("caller.c", cfile)]) == [ @@ -51,13 +42,11 @@ def test_get_header_deps_quoted_includes(self) -> None: ] def test_get_header_deps_angle_bracket_includes(self) -> None: - # Angle-bracket includes are also matched, and reported with - # is_angled=True so that the resolver skips the includer's dir - # for them (matching the C preprocessor). The cross-group export - # header is reached via `#include ` - # in __native_internal_.h. Before this was matched the dep - # was missed entirely and the consumer's .o was never invalidated - # when the other group's struct layout shifted. + # Angle-bracket includes are also matched, and reported with is_angled=True so that the resolver + # skips the includer's dir for them (matching the C preprocessor). The cross-group export header + # is reached via `#include ` in __native_internal_.h. Before + # this was matched the dep was missed entirely and the consumer's .o was never invalidated when + # the other group's struct layout shifted. cfile = "#include \n#include \n" assert get_header_deps([("caller.c", cfile)]) == [ (True, "Python.h"), @@ -65,23 +54,21 @@ def test_get_header_deps_angle_bracket_includes(self) -> None: ] def test_get_header_deps_mixed_and_whitespace(self) -> None: - # The preprocessor tolerates whitespace and the leading-hash form. - # `get_header_deps` returns sorted tuples — non-angled (False) sorts - # before angled (True), then alphabetical within each kind. + # The preprocessor tolerates whitespace and the leading-hash form. `get_header_deps` returns sorted + # tuples — non-angled (False) sorts before angled (True), then alphabetical within each kind. cfile = '# include "a.h"\n# include \n#include\t"c.h"\n' assert get_header_deps([("x.c", cfile)]) == [(False, "a.h"), (False, "c.h"), (True, "b.h")] def test_resolve_walks_transitively_through_headers(self) -> None: - # Reproduces the bug scenario: caller's .c only directly includes - # caller's own headers, but caller's __native_internal_caller.h - # includes the cross-group export header. The resolver must follow - # that chain so setuptools sees the cross-group header as a dep. + # Reproduces the bug scenario: caller's .c only directly includes caller's own headers, but + # caller's __native_internal_caller.h includes the cross-group export header. The resolver + # must follow that chain so setuptools sees the cross-group header as a dep. with tempfile.TemporaryDirectory() as tmp: build_dir = tmp os.makedirs(os.path.join(build_dir, "lib")) os.makedirs(os.path.join(build_dir, "other_group")) - # caller.c's directly-included headers — both live alongside + # caller.c's directly-included headers, both live alongside # caller.c under build/ (resolved via target_dir). internal_h = os.path.join(build_dir, "__native_internal_caller.h") caller_h = os.path.join(build_dir, "__native_caller.h") @@ -89,13 +76,12 @@ def test_resolve_walks_transitively_through_headers(self) -> None: unrelated_h = os.path.join(build_dir, "other_group", "__native_other.h") with open(caller_h, "w") as f: - # lib-rt headers don't exist on disk under build/, so they - # get dropped during resolution and aren't recursed into. + # Headers outside build/ (CPython's , lib-rt's ) don't resolve under + # target_dir, so they get dropped during resolution and aren't recursed into. f.write("#include \n#include \n") with open(internal_h, "w") as f: - # The smoking gun: this header includes a header in another - # group via angle brackets. Pre-fix, this dep was invisible - # to setuptools. + # This header includes a header in another group via angle brackets. Pre-fix, this dep + # was invisible to setuptools. f.write( "#include \n" '#include "__native_caller.h"\n' @@ -104,15 +90,12 @@ def test_resolve_walks_transitively_through_headers(self) -> None: with open(cross_group_h, "w") as f: f.write("struct export_table_lib___functions { int x; };\n") with open(unrelated_h, "w") as f: - # Sibling group not reached from caller's chain — must - # NOT appear in the resolved set. + # Sibling group not reached from caller's chain => must NOT appear in the resolved set. f.write("struct unrelated { int x; };\n") - # caller.c is in build_dir, so its includer-dir is build_dir. - # Both directly-included headers are quoted (`False`); the - # cross-group header that __native_internal_caller.h reaches - # via `` is found by the recursive - # walk re-reading the on-disk header. + # caller.c is in build_dir, so its includer-dir is build_dir. Both directly-included headers + # are quoted (`False`); the cross-group header that __native_internal_caller.h reaches via + # `` is found by the recursive walk re-reading the on-disk header. deps = resolve_cfile_deps( cfile_dir=build_dir, direct_includes=[ @@ -128,11 +111,9 @@ def test_resolve_walks_transitively_through_headers(self) -> None: ) def test_resolve_drops_unresolvable_includes(self) -> None: - # ``, ``, etc. don't live under target_dir, so - # they're dropped from depends. They never change between builds, - # so this is the right behavior — and crucially it stops - # setuptools' `missing="newer"` from treating them as always-newer - # and force-rebuilding every translation unit. + # ``, ``, etc. don't live under target_dir, so they're dropped from depends. They + # never change between builds, so this is the right behavior. Crucially, it stops setuptools' + # `missing="newer"` from treating them as always-newer and force-rebuilding every translation unit. with tempfile.TemporaryDirectory() as tmp: cfile_dir = tmp deps = resolve_cfile_deps( @@ -143,11 +124,9 @@ def test_resolve_drops_unresolvable_includes(self) -> None: assert deps == set() def test_resolve_search_order_matches_preprocessor(self) -> None: - # When the same header name exists both next to the includer and - # under target_dir, the C preprocessor picks the includer-dir copy - # for `#include "shared.h"` and the target_dir copy for - # `#include `. The resolver must record the same path - # the compiler will actually consume, otherwise mtimes of the + # When the same header name exists both next to the includer and under target_dir, the C preprocessor + # picks the includer-dir copy for `#include "shared.h"` and the target_dir copy for `#include `. + # The resolver must record the same path the compiler will actually consume, otherwise mtimes of the # wrong file drive incremental rebuild decisions. with tempfile.TemporaryDirectory() as tmp: includer = os.path.join(tmp, "groupA")