diff --git a/DESIGN.md b/DESIGN.md index 4c71bb9..b991684 100644 --- a/DESIGN.md +++ b/DESIGN.md @@ -884,11 +884,12 @@ v8 introduces **instruction-based command storage** for significant space saving │ content_hash: [u8; 32] │ │ (id computed from array index) │ ├─────────────────────────────────────┤ -│ CommandEntry[] (16 bytes each) │ +│ CommandEntry[] (48 bytes each) │ │ dir_id: u32 │ │ cmd_offset: u32 │ ← Template string with %f/%o patterns (v8) │ display_offset: u32 │ │ env_offset: u32 │ +│ identity: [u8; 32] │ ← Structural identity hash (v11) │ (id = index | 0x80000000) │ ├─────────────────────────────────────┤ │ Edge[] (16 bytes each) │ @@ -929,6 +930,10 @@ Version history: - v6: Compact format: 32-bit IDs/offsets, length-prefixed strings - v7: Tagged ID spaces (files vs commands), ID computed from array index - v8: Instruction-based command storage with operand sections +- v9: Stat cache — mtime_ns per file, save_time_ns header for racy-clean detection +- v10: Removed unused group_cmd_id field from edges +- v11: Per-command structural identity hash (command text + values of vars it depends + on); change detection keys on identity, not the rendered command string Design principles: - Fixed-size entries for O(1) random access @@ -1761,6 +1766,32 @@ Unlike some build systems that filter out system headers: - **Reproducibility** - Same source + same headers = same build - **Minimal overhead** - Header paths stored once per command, hash computed only when mtime differs +### Why Structural Command Identity? + +A command must re-run when its *effective definition* changes. Earlier, a command's +identity was its fully-expanded command string, and change detection asked "is this +exact string in the previous index?" That conflates two distinct questions: + +- **Stable key** — "is there still a command that produces this output?" (used to detect + stale outputs from removed rules) +- **Change signal** — "has this command's definition changed, so it must re-run?" + +The rendered string answers the first but is a *lossy* proxy for the second: any input +that affects the output without appearing in the text is invisible. The canonical case +is an exported environment variable the subprocess reads as `$VAR` — the command line is +byte-identical across values, so a string-keyed detector never re-runs it (stale output, +"Nothing to do"). The same hole applies to a config var that gates an `export`. + +v11 separates the two. Each command carries a **structural identity** — a SHA-256 over the +expanded command text *plus* the values (content hashes) of every variable it depends on, +collected from its `Sticky` edges. Exported vars now contribute a sticky edge, so their +values fold into the identity even when they never appear in the text. Change detection +keys on identity (env/config change → identity changes → rebuild); stale-output detection +keeps using the rendered string (the structural-shape key, correctly env-insensitive so an +env change rebuilds in place rather than deleting and recreating). Two builds yield the +same identity iff the command's effective definition is unchanged, which also makes it a +sound cross-build join key — unlike a positional NodeId or a lossy string. + ### Why Preserve Edges During Ghost→Generated Upgrade? When a Ghost node is upgraded to Generated (because a rule now declares it as output), putup preserves all existing dependency edges rather than deleting them: diff --git a/include/pup/graph/dag.hpp b/include/pup/graph/dag.hpp index b4cec66..792bf42 100644 --- a/include/pup/graph/dag.hpp +++ b/include/pup/graph/dag.hpp @@ -376,6 +376,16 @@ auto expand_instruction(Graph const& graph, NodeId cmd_id) -> StringId; /// Must be called after all commands have their operands set (post-parsing). auto build_command_index(Graph& graph, PathCache& cache) -> void; +/// Compute a command's structural identity: a content hash that determines whether +/// the command must re-run. It folds the fully-expanded command text together with +/// the values (content hashes) of every Variable node the command depends on via a +/// Sticky edge — capturing config/env vars that affect the output without appearing +/// in the rendered text (e.g. an exported env var the subprocess reads via $VAR). +/// Two builds yield the same identity iff the command's effective definition is the +/// same, so it is the correct cross-build join key (unlike the rendered string). +[[nodiscard]] +auto compute_command_identity(Graph const& graph, NodeId cmd_id, PathCache& cache) -> Hash256; + /// Set the build root name (relative path from source root to build root) /// For in-tree builds, this should be empty. For variant builds, e.g. "build". auto set_build_root_name(Graph& graph, std::string_view name) -> void; diff --git a/include/pup/index/entry.hpp b/include/pup/index/entry.hpp index 453fb4f..6bc9f0d 100644 --- a/include/pup/index/entry.hpp +++ b/include/pup/index/entry.hpp @@ -56,6 +56,8 @@ struct CommandEntry { StringId display = StringId::Empty; ///< Display text (from ^ ^ markers) StringId env = StringId::Empty; ///< Environment variables + Hash256 identity = {}; ///< Structural identity: command text + values of vars it depends on + Vec inputs = {}; ///< Input file operands (for %f expansion) Vec outputs = {}; ///< Output file operands (for %o expansion) diff --git a/include/pup/index/format.hpp b/include/pup/index/format.hpp index 4102321..1416443 100644 --- a/include/pup/index/format.hpp +++ b/include/pup/index/format.hpp @@ -25,7 +25,9 @@ inline constexpr auto INDEX_MAGIC = std::array { 'P', 'U', 'P', 'I' }; /// 8 - Template deduplication: commands store template + operands, reconstruct lazily /// 9 - Stat cache: mtime_ns in file entries, save_time_ns in header for racy-clean detection /// 10 - Remove unused group_cmd_id field from edges (reserved bytes expanded) -inline constexpr auto INDEX_VERSION = std::uint32_t { 10 }; +/// 11 - Command structural identity: per-command hash folding command text + the values +/// of vars it depends on (sticky/exported), replacing rendered-string identity +inline constexpr auto INDEX_VERSION = std::uint32_t { 11 }; /// Index file header (56 bytes) - v9 struct alignas(8) RawHeader { @@ -73,9 +75,11 @@ struct alignas(8) RawCommandEntry { std::uint32_t cmd_offset = 0; ///< Offset to template string with %f/%o patterns (v8) std::uint32_t display_offset = 0; ///< Display text offset (length-prefixed) std::uint32_t env_offset = 0; ///< Environment variables offset (length-prefixed) + Hash256 identity = {}; ///< Structural identity (v11): SHA-256 over command text + ///< plus the values of vars it depends on (sticky/exported) }; -static_assert(sizeof(RawCommandEntry) == 16, "RawCommandEntry must be 16 bytes"); +static_assert(sizeof(RawCommandEntry) == 48, "RawCommandEntry must be 48 bytes"); /// Raw edge entry (16 bytes) /// Represents dependencies between nodes diff --git a/src/cli/cmd_build.cpp b/src/cli/cmd_build.cpp index 10025c9..164026f 100644 --- a/src/cli/cmd_build.cpp +++ b/src/cli/cmd_build.cpp @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -586,6 +587,7 @@ auto serialize_command_nodes( .instruction_pattern = pup::graph::get(g, id), .display = pup::graph::get(g, id), .env = pup::StringId::Empty, + .identity = pup::graph::compute_command_identity(g, id, state.path_cache), .inputs = std::move(inputs), .outputs = std::move(outputs), }; @@ -673,12 +675,39 @@ auto preserve_old_implicit_edges( commands_with_new_deps.set(cmd_id, 1); } + // Map a command's structural identity -> its id in the new index. Command ids are + // positional and shift across builds (e.g. when an earlier-created command is + // removed), so the old edge's `to` id cannot be trusted to mean the same command. + // Identity is stable, so we re-resolve each carried edge's command through it. + auto hash_less = [](pup::Hash256 const& a, pup::Hash256 const& b) { + return std::memcmp(a.data(), b.data(), a.size()) < 0; + }; + auto identity_to_new_id = pup::Vec> {}; + identity_to_new_id.reserve(ctx.index.commands().size()); + for (auto const& cmd : ctx.index.commands()) { + identity_to_new_id.emplace_back(cmd.identity, cmd.id); + } + std::sort(identity_to_new_id.begin(), identity_to_new_id.end(), [&](auto const& a, auto const& b) { return hash_less(a.first, b.first); }); + for (auto const& edge : old_index.edges()) { if (edge.type != pup::LinkType::Implicit) { continue; } - if (commands_with_new_deps.contains(edge.to)) { + // Re-resolve the old command to its identity-matched counterpart in the new + // index. If it's gone or its definition changed (no identity match), drop the + // edge: the command either no longer exists or rebuilt and rediscovered its deps. + auto const* old_cmd = old_index.find_command_by_id(edge.to); + if (!old_cmd) { + continue; + } + auto match = std::lower_bound(identity_to_new_id.begin(), identity_to_new_id.end(), old_cmd->identity, [&](auto const& p, pup::Hash256 const& key) { return hash_less(p.first, key); }); + if (match == identity_to_new_id.end() || match->first != old_cmd->identity) { + continue; + } + auto new_to_id = match->second; + + if (commands_with_new_deps.contains(new_to_id)) { continue; } @@ -694,10 +723,10 @@ auto preserve_old_implicit_edges( ? new_file_it->second : create_implicit_file(ctx, abs_path, old_file_path); - if (edge_pair_insert(ctx.added_edges, new_from_id, edge.to)) { + if (edge_pair_insert(ctx.added_edges, new_from_id, new_to_id)) { ctx.index.add_edge(pup::index::EdgeEntry { .from = new_from_id, - .to = edge.to, + .to = new_to_id, .type = pup::LinkType::Implicit, }); } @@ -862,13 +891,27 @@ auto detect_new_commands( { auto const& g = state.graph; auto changed = pup::Vec {}; + + // Identity-keyed membership: a command must (re)build when its structural identity + // is absent from the previous index. Identity folds command text together with the + // values of the vars it depends on, so a change invisible to the rendered string + // (e.g. an exported env var the subprocess reads via $VAR) still flips the identity. + auto hash_less = [](pup::Hash256 const& a, pup::Hash256 const& b) { + return std::memcmp(a.data(), b.data(), a.size()) < 0; + }; + auto old_identities = pup::Vec {}; + old_identities.reserve(idx.commands().size()); + for (auto const& cmd : idx.commands()) { + old_identities.push_back(cmd.identity); + } + std::sort(old_identities.begin(), old_identities.end(), hash_less); + for (auto id : pup::graph::all_nodes(g)) { if (!pup::node_id::is_command(id)) { continue; } - auto cmd_str_id = pup::graph::expand_instruction(g, id, state.path_cache); - auto found = idx.find_command_by_command(pup::global_pool().get(cmd_str_id)); - if (!found) { + auto identity = pup::graph::compute_command_identity(g, id, state.path_cache); + if (!std::binary_search(old_identities.begin(), old_identities.end(), identity, hash_less)) { for (auto output_id : pup::graph::get_outputs(g, id)) { auto output_path_sv = pup::graph::get_full_path(g, output_id, state.path_cache); if (!output_path_sv.empty()) { diff --git a/src/graph/builder.cpp b/src/graph/builder.cpp index 6f7d43b..a2a3b2b 100644 --- a/src/graph/builder.cpp +++ b/src/graph/builder.cpp @@ -522,6 +522,20 @@ auto create_command_node( } } + // Add sticky edges from exported env variables. An exported var reaches the + // command's subprocess environment (read as bare $VAR) and so affects its output + // even when it never appears in the command text. Recording it as a sticky edge + // makes that dependency explicit and folds its value into the command identity, + // so a change to the var triggers a rebuild. (Only imported vars have a value + // node to depend on; export of an untracked env var remains out of the model.) + auto const* exv = ctx.exported_vars.data(); + for (std::size_t i = 0, n = ctx.exported_vars.size(); i < n; ++i) { + auto const* node_id = state.imported_env_var_nodes.find(exv[i]); + if (node_id) { + (void)add_edge(ctx.state->graph, *node_id, cmd_id, LinkType::Sticky); + } + } + return cmd_id; } diff --git a/src/graph/dag.cpp b/src/graph/dag.cpp index c1a200d..728ccbd 100644 --- a/src/graph/dag.cpp +++ b/src/graph/dag.cpp @@ -5,6 +5,7 @@ #include "pup/core/buf.hpp" #include "pup/core/global_pool.hpp" +#include "pup/core/hash.hpp" #include "pup/core/metrics.hpp" #include "pup/core/path_utils.hpp" @@ -21,6 +22,7 @@ #include #include #include +#include #include #include @@ -1059,6 +1061,42 @@ auto build_command_index(Graph& graph, PathCache& cache) -> void } } +auto compute_command_identity(Graph const& graph, NodeId cmd_id, PathCache& cache) -> Hash256 +{ + auto& pool = global_pool(); + auto state = sha256_init(); + + // Base: the fully-expanded command text (instruction + operand paths + in-text vars). + state = sha256_update(state, pool.get(expand_instruction(graph, cmd_id, cache))); + + // Fold in (name, value-hash) of each Variable node reached via a Sticky edge. + // This captures vars that affect output without appearing in the rendered text — + // exported env vars the subprocess reads as $VAR, config vars gating an export, etc. + // Sorted and deduped by name so identity is independent of edge insertion order and + // tolerant of duplicate sticky edges (the graph permits them). + auto vars = Vec> {}; + for (auto var_id : edges_where(graph, cmd_id, EdgeDirection::Backward, edge_mask::sticky)) { + if (get(graph, var_id) != NodeType::Variable) { + continue; + } + vars.emplace_back(pool.get(get(graph, var_id)), get(graph, var_id)); + } + std::sort(vars.begin(), vars.end(), [](auto const& a, auto const& b) { return a.first < b.first; }); + vars.erase( + std::unique(vars.begin(), vars.end(), [](auto const& a, auto const& b) { return a.first == b.first; }), + vars.end() + ); + + auto constexpr SEP = std::byte { 0 }; + for (auto const& [name, value_hash] : vars) { + state = sha256_update(state, std::span { &SEP, 1 }); + state = sha256_update(state, name); + state = sha256_update(state, std::span { value_hash.data(), value_hash.size() }); + } + + return sha256_finalize(state); +} + // ============================================================================= // BuildGraph free functions // ============================================================================= diff --git a/src/index/entry.cpp b/src/index/entry.cpp index ef55bba..3348ac7 100644 --- a/src/index/entry.cpp +++ b/src/index/entry.cpp @@ -66,6 +66,7 @@ auto CommandEntry::to_raw( raw.cmd_offset = instruction_offset; raw.display_offset = display_offset; raw.env_offset = env_offset; + raw.identity = identity; return raw; } @@ -85,6 +86,7 @@ auto CommandEntry::from_raw( .instruction_pattern = global_pool().intern(instruction_pattern), .display = global_pool().intern(display_str), .env = global_pool().intern(env_str), + .identity = raw.identity, .inputs = std::move(inputs), .outputs = std::move(outputs), }; diff --git a/test/e2e/fixtures/env_dep_subprocess/Tupfile.fixture b/test/e2e/fixtures/env_dep_subprocess/Tupfile.fixture new file mode 100644 index 0000000..e447916 --- /dev/null +++ b/test/e2e/fixtures/env_dep_subprocess/Tupfile.fixture @@ -0,0 +1,3 @@ +import MY_GREETING +export MY_GREETING +: |> echo $MY_GREETING > %o |> out.txt diff --git a/test/e2e/fixtures/implicit_dep_idshift/Tupfile.fixture b/test/e2e/fixtures/implicit_dep_idshift/Tupfile.fixture new file mode 100644 index 0000000..0d907e8 --- /dev/null +++ b/test/e2e/fixtures/implicit_dep_idshift/Tupfile.fixture @@ -0,0 +1,4 @@ +# Glob foreach so adding/removing a source changes the command set without a Tupfile +# edit. m_a.o depends on m_a.h (discovered implicitly). +: foreach *.c |> gcc -c %f -o %o |> %B.o {objs} +: {objs} |> gcc %f -o %o |> program diff --git a/test/e2e/fixtures/implicit_dep_idshift/m_a.c b/test/e2e/fixtures/implicit_dep_idshift/m_a.c new file mode 100644 index 0000000..66ab2b8 --- /dev/null +++ b/test/e2e/fixtures/implicit_dep_idshift/m_a.c @@ -0,0 +1,2 @@ +#include "m_a.h" +int aval(void) { return A_VERSION; } diff --git a/test/e2e/fixtures/implicit_dep_idshift/m_a.h b/test/e2e/fixtures/implicit_dep_idshift/m_a.h new file mode 100644 index 0000000..0e5af60 --- /dev/null +++ b/test/e2e/fixtures/implicit_dep_idshift/m_a.h @@ -0,0 +1 @@ +#define A_VERSION 1 diff --git a/test/e2e/fixtures/implicit_dep_idshift/m_b.c b/test/e2e/fixtures/implicit_dep_idshift/m_b.c new file mode 100644 index 0000000..2a97e9e --- /dev/null +++ b/test/e2e/fixtures/implicit_dep_idshift/m_b.c @@ -0,0 +1,3 @@ +// Removable unit with no header. Its command is created before m_a's, so removing +// it shifts m_a's command id downward — exercising the id-keyed carry-forward path. +int unused_b(void) { return 0; } diff --git a/test/e2e/fixtures/implicit_dep_idshift/main.c b/test/e2e/fixtures/implicit_dep_idshift/main.c new file mode 100644 index 0000000..dd9aca4 --- /dev/null +++ b/test/e2e/fixtures/implicit_dep_idshift/main.c @@ -0,0 +1,7 @@ +#include +int aval(void); +int main(void) +{ + printf("A%d\n", aval()); + return 0; +} diff --git a/test/unit/test_e2e.cpp b/test/unit/test_e2e.cpp index d8af037..8360eda 100644 --- a/test/unit/test_e2e.cpp +++ b/test/unit/test_e2e.cpp @@ -1012,6 +1012,48 @@ SCENARIO("Implicit dependencies track header changes", "[e2e][incremental]") } } +SCENARIO("Implicit deps survive command-id shift from a removed source", "[e2e][incremental][idshift]") +{ + // Implicit (header→command) edges discovered last build are carried forward for + // commands that don't rebuild. If that carry-forward keys on the command's array + // position (NodeId), removing a glob-matched source whose command was created + // earlier shifts every later command's id down — and the carried edge gets + // misattributed to whatever command now occupies the old id. A later edit to the + // header then rebuilds the wrong unit and leaves the real output stale. The + // carry-forward must key on the command's structural identity, stable across shifts. + auto env = EnvGuard { "PUP_IMPLICIT_DEPS", "1" }; + + GIVEN("a project where m_a.o has a discovered header dep and m_b.o is a removable unit") + { + auto f = E2EFixture { "implicit_dep_idshift" }; + REQUIRE(f.init().success()); + REQUIRE(f.build().success()); + REQUIRE(f.run("program").stdout_output == "A1\n"); + REQUIRE(f.build().is_noop()); + + WHEN("the earlier-created unit is removed (shifting m_a's command id down), then m_a.h changes") + { + // No Tupfile edit: the glob drops m_b.c. m_a.c is not recompiled this build, + // so its m_a.h dependency must be carried forward — at the new command id. + f.remove_file("m_b.c"); + REQUIRE(f.build().success()); + REQUIRE(f.run("program").stdout_output == "A1\n"); + + f.write_file("m_a.h", "#define A_VERSION 2\n"); + auto result = f.build(); + + THEN("m_a.o rebuilds from its header dependency (no misattribution)") + { + INFO("stdout: " << result.stdout_output); + INFO("stderr: " << result.stderr_output); + REQUIRE(result.success()); + REQUIRE_FALSE(result.is_noop()); + REQUIRE(f.run("program").stdout_output == "A2\n"); + } + } + } +} + SCENARIO("New source file triggers rebuild", "[e2e][incremental]") { GIVEN("a project with one source file using foreach glob") @@ -3976,6 +4018,41 @@ SCENARIO("Imported env vars with equals in value", "[e2e][import]") } } +SCENARIO("Exported env var consumed via subprocess environment triggers rebuild", "[e2e][import][envdep]") +{ + // The command consumes MY_GREETING through the inherited environment (bare $VAR + // in the shell), NOT via $(VAR) substitution. So the rendered command string is + // byte-identical across values: a string-keyed change detector cannot see the + // change. Correctness requires the command's identity to fold in the values of + // the vars it depends on (here, the exported MY_GREETING). + GIVEN("a project already built with an exported env var the shell reads from the environment") + { + auto f = E2EFixture { "env_dep_subprocess" }; + { + auto env = EnvGuard { "MY_GREETING", "hello" }; + REQUIRE(f.init().success()); + REQUIRE(f.build().success()); + REQUIRE(f.read_file("out.txt") == "hello\n"); + REQUIRE(f.build().is_noop()); // stable under no change + } + + WHEN("the exported env var changes (command text stays byte-identical)") + { + auto env = EnvGuard { "MY_GREETING", "world" }; + auto result = f.build(); + + THEN("rebuild is triggered and the output reflects the new value") + { + INFO("stdout: " << result.stdout_output); + INFO("stderr: " << result.stderr_output); + REQUIRE(result.success()); + REQUIRE_FALSE(result.is_noop()); + REQUIRE(f.read_file("out.txt") == "world\n"); + } + } + } +} + SCENARIO("Only commands using changed env var rebuild", "[e2e][import]") { GIVEN("a project already built with two env vars") diff --git a/test/unit/test_index.cpp b/test/unit/test_index.cpp index ef65e37..0990354 100644 --- a/test/unit/test_index.cpp +++ b/test/unit/test_index.cpp @@ -68,9 +68,9 @@ TEST_CASE("Index format struct sizes", "[index]") REQUIRE(sizeof(RawFileEntry) == 64); } - SECTION("RawCommandEntry is 16 bytes") + SECTION("RawCommandEntry is 48 bytes (v11: + identity hash)") { - REQUIRE(sizeof(RawCommandEntry) == 16); + REQUIRE(sizeof(RawCommandEntry) == 48); } SECTION("RawEdge is 16 bytes") @@ -150,12 +150,17 @@ TEST_CASE("FileEntry conversion", "[index]") TEST_CASE("CommandEntry conversion", "[index]") { + auto identity = pup::Hash256 {}; + identity[0] = std::byte { 0xAB }; + identity[31] = std::byte { 0xCD }; + auto cmd = CommandEntry { .id = node_id::make_command(5), .dir_id = 5, .instruction_pattern = intern("gcc -c %f -o %o"), .display = intern("CC main.c"), .env = intern("CC=gcc"), + .identity = identity, .inputs = { 10 }, .outputs = { 20 }, }; @@ -166,6 +171,7 @@ TEST_CASE("CommandEntry conversion", "[index]") REQUIRE(raw.cmd_offset == 0); REQUIRE(raw.display_offset == 50); REQUIRE(raw.env_offset == 100); + REQUIRE(raw.identity == identity); // ID is computed from array index (4 + 1 = 5, then node_id::make_command) auto& pool = global_pool(); @@ -179,6 +185,7 @@ TEST_CASE("CommandEntry conversion", "[index]") REQUIRE(restored.instruction_pattern == cmd.instruction_pattern); REQUIRE(restored.display == cmd.display); REQUIRE(restored.env == cmd.env); + REQUIRE(restored.identity == identity); REQUIRE(restored.inputs == cmd.inputs); REQUIRE(restored.outputs == cmd.outputs); } @@ -359,13 +366,17 @@ TEST_CASE("Index serialization roundtrip", "[e2e][index]") .size = 8192, }); - // Command 1 (v8: template + operands) + // Command 1 (v8: template + operands; v11: + identity hash) + auto cmd_identity = pup::Hash256 {}; + cmd_identity[0] = std::byte { 0x11 }; + cmd_identity[31] = std::byte { 0x99 }; index.add_command(CommandEntry { .id = cmd_id, .dir_id = 0, .instruction_pattern = intern("g++ -c %f -o %o"), .display = intern("CXX main.cpp"), .env = {}, + .identity = cmd_identity, .inputs = { 3 }, // main.cpp .outputs = { 4 }, // main.o }); @@ -433,6 +444,7 @@ TEST_CASE("Index serialization roundtrip", "[e2e][index]") REQUIRE(cmd != nullptr); REQUIRE(cmd->instruction_pattern == intern("g++ -c %f -o %o")); REQUIRE(cmd->display == intern("CXX main.cpp")); + REQUIRE(cmd->identity == cmd_identity); REQUIRE(cmd->inputs == pup::Vec { 3 }); REQUIRE(cmd->outputs == pup::Vec { 4 });