Add the infrastructure to run the plan on Rust

2026-06-05 11:20:50 +02:00 · 2026-06-05 11:20:50 +02:00 · 510662e7c9
commit 510662e7c9
parent 693151b89e
13 changed files with 1006 additions and 10 deletions
--- a/.gitignore
+++ b/.gitignore
@ -81,3 +81,4 @@ tarpaulin-report.html
 .claude/
 .codex
 .agents/
 dist-newstyle/
--- a/.gitmodules
+++ b/.gitmodules
@ -5,4 +5,4 @@
 [submodule "external/geolog"]
 	path = external/geolog
 	url = gitlab@git.sgai.uk:creators/geolog.git
-	branch = query-plan-ir-draft-1
+	branch = query-plan-algebraic
--- a/17
+++ b/17
@ -76,6 +76,23 @@ clean: ## Remove build output
 		cargo clean; \
 	fi
 EXPORTER_DIR := tools/exporter
 EXPORTER_FIXTURES := crates/glog-runner/fixtures
 EXPORTER_SCENARIOS := three-atom-chain
 .PHONY: export-fixtures
 export-fixtures: ## Regenerate JSON plan fixtures from the Haskell exporter (needs Cabal and GHC; use `make shell` first).
 	@if ! command -v cabal >/dev/null 2>&1; then \
 		echo "cabal not found. Enter the dev shell with 'make shell' (or 'nix develop') first."; \
 		exit 1; \
 	fi
 	@cd $(EXPORTER_DIR) && cabal build glog-export
 	@for sc in $(EXPORTER_SCENARIOS); do \
 		out=$(EXPORTER_FIXTURES)/$$(echo $$sc | tr '-' '_').json; \
 		echo "exporting $$sc -> $$out"; \
 		(cd $(EXPORTER_DIR) && cabal run -v0 glog-export -- $$sc) > $$out; \
 	done
 .PHONY: shell
 shell: ## Enter the Nix dev shell defined in flake.nix
 	@nix develop
--- a/crates/glog-runner/Cargo.toml
+++ b/crates/glog-runner/Cargo.toml
@ -0,0 +1,19 @@
 [package]
 name = "glog-runner"
 version = "0.1.0"
 edition.workspace = true
 license.workspace = true
 rust-version.workspace = true
 [lints]
 workspace = true
 [dependencies]
 storage = { path = "../storage" }
 query-ops = { path = "../query-ops" }
 serde = { version = "1", features = ["derive"] }
 serde_json = "1"
 [[bin]]
 name = "glog-run"
 path = "src/main.rs"
--- a/crates/glog-runner/fixtures/three_atom_chain.json
+++ b/crates/glog-runner/fixtures/three_atom_chain.json
@ -0,0 +1,166 @@
 {
    "_scenario": "three-atom-chain",
    "facts": {
        "edge": [
            [
                {
                    "str": "node:1"
                },
                {
                    "str": "node:2"
                },
                {
                    "str": "edge:1"
                }
            ],
            [
                {
                    "str": "node:2"
                },
                {
                    "str": "node:3"
                },
                {
                    "str": "edge:2"
                }
            ]
        ],
        "node": [
            [
                {
                    "str": "node:1"
                }
            ],
            [
                {
                    "str": "node:2"
                }
            ],
            [
                {
                    "str": "node:3"
                }
            ]
        ]
    },
    "query": {
        "nodes": [
            {
                "action": {
                    "scan": {
                        "columns": [
                            {
                                "var": "a"
                            },
                            {
                                "var": "b"
                            },
                            {
                                "var": "_w0_2"
                            }
                        ],
                        "table": "edge"
                    }
                },
                "id": 1
            },
            {
                "action": {
                    "scan": {
                        "columns": [
                            {
                                "var": "b"
                            },
                            {
                                "var": "c"
                            },
                            {
                                "var": "_w1_2"
                            }
                        ],
                        "table": "edge"
                    }
                },
                "id": 2
            },
            {
                "action": {
                    "scan": {
                        "columns": [
                            {
                                "var": "a"
                            }
                        ],
                        "table": "node"
                    }
                },
                "id": 3
            },
            {
                "action": {
                    "join": {
                        "left": 1,
                        "op": "left",
                        "right": 3
                    }
                },
                "id": 4
            },
            {
                "action": {
                    "join": {
                        "left": 2,
                        "op": "left",
                        "right": 4
                    }
                },
                "id": 5
            },
            {
                "action": {
                    "join": {
                        "left": 5,
                        "op": "right",
                        "right": 4
                    }
                },
                "id": 6
            },
            {
                "action": {
                    "join": {
                        "left": 6,
                        "op": "right",
                        "right": 3
                    }
                },
                "id": 7
            },
            {
                "action": {
                    "join": {
                        "left": 6,
                        "op": "natural",
                        "right": 7
                    }
                },
                "id": 8
            },
            {
                "action": {
                    "join": {
                        "left": 5,
                        "op": "natural",
                        "right": 8
                    }
                },
                "id": 9
            }
        ],
        "root": 9
    },
    "schema": {
        "edge": 3,
        "node": 1
    }
 }
--- a/crates/glog-runner/src/lib.rs
+++ b/crates/glog-runner/src/lib.rs
@ -0,0 +1,344 @@
 //! End-to-end runner that executes a `geolog-lang` conjunctive-query plan
 //! against this workspace's storage and `query-ops` operators.
 //!
 //! The upstream Haskell planner in `external/geolog/geolog-lang`
 //! (`Geolog.DB.Plan`) builds a Yannakakis-style join DAG over `QAtom`s. This
 //! crate accepts that DAG as JSON, materializes the input relations through
 //! the [`Storage`] trait, and walks the DAG using
 //! [`query_ops::atom::scan_atom`], [`query_ops::join::semijoin`], and
 //! [`query_ops::join::natural_join`]. The result is a binding
 //! [`Relation`](query_ops::relation::Relation) over the query's variables.
 //!
 //! The JSON IR mirrors `Geolog.DB.Plan.JoinPlan` and `Geolog.DB.InMemory.QAtom`
 //! without depending on the Haskell side at build time. A Haskell exporter
 //! that dumps `(schema, facts, JoinPlan)` to this shape is the planned
 //! follow-up that completes the round trip; the IR is the contract.
 //!
 //! Mapping from the Haskell planner:
 //!
 //! | `Geolog.DB.Plan`            | this crate                                    |
 //! |-----------------------------|-----------------------------------------------|
 //! | `PlanEvalAtom`              | [`Action::Scan`] → `scan_atom`                |
 //! | `PlanJoin LeftJoin a b`     | [`Action::Join`] with [`JoinOp::Left`] → `semijoin(rel[a], rel[b])` |
 //! | `PlanJoin RightJoin a b`    | [`Action::Join`] with [`JoinOp::Right`] → `semijoin(rel[b], rel[a])` |
 //! | `PlanJoin NaturalJoin a b`  | [`Action::Join`] with [`JoinOp::Natural`] → `natural_join(rel[a], rel[b])` |
 //!
 //! The atom side covers `evalAtom` (`Geolog.DB.InMemory`): a [`Term::Var`]
 //! repeated across positions enforces equality, [`Term::Lit`] filters by
 //! constant, and distinct variables project in first-occurrence order.
 use std::collections::HashMap;
 use serde::Deserialize;
 use query_ops::atom::{AtomPattern, Term, scan_atom};
 use query_ops::join::{natural_join, semijoin};
 use query_ops::relation::Relation;
 use storage::value::Value;
 use storage::{MemoryStorage, Storage, StorageError, scan_as_table};
 /// A single fixture: schema, ground facts, and a query plan to execute.
 #[derive(Debug, Clone, Deserialize)]
 pub struct Plan {
    /// Relation name → arity (column count).
    pub schema: HashMap<String, usize>,
    /// Relation name → list of ground tuples to insert before execution.
    pub facts: HashMap<String, Vec<Vec<JsonValue>>>,
    /// The join DAG itself.
    pub query: Query,
 }
 /// Mirrors `Geolog.DB.Plan.JoinPlan`: a set of nodes plus the id of the
 /// rooted result node.
 #[derive(Debug, Clone, Deserialize)]
 pub struct Query {
    pub root: u32,
    pub nodes: Vec<Node>,
 }
 /// One node of the plan DAG. `id`s are dense within a `Query` but don't need
 /// to start at any particular value, mirroring the Haskell `PlanNodeId`.
 #[derive(Debug, Clone, Deserialize)]
 pub struct Node {
    pub id: u32,
    pub action: Action,
 }
 /// What to compute at a node. Tagged externally so JSON reads as
 /// `{"action": {"scan": {...}}}` or `{"action": {"join": {...}}}`.
 #[derive(Debug, Clone, Deserialize)]
 #[serde(rename_all = "snake_case")]
 pub enum Action {
    Scan(Atom),
    Join(Join),
 }
 /// A flat atom pattern, one entry per column of the target relation.
 /// Matches the `toFlatArgs` view used by `Geolog.DB.InMemory.evalAtom`:
 /// `qaValues` positions are filled in directly, and the entity-id column
 /// (if any) appears at the last position. Wildcard positions in the
 /// Haskell `QAtom` (a `Map Int QVal` with a missing key) translate to a
 /// fresh, unique variable name on this side, which the operator binds but
 /// never joins against.
 #[derive(Debug, Clone, Deserialize)]
 pub struct Atom {
    pub table: String,
    pub columns: Vec<JsonTerm>,
 }
 #[derive(Debug, Clone, Deserialize)]
 #[serde(rename_all = "snake_case")]
 pub enum JsonTerm {
    Var(String),
    Lit(JsonValue),
 }
 /// Wire-level value tag. Restricted to what `storage::value::Value` carries.
 /// Entity identities from the Haskell side (`ValEntity path id`) round-trip
 /// through `Str` for now using a `"path:id"` convention; that's a fixture
 /// concern, not a runner concern.
 #[derive(Debug, Clone, Deserialize)]
 #[serde(rename_all = "snake_case")]
 pub enum JsonValue {
    Int(i64),
    Str(String),
 }
 #[derive(Debug, Clone, Deserialize)]
 pub struct Join {
    pub op: JoinOp,
    pub left: u32,
    pub right: u32,
 }
 #[derive(Debug, Clone, Copy, Deserialize)]
 #[serde(rename_all = "snake_case")]
 pub enum JoinOp {
    /// `Geolog.DB.Plan.LeftJoin`: result is `left` rows whose shared columns
    /// appear in `right`. Lowered to `semijoin(left, right)`.
    Left,
    /// `Geolog.DB.Plan.RightJoin`: result is `right` rows whose shared
    /// columns appear in `left`. Lowered to `semijoin(right, left)`.
    Right,
    /// `Geolog.DB.Plan.NaturalJoin`. Lowered to `natural_join(left, right)`.
    Natural,
 }
 /// Errors a runner can produce in addition to storage failures.
 #[derive(Debug)]
 pub enum RunError {
    /// A fact references a relation that isn't declared in `schema`.
    UnknownRelation(String),
    /// A node id appears in a `Join` action but no node with that id exists.
    MissingNode(u32),
    /// `Query.root` doesn't match any node in `nodes`.
    MissingRoot(u32),
    /// Two nodes share the same id.
    DuplicateNode(u32),
    /// A join node references its left or right side before that side has
    /// been computed: the DAG isn't actually topologically sorted by id, or
    /// it has a cycle.
    UnresolvedDependency { node: u32, depends_on: u32 },
    /// Storage layer rejected an operation.
    Storage(StorageError),
 }
 impl std::fmt::Display for RunError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::UnknownRelation(name) => {
                write!(f, "facts reference relation {name:?} not in schema")
            }
            Self::MissingNode(id) => write!(f, "plan references missing node id {id}"),
            Self::MissingRoot(id) => write!(f, "plan root id {id} matches no node"),
            Self::DuplicateNode(id) => write!(f, "duplicate node id {id} in plan"),
            Self::UnresolvedDependency { node, depends_on } => write!(
                f,
                "node {node} depends on {depends_on}, which has not been computed yet"
            ),
            Self::Storage(err) => write!(f, "storage error: {err}"),
        }
    }
 }
 impl std::error::Error for RunError {
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            Self::Storage(err) => Some(err),
            _ => None,
        }
    }
 }
 impl From<StorageError> for RunError {
    fn from(err: StorageError) -> Self {
        Self::Storage(err)
    }
 }
 impl From<JsonValue> for Value {
    fn from(jv: JsonValue) -> Self {
        match jv {
            JsonValue::Int(n) => Self::Int(n),
            JsonValue::Str(s) => Self::Str(s),
        }
    }
 }
 impl From<JsonTerm> for Term {
    fn from(t: JsonTerm) -> Self {
        match t {
            JsonTerm::Var(name) => Self::Var(name),
            JsonTerm::Lit(value) => Self::Lit(value.into()),
        }
    }
 }
 /// Parse a [`Plan`] from a JSON string.
 ///
 /// # Errors
 /// Returns a [`serde_json::Error`] if the input isn't valid JSON in the
 /// expected shape.
 pub fn parse_plan(json: &str) -> Result<Plan, serde_json::Error> {
    serde_json::from_str(json)
 }
 /// Load schema and facts from a [`Plan`] into a fresh [`MemoryStorage`].
 ///
 /// All facts are inserted in a single transaction; commit is atomic so a
 /// failure on row N leaves the storage empty.
 ///
 /// # Errors
 /// Returns [`RunError::UnknownRelation`] if facts mention a relation not
 /// declared in `schema`. Wraps storage failures (arity mismatch, transaction
 /// errors) in [`RunError::Storage`].
 pub fn load_into_memory(plan: &Plan) -> Result<MemoryStorage, RunError> {
    let mut storage = MemoryStorage::default();
    for (name, arity) in &plan.schema {
        storage.create_relation(name, *arity)?;
    }
    {
        let mut tx = storage.transaction()?;
        for (name, rows) in &plan.facts {
            if !plan.schema.contains_key(name) {
                return Err(RunError::UnknownRelation(name.clone()));
            }
            for row in rows {
                let cells: Vec<Value> = row.iter().cloned().map(Value::from).collect();
                tx.insert(name, cells)?;
            }
        }
        let _ = tx.commit()?;
    }
    Ok(storage)
 }
 /// Execute a plan against a storage backend, returning the bindings
 /// [`Relation`] for the rooted plan node.
 ///
 /// Nodes are executed in ascending `id` order. For a Yannakakis plan as
 /// emitted by `Geolog.DB.Plan` this is equivalent to a topological sort,
 /// since `insertJoin` only references node ids that have already been
 /// allocated. A non-monotone id ordering is rejected with
 /// [`RunError::UnresolvedDependency`].
 ///
 /// # Errors
 /// Returns [`RunError::DuplicateNode`] for repeated ids,
 /// [`RunError::MissingNode`] for join references to unknown ids,
 /// [`RunError::MissingRoot`] if `query.root` isn't present, and storage
 /// errors during the per-scan `scan_as_table` call.
 pub fn execute<S: Storage>(storage: &S, query: &Query) -> Result<Relation, RunError> {
    let mut seen_ids: std::collections::HashSet<u32> =
        std::collections::HashSet::with_capacity(query.nodes.len());
    for node in &query.nodes {
        if !seen_ids.insert(node.id) {
            return Err(RunError::DuplicateNode(node.id));
        }
    }
    if !seen_ids.contains(&query.root) {
        return Err(RunError::MissingRoot(query.root));
    }
    let mut ordered: Vec<&Node> = query.nodes.iter().collect();
    ordered.sort_by_key(|n| n.id);
    let mut results: HashMap<u32, Relation> = HashMap::with_capacity(ordered.len());
    for node in ordered {
        let computed = match &node.action {
            Action::Scan(atom) => {
                let table = scan_as_table(storage, &atom.table)?;
                let pattern = AtomPattern {
                    columns: atom.columns.iter().cloned().map(Term::from).collect(),
                };
                scan_atom(&table, &pattern)
            }
            Action::Join(join) => {
                let left = require_dep(&results, &seen_ids, node.id, join.left)?;
                let right = require_dep(&results, &seen_ids, node.id, join.right)?;
                match join.op {
                    JoinOp::Left => semijoin(left, right),
                    JoinOp::Right => semijoin(right, left),
                    JoinOp::Natural => natural_join(left, right),
                }
            }
        };
        results.insert(node.id, computed);
    }
    results
        .remove(&query.root)
        .ok_or(RunError::MissingRoot(query.root))
 }
 fn require_dep<'a>(
    results: &'a HashMap<u32, Relation>,
    seen: &std::collections::HashSet<u32>,
    node: u32,
    depends_on: u32,
 ) -> Result<&'a Relation, RunError> {
    if let Some(rel) = results.get(&depends_on) {
        Ok(rel)
    } else if seen.contains(&depends_on) {
        Err(RunError::UnresolvedDependency { node, depends_on })
    } else {
        Err(RunError::MissingNode(depends_on))
    }
 }
 /// Convenience: parse JSON, load it into a fresh in-memory storage, and
 /// execute, returning the root binding relation.
 ///
 /// # Errors
 /// Returns a JSON parse error if the input is malformed, or a [`RunError`]
 /// for any later step.
 pub fn run_json(json: &str) -> Result<Relation, RunFromJsonError> {
    let plan = parse_plan(json).map_err(RunFromJsonError::Parse)?;
    let storage = load_into_memory(&plan).map_err(RunFromJsonError::Run)?;
    let bindings = execute(&storage, &plan.query).map_err(RunFromJsonError::Run)?;
    Ok(bindings)
 }
 /// Combined error from [`run_json`].
 #[derive(Debug)]
 pub enum RunFromJsonError {
    Parse(serde_json::Error),
    Run(RunError),
 }
 impl std::fmt::Display for RunFromJsonError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::Parse(err) => write!(f, "parse error: {err}"),
            Self::Run(err) => write!(f, "run error: {err}"),
        }
    }
 }
 impl std::error::Error for RunFromJsonError {
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            Self::Parse(err) => Some(err),
            Self::Run(err) => Some(err),
        }
    }
 }
--- a/crates/glog-runner/src/main.rs
+++ b/crates/glog-runner/src/main.rs
@ -0,0 +1,59 @@
 //! `glog-run` CLI: read a JSON plan from a file (or stdin if `-`), execute
 //! it against a fresh in-memory store, and print the resulting binding
 //! relation as JSON on stdout.
 use std::io::{self, Read};
 use std::process::ExitCode;
 fn main() -> ExitCode {
    let mut args = std::env::args().skip(1);
    let Some(path) = args.next() else {
        eprintln!("usage: glog-run <plan.json | ->");
        return ExitCode::from(2);
    };
    let input = match read_input(&path) {
        Ok(s) => s,
        Err(err) => {
            eprintln!("failed to read {path}: {err}");
            return ExitCode::from(1);
        }
    };
    let relation = match glog_runner::run_json(&input) {
        Ok(r) => r,
        Err(err) => {
            eprintln!("{err}");
            return ExitCode::from(1);
        }
    };
    let payload = serde_json::json!({
        "columns": relation.columns,
        "rows": relation
            .rows
            .iter()
            .map(|row| row.iter().map(value_to_json).collect::<Vec<_>>())
            .collect::<Vec<_>>(),
    });
    println!("{payload}");
    ExitCode::SUCCESS
 }
 fn read_input(path: &str) -> io::Result<String> {
    if path == "-" {
        let mut buf = String::new();
        io::stdin().read_to_string(&mut buf)?;
        Ok(buf)
    } else {
        std::fs::read_to_string(path)
    }
 }
 fn value_to_json(value: &storage::value::Value) -> serde_json::Value {
    match value {
        storage::value::Value::Int(n) => serde_json::Value::Number((*n).into()),
        storage::value::Value::Str(s) => serde_json::Value::String(s.clone()),
        storage::value::Value::Id(id) => serde_json::Value::String(id.to_string()),
    }
 }
--- a/crates/glog-runner/tests/three_atom_chain.rs
+++ b/crates/glog-runner/tests/three_atom_chain.rs
@ -0,0 +1,73 @@
 //! End-to-end check: run the JSON fixture and verify the resulting bindings
 //! match the `DB.InMemoryTest` "matches evalConjunction on three-atom chain"
 //! case from `external/geolog/geolog-lang/test/DB/InMemoryTest.hs`.
 //!
 //! For `node = {e1, e2, e3}` and `edge = {(e1,e2,ee1), (e2,e3,ee2)}` the
 //! conjunction `node(a), edge(a, b, _), edge(b, c, _)` has exactly one
 //! solution: `(a=e1, b=e2, c=e3)`.
 use std::collections::BTreeMap;
 use glog_runner::run_json;
 use storage::value::Value;
 fn fixture() -> &'static str {
    include_str!("../fixtures/three_atom_chain.json")
 }
 fn ent(path: &str, id: u32) -> Value {
    Value::Str(format!("{path}:{id}"))
 }
 fn project<'a>(
    columns: &'a [String],
    row: &'a [Value],
    keep: &'a [&'a str],
 ) -> BTreeMap<&'a str, &'a Value> {
    keep.iter()
        .map(|name| {
            let pos = columns
                .iter()
                .position(|c| c == name)
                .expect("column missing");
            (*name, &row[pos])
        })
        .collect()
 }
 #[test]
 fn three_atom_chain_matches_haskell_oracle() {
    let result = run_json(fixture()).expect("fixture should execute");
    // The plan's root keeps every variable, including the per-atom wildcards
    // `_r1` and `_r2`. The oracle only asserts the (a, b, c) projection.
    let keep = ["a", "b", "c"];
    let mut projected: Vec<BTreeMap<&str, &Value>> = result
        .rows
        .iter()
        .map(|row| project(&result.columns, row, &keep))
        .collect();
    projected.sort_by_key(|m| format!("{m:?}"));
    let e1 = ent("node", 1);
    let e2 = ent("node", 2);
    let e3 = ent("node", 3);
    let expected = vec![BTreeMap::from([("a", &e1), ("b", &e2), ("c", &e3)])];
    assert_eq!(projected, expected);
 }
 #[test]
 fn root_columns_cover_a_b_c_plus_two_wildcards() {
    // The exporter emits unique wildcard variable names for the entity-id
    // column of each edge atom (e.g. `_w0_2`, `_w1_2`); their exact spelling
    // is an implementation detail of the exporter, so this test only checks
    // that the named variables are all present and that the total column
    // count is the three named ones plus two anonymous wildcards.
    let result = run_json(fixture()).expect("fixture should execute");
    let cols: std::collections::HashSet<&str> = result.columns.iter().map(String::as_str).collect();
    for expected in ["a", "b", "c"] {
        assert!(cols.contains(expected), "missing column {expected}");
    }
    assert_eq!(result.columns.len(), 5, "expected 3 named + 2 wildcards");
 }
--- a/external/geolog
+++ b/external/geolog
@ -1 +1 @@
-Subproject commit 99d4006f4655d8a6815a9156fe4d9304515f356d
+Subproject commit 426d4c96d6031ccaf5e14c12c3dab496e3b4c365
--- a/flake.nix
+++ b/flake.nix
@ -1,5 +1,5 @@
 {
-  description = "Storage engine playground: Rust workspace for FlowLog, DBSP, CRDT, and Geomerge experiments.";
+  description = "Storage engine playground";
  inputs = {
    nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
@ -29,12 +29,13 @@
          packages = [
            rustToolchain
            # Diagram regeneration in crates/geomerge-demo/docs/diagrams.
            pkgs.graphviz
            # Cargo helpers.
            pkgs.cargo-watch
            pkgs.cargo-nextest
-            # Pre-commit hooks (see .pre-commit-config.yaml, Makefile setup-hooks).
+            pkgs.haskell.compiler.ghc912
            pkgs.cabal-install
            pkgs.pkg-config
            pkgs.zlib
            pkgs.pre-commit
            pkgs.python3
          ];
@ -44,10 +45,14 @@
          };
          shellHook = ''
-            echo "storage-engine-playground dev shell"
+            # Banner goes to stderr so `nix develop --command` invocations
-            echo "  rustc: $(rustc --version)"
+            # that pipe stdout (e.g. tools/exporter producing JSON) stay clean.
-            echo "  cargo: $(cargo --version)"
+            >&2 echo "storage-engine-playground dev shell"
-            echo "  dot:   $(dot -V 2>&1)"
+            >&2 echo "  rustc: $(rustc --version)"
            >&2 echo "  cargo: $(cargo --version)"
            >&2 echo "  ghc:   $(ghc --version)"
            >&2 echo "  cabal: $(cabal --version | head -1)"
            >&2 echo "  dot:   $(dot -V 2>&1)"
          '';
        };
--- a/tools/exporter/cabal.project
+++ b/tools/exporter/cabal.project
@ -0,0 +1,24 @@
 -- cabal.project for the geolog -> Rust JSON exporter.
 --
 -- This file points at the geolog-lang library inside the external/geolog
 -- submodule, plus the sibling packages it depends on (data-partition,
 -- diagnostician, fnotation). It mirrors the submodule's own cabal.project
 -- so the exporter sees the same source set the submodule's tests build
 -- against.
 packages:
  glog-exporter.cabal
  ../../external/geolog/geolog-lang/geolog-lang.cabal
  ../../external/geolog/data-partition/data-partition.cabal
  ../../external/geolog/diagnostician/diagnostician.cabal
  ../../external/geolog/fnotation/fnotation.cabal
 -- geolog-lang's DB.Plan.Render module uses a patched diagrams-graphviz.
 -- Same pin as external/geolog/cabal.project.
 source-repository-package
  type: git
  location: https://github.com/georgefst/diagrams-graphviz.git
  tag: 993533c564861f9d0663d719eafd56efd95f59ba
 jobs: $ncpus
 semaphore: true
--- a/tools/exporter/glog-exporter.cabal
+++ b/tools/exporter/glog-exporter.cabal
@ -0,0 +1,36 @@
 cabal-version:   3.4
 name:            glog-exporter
 version:         0.1.0.0
 license:         MIT OR Apache-2.0
 author:          storage-engine-playground
 synopsis:        Export geolog-lang join plans as JSON for the Rust runner.
 description:
  Builds a FlatTheory + facts + a list of QAtoms for a named scenario,
  runs Geolog.DB.Plan.planConjunction, and emits a JSON document that
  crates/glog-runner consumes. This allows the playground use query-ops and
  storage end-to-end with a real Yannakakis plan produced by the geolog
  frontend, not a hand-written fixture.
 build-type:      Simple
 executable glog-export
  main-is:          Main.hs
  hs-source-dirs:   src
  default-language: GHC2024
  default-extensions:
    BlockArguments
    LambdaCase
    OverloadedRecordDot
    OverloadedStrings
  ghc-options:      -Wall
  build-depends:
    , aeson             >=2.2
    , aeson-pretty      >=0.8
    , algebraic-graphs  >=0.7
    , base
    , bytestring
    , containers
    , geolog-lang
    , text
--- a/tools/exporter/src/Main.hs
+++ b/tools/exporter/src/Main.hs
@ -0,0 +1,252 @@
 -- | Exports a geolog-lang join plan as JSON for the Rust runner in
 -- @crates/glog-runner@.
 --
 -- Invocation:
 --
 -- @
 --   cabal run glog-export -- <scenario> > plan.json
 -- @
 --
 -- Available scenarios: @three-atom-chain@.
 --
 -- The output shape is documented in @crates\/glog-runner\/src\/lib.rs@.
 -- This program is the canonical producer: any change to the IR should
 -- start here, with the Rust runner updated to match.
 module Main (main) where
 import Algebra.Graph qualified as AG
 import Data.Aeson ((.=))
 import Data.Aeson qualified as Aeson
 import Data.Aeson.Encode.Pretty qualified as AesonPretty
 import Data.Aeson.Key qualified as Key
 import Data.ByteString.Lazy.Char8 qualified as LBS8
 import Data.List (sortOn)
 import Data.Map.Strict (Map)
 import Data.Map.Strict qualified as Map
 import Data.Set qualified as Set
 import Data.Text (Text)
 import Data.Text qualified as T
 import Geolog.DB.InMemory
 import Geolog.DB.Plan
 import Geolog.IR qualified as IR
 import System.Environment (getArgs)
 import System.Exit (die)
 import System.IO (hPutStrLn, stderr)
 -- * Scenario plumbing
 --
 -- A scenario fixes a schema, a set of ground facts, and a conjunction of
 -- query atoms. The exporter is intentionally code-driven (not @.glog@
 -- driven): @.glog@ files declare theories, not queries, so the query
 -- side has to live in Haskell either way.
 data Scenario = Scenario
  { scName :: String
  , scTheory :: IR.FlatTheory
  , scFacts :: [(IR.Path, [Val])]
  , scAtoms :: [QAtom]
  }
 -- * three-atom-chain
 --
 -- Mirrors @DB.InMemoryTest@ "matches evalConjunction on three-atom chain".
 -- node = {e1, e2, e3}, edge = {(e1,e2,ee1), (e2,e3,ee2)}.
 -- Conjunction: node(a), edge(a, b, _), edge(b, c, _).
 nodePath, edgePath :: IR.Path
 nodePath = ["node"]
 edgePath = ["edge"]
 threeAtomChain :: Scenario
 threeAtomChain =
  Scenario
    { scName = "three-atom-chain"
    , scTheory =
        IR.FlatTheory
          { tables =
              Map.fromList
                [ (nodePath, IR.Table {columns = [IR.EntityType nodePath], primaryKey = Nothing})
                , (edgePath, IR.Table {columns = [IR.EntityType nodePath, IR.EntityType nodePath, IR.EntityType edgePath], primaryKey = Nothing})
                ]
          , laws = Map.empty
          }
    , scFacts =
        [ (nodePath, [ValEntity nodePath 1])
        , (nodePath, [ValEntity nodePath 2])
        , (nodePath, [ValEntity nodePath 3])
        , (edgePath, [ValEntity nodePath 1, ValEntity nodePath 2, ValEntity edgePath 1])
        , (edgePath, [ValEntity nodePath 2, ValEntity nodePath 3, ValEntity edgePath 2])
        ]
    , scAtoms =
        [ QAtom {qaTable = nodePath, qaRowId = Nothing, qaValues = Map.singleton 0 (QVar (Var "a"))}
        , QAtom {qaTable = edgePath, qaRowId = Nothing, qaValues = Map.fromList [(0, QVar (Var "a")), (1, QVar (Var "b"))]}
        , QAtom {qaTable = edgePath, qaRowId = Nothing, qaValues = Map.fromList [(0, QVar (Var "b")), (1, QVar (Var "c"))]}
        ]
    }
 scenarios :: [Scenario]
 scenarios = [threeAtomChain]
 -- * JSON encoding
 --
 -- The shape mirrors the IR in @crates/glog-runner/src/lib.rs@:
 --
 -- > {
 -- >   "schema": {<name>: <arity>, ...},
 -- >   "facts":  {<name>: [[<value>, ...], ...], ...},
 -- >   "query":  {"root": <id>, "nodes": [{"id": <id>, "action": <action>}, ...]}
 -- > }
 -- | Render a 'Geolog.IR.Path' (a list of 'FNotation.Names.Name') as a flat
 -- string for use as a relation name on the Rust side. Each 'Name' is
 -- already shown with @\/@ between its own init segments and last, so we
 -- reuse 'show' and join Names with @\/@ too.
 pathText :: IR.Path -> Text
 pathText = T.intercalate "/" . map (T.pack . show)
 pathKey :: IR.Path -> Aeson.Key
 pathKey = Key.fromText . pathText
 encodeValue :: Val -> Aeson.Value
 encodeValue =
  Aeson.object . pure . \case
    ValInt n -> "int" .= n
    ValText t -> "str" .= t
    ValEntity p n -> "str" .= (pathText p <> ":" <> T.pack (show n))
 encodeTerm :: QVal -> Aeson.Value
 encodeTerm = \case
  QVar (Var name) -> Aeson.object ["var" .= name]
  QLit v -> Aeson.object ["lit" .= encodeValue v]
 -- | Flatten an atom into one term per stored column, mirroring
 -- @Geolog.DB.InMemory.toFlatArgs@: @qaValues@ keys map to positions
 -- @0..n-2@, @qaRowId@ (if present) maps to position @n-1@, and any
 -- missing positions become wildcard variables with locally-unique names.
 flattenAtom :: Int -> Int -> QAtom -> [Aeson.Value]
 flattenAtom atomIdx arity qa =
  [ encodeTerm (Map.findWithDefault (wildcard atomIdx pos) pos merged)
  | pos <- [0 .. arity - 1]
  ]
  where
    merged = case qa.qaRowId of
      Nothing -> qa.qaValues
      Just v -> Map.insert (arity - 1) v qa.qaValues
    wildcard a p = QVar (Var (T.pack ("_w" <> show a <> "_" <> show p)))
 encodeAtom :: Map IR.Path IR.Table -> Int -> QAtom -> Aeson.Value
 encodeAtom tables atomIdx qa =
  Aeson.object
    [ "table" .= pathText qa.qaTable
    , "columns" .= flattenAtom atomIdx arity qa
    ]
  where
    arity = case Map.lookup qa.qaTable tables of
      Just t -> length t.columns
      Nothing -> error ("encodeAtom: unknown table " <> show qa.qaTable)
 -- | Stable atom indexing keyed by atom identity, so the wildcard names in
 -- @flattenAtom@ are deterministic across runs even if the planner's node
 -- ordering changes.
 atomIndex :: [QAtom] -> Map QAtom Int
 atomIndex atoms = Map.fromList (zip (Set.toList (Set.fromList atoms)) [0 ..])
 encodeJoinOp :: JoinType -> Aeson.Value
 encodeJoinOp = \case
  LeftJoin -> "left"
  RightJoin -> "right"
  NaturalJoin -> "natural"
 encodeNode :: Map IR.Path IR.Table -> Map QAtom Int -> PlanNode -> Aeson.Value
 encodeNode tables idx n =
  Aeson.object
    [ "id" .= n.graphId.unPlanNodeId
    , "action" .= case n.action of
        PlanEvalAtom qa ->
          let i = Map.findWithDefault 0 qa idx
           in Aeson.object ["scan" .= encodeAtom tables i qa]
        PlanJoin jt (PlanNodeId a) (PlanNodeId b) ->
          Aeson.object
            [ "join"
                .= Aeson.object
                  [ "op" .= encodeJoinOp jt
                  , "left" .= a
                  , "right" .= b
                  ]
            ]
    ]
 -- | Render a 'PlanGraph' as the JSON the runner consumes. Empty graphs
 -- produce @{"root": 0, "nodes": []}@, which the runner treats as a
 -- well-formed but empty query.
 encodeQuery :: Map IR.Path IR.Table -> Map QAtom Int -> PlanGraph -> Aeson.Value
 encodeQuery tables idx (PlanGraph g)
  | null nodes =
      Aeson.object ["root" .= (0 :: Int), "nodes" .= ([] :: [Aeson.Value])]
  | otherwise =
      Aeson.object
        [ "root" .= rootId
        , "nodes" .= map (encodeNode tables idx) nodes
        ]
  where
    nodes = sortOn (.graphId.unPlanNodeId) (AG.vertexList g)
    rootId = case graphRoot (PlanGraph g) of
      Just (PlanNodeId i) -> i
      -- Non-empty graph with no topological root means a cycle, which
      -- planConjunction never produces. Fall back to the last id rather
      -- than crashing so a bug here is still inspectable.
      Nothing -> (.graphId.unPlanNodeId) (last nodes)
 encodePlan :: Scenario -> Aeson.Value
 encodePlan sc =
  Aeson.object
    [ "_scenario" .= sc.scName
    , "schema" .= Aeson.object
        [pathKey p .= length t.columns | (p, t) <- Map.toList sc.scTheory.tables]
    , "facts" .= Aeson.object
        [pathKey p .= map (map encodeValue) rows | (p, rows) <- groupedFacts sc.scFacts]
    , "query" .= encodeQuery sc.scTheory.tables (atomIndex sc.scAtoms) (planConjunction sc.scAtoms)
    ]
 -- | Group facts by table while preserving table-first-seen order and
 -- per-table insertion order.
 groupedFacts :: [(IR.Path, [Val])] -> [(IR.Path, [[Val]])]
 groupedFacts = go []
  where
    go acc [] = reverse [(p, reverse rs) | (p, rs) <- acc]
    go acc ((p, row) : rest) =
      let acc' = case break (\(q, _) -> q == p) acc of
            (before, (q, rs) : after) -> before ++ (q, row : rs) : after
            (before, []) -> before ++ [(p, [row])]
       in go acc' rest
 -- * Self-check
 --
 -- Run the planner's @evalConjunctionPlanned@ against the scenario's DB
 -- to confirm the plan we're about to emit is well-formed and produces
 -- non-error output. Catches malformed scenarios before they hand a bad
 -- plan to the Rust runner.
 selfCheck :: Scenario -> IO ()
 selfCheck sc = do
  let db = foldl (\d (p, row) -> insertRow p row d) (fromTheory sc.scTheory) sc.scFacts
  case evalConjunctionPlanned db sc.scAtoms of
    Left err -> die ("self-check failed for " <> sc.scName <> ": " <> show err)
    Right _ -> pure ()
 -- * Entry point
 main :: IO ()
 main = do
  args <- getArgs
  case args of
    [name] -> case lookup name [(s.scName, s) | s <- scenarios] of
      Just sc -> do
        selfCheck sc
        LBS8.putStrLn (AesonPretty.encodePretty (encodePlan sc))
      Nothing ->
        die ("unknown scenario: " <> name <> "\navailable: " <> unwords (map (.scName) scenarios))
    _ -> do
      hPutStrLn stderr "usage: glog-export <scenario>"
      hPutStrLn stderr ("scenarios: " <> unwords (map (.scName) scenarios))
      die ""
		`@ -1 +1 @@`
			`Subproject commit 99d4006f4655d8a6815a9156fe4d9304515f356d`				`Subproject commit 426d4c96d6031ccaf5e14c12c3dab496e3b4c365`