diff --git a/.editorconfig b/.editorconfig index 8958945..9dd1efb 100644 --- a/.editorconfig +++ b/.editorconfig @@ -8,7 +8,7 @@ indent_size = 4 insert_final_newline = true trim_trailing_whitespace = true -[*.rs] +[*.{rs,hs,py}] max_line_length = 100 [*.md] diff --git a/AGENTS.md b/AGENTS.md index 44cf7ae..26b91bf 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -50,7 +50,7 @@ Expected durable areas may include: - `src/`: Rust source for parser, catalog, planner, execution experiments, and storage prototypes. - `tests/`: integration tests for rule planning, evaluation, and storage behavior. -- `examples/`: small runnable Datalog-like programs or storage scenarios. +- `tools/exporter/examples/`: hand-authored scenario JSON consumed by the Haskell exporter to produce runner fixtures. - `fixtures/`: committed input facts and expected outputs. - `notes/`: local design notes that belong to this project. - `flowlog/`: project-local notes or sketches derived from the FlowLog line of work. diff --git a/Cargo.lock b/Cargo.lock index 4e72142..00c6fbb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -555,16 +555,6 @@ dependencies = [ "wasip3", ] -[[package]] -name = "glog-runner" -version = "0.1.0" -dependencies = [ - "query-ops", - "serde", - "serde_json", - "storage", -] - [[package]] name = "guardian" version = "1.3.0" @@ -1156,6 +1146,17 @@ version = "0.3.33" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e" +[[package]] +name = "plan-runner" +version = "0.1.0" +dependencies = [ + "query-ops", + "serde", + "serde_json", + "storage", + "tempfile", +] + [[package]] name = "plotters" version = "0.3.7" diff --git a/Makefile b/Makefile index bb4bb46..3af3df8 100644 --- a/Makefile +++ b/Makefile @@ -77,22 +77,28 @@ clean: ## Remove build output fi EXPORTER_DIR := tools/exporter -EXPORTER_FIXTURES := crates/glog-runner/fixtures -EXPORTER_SCENARIOS := three-atom-chain +EXPORTER_FIXTURES := 
crates/plan-runner/fixtures +EXAMPLES_DIR := $(EXPORTER_DIR)/examples .PHONY: export-fixtures -export-fixtures: ## Regenerate JSON plan fixtures from the Haskell exporter (needs Cabal and GHC; use `make shell` first). +export-fixtures: ## Regenerate plan JSON for every tools/exporter/examples/*.scenario.json (needs Cabal and GHC; use `make shell` first). @if ! command -v cabal >/dev/null 2>&1; then \ echo "cabal not found. Enter the dev shell with 'make shell' (or 'nix develop') first."; \ exit 1; \ fi - @cd $(EXPORTER_DIR) && cabal build glog-export - @for sc in $(EXPORTER_SCENARIOS); do \ - out=$(EXPORTER_FIXTURES)/$$(echo $$sc | tr '-' '_').json; \ + @cd $(EXPORTER_DIR) && cabal build plan-export + @mkdir -p $(EXPORTER_FIXTURES) + @for sc in $(EXAMPLES_DIR)/*.scenario.json; do \ + base=$$(basename $$sc .scenario.json); \ + out=$(EXPORTER_FIXTURES)/$$base.json; \ echo "exporting $$sc -> $$out"; \ - (cd $(EXPORTER_DIR) && cabal run -v0 glog-export -- $$sc) > $$out; \ + (cd $(EXPORTER_DIR) && cabal run -v0 plan-export -- examples/$$base.scenario.json) > $$out; \ done +.PHONY: examples +examples: export-fixtures ## Regenerate fixtures from scenarios and run them through plan-runner against their oracles. + @cargo test -p plan-runner --test examples + .PHONY: shell shell: ## Enter the Nix dev shell defined in flake.nix @nix develop diff --git a/README.md b/README.md index 28e7520..3ff6bb4 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ ## Storage Engine Playground -This repo is a playground for running small experiments related to things like FlowLog, DBSP, Geomerge, etc. +This repo is a playground for running small experiments related to the storage side of things. ### Development @@ -23,3 +23,7 @@ make test # Run tests ``` Run `make help` to see all the available targets. + +### Crates + +Check out the [crates](crates) directory for more details. 
diff --git a/crates/README.md b/crates/README.md index fd57dda..1ce26d8 100644 --- a/crates/README.md +++ b/crates/README.md @@ -1,12 +1,8 @@ -## Crates +## Crate Overview -Each subdirectory should be a normal Cargo package (or crate) with its own `Cargo.toml`. -Something like this: - -```text -crates/ - app-name/ - Cargo.toml - src/ - main.rs -``` +| Crate | Kind | Responsibility | +|-----------------|--------------------|-----------------------------------------------------------------------------------------------------------------| +| `storage` | library | Defines a unified interface to use different storage backends (like Geomerge, SQLite, LMDB, etc.) | +| `query-ops` | library | Provides a set of operators (like different types of joins) to execute a query plan. | +| `plan-runner` | library and binary | Provides a CLI to run a query plan against a given storage backend using the operators provided by `query-ops`. | +| `geomerge-demo` | binary | An example that shows how to write to and read from Geomerge (as a storage backend). | diff --git a/crates/geomerge-demo/docs/diagrams/workflow.svg b/crates/geomerge-demo/docs/diagrams/workflow.svg index aded812..925ad80 100644 --- a/crates/geomerge-demo/docs/diagrams/workflow.svg +++ b/crates/geomerge-demo/docs/diagrams/workflow.svg @@ -1,385 +1,243 @@ + "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"> - - GeomergeDemoWorkflow - - - cluster_inputs - - Inputs - - - - cluster_demo - - geomerge-demo (run_demo) - - - - cluster_loading - - Theory Loading - - - - cluster_store - - Storage and Transaction - - - - cluster_persist - - Persistence Round Trip - - - - cluster_report - - Report - - - - - paths_schema - - - paths.json - - - (compiled schema) - - - - - load_theory - - - load_paths_theory - - - (serde_json) - - - - - paths_schema->load_theory - - - include_str! 
- - - - - fixture_rows - - - Fixture Rows - - - (graphs, vertices, edge) - - - - - transact - - add_paths_data (tx.insert ×7) - - - • insert Graphs rows - - - • insert G0, G1 rows - - - • insert G.V vertices - - • - insert G.E edge - - • - pending RowIds reused as FKs - - - - - fixture_rows->transact - - - - - - flat_theory - - FlatTheory - - • - 10 tables - - • - 12 laws - - - - - load_theory->flat_theory - - - - - - build_store - - - GeomergeStorage::from_theory - - - (Store::try_from_theory) - - - - - flat_theory->build_store - - - - - - demo_report - - DemoReport - - • table_count, law_count - - • graph, vertex, edge counts - - • edge endpoints - - • persisted_bytes - - - - - flat_theory->demo_report - - - counts - - - - - build_store->transact - - - - - - commit - - tx.commit() - - • law validation - - • CommittedTx resolves pending RowIds - - - - - transact->commit - - - - - - assert_edge - - assert_edge_was_stored - - (storage.scan(G.E)) - - - - - commit->assert_edge - - - - - - dump_before - - store.dump() - - (before persist) - - - - - assert_edge->dump_before - - - - - - encode - - pst::encode_store - - -> bytes - - - - - dump_before->encode - - - - - - compare - - dump equality check - - - - - dump_before->compare - - - expected - - - - - decode - - pst::decode_store - - -> restored Store - - - - - encode->decode - - - bytes - - - - - decode->compare - - - - - - compare->demo_report - - - - - - stdout - - stdout - - (println! 
lines) - - - - - demo_report->stdout - - - - + viewBox="0.00 0.00 3020.25 407.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"> + +GeomergeDemoWorkflow + + +cluster_inputs + +Inputs + + +cluster_demo + +geomerge-demo (run_demo) + + +cluster_loading + +Theory Loading + + +cluster_store + +Storage and Transaction + + +cluster_persist + +Persistence Round Trip + + +cluster_report + +Report + + + +paths_schema + +paths.json +(compiled schema) + + + +load_theory + +load_paths_theory +(serde_json) + + + +paths_schema->load_theory + + +include_str! + + + +fixture_rows + +Fixture Rows +(graphs, vertices, edge) + + + +transact + +add_paths_data (tx.insert ×7) +• insert Graphs rows +• insert G0, G1 rows +• insert G.V vertices +• insert G.E edge +• pending RowIds reused as FKs + + + +fixture_rows->transact + + + + + +flat_theory + +FlatTheory +• 10 tables +• 12 laws + + + +load_theory->flat_theory + + + + + +build_store + +GeomergeStorage::from_theory +(Store::try_from_theory) + + + +flat_theory->build_store + + + + + +demo_report + +DemoReport +• table_count, law_count +• graph, vertex, edge counts +• edge endpoints +• persisted_bytes + + + +flat_theory->demo_report + + +counts + + + +build_store->transact + + + + + +commit + +tx.commit() +• law validation +• CommittedTx resolves pending RowIds + + + +transact->commit + + + + + +assert_edge + +assert_edge_was_stored +(storage.scan(G.E)) + + + +commit->assert_edge + + + + + +dump_before + +store.dump() +(before persist) + + + +assert_edge->dump_before + + + + + +encode + +pst::encode_store +-> bytes + + + +dump_before->encode + + + + + +compare + +dump equality check + + + +dump_before->compare + + +expected + + + +decode + +pst::decode_store +-> restored Store + + + +encode->decode + + +bytes + + + +decode->compare + + + + + +compare->demo_report + + + + + +stdout + +stdout +(println! 
lines) + + + +demo_report->stdout + + + + diff --git a/crates/glog-runner/fixtures/three_atom_chain.json b/crates/glog-runner/fixtures/three_atom_chain.json deleted file mode 100644 index c8d3f7e..0000000 --- a/crates/glog-runner/fixtures/three_atom_chain.json +++ /dev/null @@ -1,166 +0,0 @@ -{ - "_scenario": "three-atom-chain", - "facts": { - "edge": [ - [ - { - "str": "node:1" - }, - { - "str": "node:2" - }, - { - "str": "edge:1" - } - ], - [ - { - "str": "node:2" - }, - { - "str": "node:3" - }, - { - "str": "edge:2" - } - ] - ], - "node": [ - [ - { - "str": "node:1" - } - ], - [ - { - "str": "node:2" - } - ], - [ - { - "str": "node:3" - } - ] - ] - }, - "query": { - "nodes": [ - { - "action": { - "scan": { - "columns": [ - { - "var": "a" - }, - { - "var": "b" - }, - { - "var": "_w0_2" - } - ], - "table": "edge" - } - }, - "id": 1 - }, - { - "action": { - "scan": { - "columns": [ - { - "var": "b" - }, - { - "var": "c" - }, - { - "var": "_w1_2" - } - ], - "table": "edge" - } - }, - "id": 2 - }, - { - "action": { - "scan": { - "columns": [ - { - "var": "a" - } - ], - "table": "node" - } - }, - "id": 3 - }, - { - "action": { - "join": { - "left": 1, - "op": "left", - "right": 3 - } - }, - "id": 4 - }, - { - "action": { - "join": { - "left": 2, - "op": "left", - "right": 4 - } - }, - "id": 5 - }, - { - "action": { - "join": { - "left": 5, - "op": "right", - "right": 4 - } - }, - "id": 6 - }, - { - "action": { - "join": { - "left": 6, - "op": "right", - "right": 3 - } - }, - "id": 7 - }, - { - "action": { - "join": { - "left": 6, - "op": "natural", - "right": 7 - } - }, - "id": 8 - }, - { - "action": { - "join": { - "left": 5, - "op": "natural", - "right": 8 - } - }, - "id": 9 - } - ], - "root": 9 - }, - "schema": { - "edge": 3, - "node": 1 - } -} diff --git a/crates/glog-runner/src/lib.rs b/crates/glog-runner/src/lib.rs deleted file mode 100644 index c7da6e4..0000000 --- a/crates/glog-runner/src/lib.rs +++ /dev/null @@ -1,344 +0,0 @@ -//! 
End-to-end runner that executes a `geolog-lang` conjunctive-query plan -//! against this workspace's storage and `query-ops` operators. -//! -//! The upstream Haskell planner in `external/geolog/geolog-lang` -//! (`Geolog.DB.Plan`) builds a Yannakakis-style join DAG over `QAtom`s. This -//! crate accepts that DAG as JSON, materializes the input relations through -//! the [`Storage`] trait, and walks the DAG using -//! [`query_ops::atom::scan_atom`], [`query_ops::join::semijoin`], and -//! [`query_ops::join::natural_join`]. The result is a binding -//! [`Relation`](query_ops::relation::Relation) over the query's variables. -//! -//! The JSON IR mirrors `Geolog.DB.Plan.JoinPlan` and `Geolog.DB.InMemory.QAtom` -//! without depending on the Haskell side at build time. A Haskell exporter -//! that dumps `(schema, facts, JoinPlan)` to this shape is the planned -//! follow-up that completes the round trip; the IR is the contract. -//! -//! Mapping from the Haskell planner: -//! -//! | `Geolog.DB.Plan` | this crate | -//! |-----------------------------|-----------------------------------------------| -//! | `PlanEvalAtom` | [`Action::Scan`] → `scan_atom` | -//! | `PlanJoin LeftJoin a b` | [`Action::Join`] with [`JoinOp::Left`] → `semijoin(rel[a], rel[b])` | -//! | `PlanJoin RightJoin a b` | [`Action::Join`] with [`JoinOp::Right`] → `semijoin(rel[b], rel[a])` | -//! | `PlanJoin NaturalJoin a b` | [`Action::Join`] with [`JoinOp::Natural`] → `natural_join(rel[a], rel[b])` | -//! -//! The atom side covers `evalAtom` (`Geolog.DB.InMemory`): a [`Term::Var`] -//! repeated across positions enforces equality, [`Term::Lit`] filters by -//! constant, and distinct variables project in first-occurrence order. 
- -use std::collections::HashMap; - -use serde::Deserialize; - -use query_ops::atom::{AtomPattern, Term, scan_atom}; -use query_ops::join::{natural_join, semijoin}; -use query_ops::relation::Relation; -use storage::value::Value; -use storage::{MemoryStorage, Storage, StorageError, scan_as_table}; - -/// A single fixture: schema, ground facts, and a query plan to execute. -#[derive(Debug, Clone, Deserialize)] -pub struct Plan { - /// Relation name → arity (column count). - pub schema: HashMap, - /// Relation name → list of ground tuples to insert before execution. - pub facts: HashMap>>, - /// The join DAG itself. - pub query: Query, -} - -/// Mirrors `Geolog.DB.Plan.JoinPlan`: a set of nodes plus the id of the -/// rooted result node. -#[derive(Debug, Clone, Deserialize)] -pub struct Query { - pub root: u32, - pub nodes: Vec, -} - -/// One node of the plan DAG. `id`s are dense within a `Query` but don't need -/// to start at any particular value, mirroring the Haskell `PlanNodeId`. -#[derive(Debug, Clone, Deserialize)] -pub struct Node { - pub id: u32, - pub action: Action, -} - -/// What to compute at a node. Tagged externally so JSON reads as -/// `{"action": {"scan": {...}}}` or `{"action": {"join": {...}}}`. -#[derive(Debug, Clone, Deserialize)] -#[serde(rename_all = "snake_case")] -pub enum Action { - Scan(Atom), - Join(Join), -} - -/// A flat atom pattern, one entry per column of the target relation. -/// Matches the `toFlatArgs` view used by `Geolog.DB.InMemory.evalAtom`: -/// `qaValues` positions are filled in directly, and the entity-id column -/// (if any) appears at the last position. Wildcard positions in the -/// Haskell `QAtom` (a `Map Int QVal` with a missing key) translate to a -/// fresh, unique variable name on this side, which the operator binds but -/// never joins against. 
-#[derive(Debug, Clone, Deserialize)] -pub struct Atom { - pub table: String, - pub columns: Vec, -} - -#[derive(Debug, Clone, Deserialize)] -#[serde(rename_all = "snake_case")] -pub enum JsonTerm { - Var(String), - Lit(JsonValue), -} - -/// Wire-level value tag. Restricted to what `storage::value::Value` carries. -/// Entity identities from the Haskell side (`ValEntity path id`) round-trip -/// through `Str` for now using a `"path:id"` convention; that's a fixture -/// concern, not a runner concern. -#[derive(Debug, Clone, Deserialize)] -#[serde(rename_all = "snake_case")] -pub enum JsonValue { - Int(i64), - Str(String), -} - -#[derive(Debug, Clone, Deserialize)] -pub struct Join { - pub op: JoinOp, - pub left: u32, - pub right: u32, -} - -#[derive(Debug, Clone, Copy, Deserialize)] -#[serde(rename_all = "snake_case")] -pub enum JoinOp { - /// `Geolog.DB.Plan.LeftJoin`: result is `left` rows whose shared columns - /// appear in `right`. Lowered to `semijoin(left, right)`. - Left, - /// `Geolog.DB.Plan.RightJoin`: result is `right` rows whose shared - /// columns appear in `left`. Lowered to `semijoin(right, left)`. - Right, - /// `Geolog.DB.Plan.NaturalJoin`. Lowered to `natural_join(left, right)`. - Natural, -} - -/// Errors a runner can produce in addition to storage failures. -#[derive(Debug)] -pub enum RunError { - /// A fact references a relation that isn't declared in `schema`. - UnknownRelation(String), - /// A node id appears in a `Join` action but no node with that id exists. - MissingNode(u32), - /// `Query.root` doesn't match any node in `nodes`. - MissingRoot(u32), - /// Two nodes share the same id. - DuplicateNode(u32), - /// A join node references its left or right side before that side has - /// been computed: the DAG isn't actually topologically sorted by id, or - /// it has a cycle. - UnresolvedDependency { node: u32, depends_on: u32 }, - /// Storage layer rejected an operation. 
- Storage(StorageError), -} - -impl std::fmt::Display for RunError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Self::UnknownRelation(name) => { - write!(f, "facts reference relation {name:?} not in schema") - } - Self::MissingNode(id) => write!(f, "plan references missing node id {id}"), - Self::MissingRoot(id) => write!(f, "plan root id {id} matches no node"), - Self::DuplicateNode(id) => write!(f, "duplicate node id {id} in plan"), - Self::UnresolvedDependency { node, depends_on } => write!( - f, - "node {node} depends on {depends_on}, which has not been computed yet" - ), - Self::Storage(err) => write!(f, "storage error: {err}"), - } - } -} - -impl std::error::Error for RunError { - fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { - match self { - Self::Storage(err) => Some(err), - _ => None, - } - } -} - -impl From for RunError { - fn from(err: StorageError) -> Self { - Self::Storage(err) - } -} - -impl From for Value { - fn from(jv: JsonValue) -> Self { - match jv { - JsonValue::Int(n) => Self::Int(n), - JsonValue::Str(s) => Self::Str(s), - } - } -} - -impl From for Term { - fn from(t: JsonTerm) -> Self { - match t { - JsonTerm::Var(name) => Self::Var(name), - JsonTerm::Lit(value) => Self::Lit(value.into()), - } - } -} - -/// Parse a [`Plan`] from a JSON string. -/// -/// # Errors -/// Returns a [`serde_json::Error`] if the input isn't valid JSON in the -/// expected shape. -pub fn parse_plan(json: &str) -> Result { - serde_json::from_str(json) -} - -/// Load schema and facts from a [`Plan`] into a fresh [`MemoryStorage`]. -/// -/// All facts are inserted in a single transaction; commit is atomic so a -/// failure on row N leaves the storage empty. -/// -/// # Errors -/// Returns [`RunError::UnknownRelation`] if facts mention a relation not -/// declared in `schema`. Wraps storage failures (arity mismatch, transaction -/// errors) in [`RunError::Storage`]. 
-pub fn load_into_memory(plan: &Plan) -> Result { - let mut storage = MemoryStorage::default(); - for (name, arity) in &plan.schema { - storage.create_relation(name, *arity)?; - } - { - let mut tx = storage.transaction()?; - for (name, rows) in &plan.facts { - if !plan.schema.contains_key(name) { - return Err(RunError::UnknownRelation(name.clone())); - } - for row in rows { - let cells: Vec = row.iter().cloned().map(Value::from).collect(); - tx.insert(name, cells)?; - } - } - let _ = tx.commit()?; - } - Ok(storage) -} - -/// Execute a plan against a storage backend, returning the bindings -/// [`Relation`] for the rooted plan node. -/// -/// Nodes are executed in ascending `id` order. For a Yannakakis plan as -/// emitted by `Geolog.DB.Plan` this is equivalent to a topological sort, -/// since `insertJoin` only references node ids that have already been -/// allocated. A non-monotone id ordering is rejected with -/// [`RunError::UnresolvedDependency`]. -/// -/// # Errors -/// Returns [`RunError::DuplicateNode`] for repeated ids, -/// [`RunError::MissingNode`] for join references to unknown ids, -/// [`RunError::MissingRoot`] if `query.root` isn't present, and storage -/// errors during the per-scan `scan_as_table` call. 
-pub fn execute(storage: &S, query: &Query) -> Result { - let mut seen_ids: std::collections::HashSet = - std::collections::HashSet::with_capacity(query.nodes.len()); - for node in &query.nodes { - if !seen_ids.insert(node.id) { - return Err(RunError::DuplicateNode(node.id)); - } - } - if !seen_ids.contains(&query.root) { - return Err(RunError::MissingRoot(query.root)); - } - - let mut ordered: Vec<&Node> = query.nodes.iter().collect(); - ordered.sort_by_key(|n| n.id); - - let mut results: HashMap = HashMap::with_capacity(ordered.len()); - for node in ordered { - let computed = match &node.action { - Action::Scan(atom) => { - let table = scan_as_table(storage, &atom.table)?; - let pattern = AtomPattern { - columns: atom.columns.iter().cloned().map(Term::from).collect(), - }; - scan_atom(&table, &pattern) - } - Action::Join(join) => { - let left = require_dep(&results, &seen_ids, node.id, join.left)?; - let right = require_dep(&results, &seen_ids, node.id, join.right)?; - match join.op { - JoinOp::Left => semijoin(left, right), - JoinOp::Right => semijoin(right, left), - JoinOp::Natural => natural_join(left, right), - } - } - }; - results.insert(node.id, computed); - } - - results - .remove(&query.root) - .ok_or(RunError::MissingRoot(query.root)) -} - -fn require_dep<'a>( - results: &'a HashMap, - seen: &std::collections::HashSet, - node: u32, - depends_on: u32, -) -> Result<&'a Relation, RunError> { - if let Some(rel) = results.get(&depends_on) { - Ok(rel) - } else if seen.contains(&depends_on) { - Err(RunError::UnresolvedDependency { node, depends_on }) - } else { - Err(RunError::MissingNode(depends_on)) - } -} - -/// Convenience: parse JSON, load it into a fresh in-memory storage, and -/// execute, returning the root binding relation. -/// -/// # Errors -/// Returns a JSON parse error if the input is malformed, or a [`RunError`] -/// for any later step. 
-pub fn run_json(json: &str) -> Result { - let plan = parse_plan(json).map_err(RunFromJsonError::Parse)?; - let storage = load_into_memory(&plan).map_err(RunFromJsonError::Run)?; - let bindings = execute(&storage, &plan.query).map_err(RunFromJsonError::Run)?; - Ok(bindings) -} - -/// Combined error from [`run_json`]. -#[derive(Debug)] -pub enum RunFromJsonError { - Parse(serde_json::Error), - Run(RunError), -} - -impl std::fmt::Display for RunFromJsonError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Self::Parse(err) => write!(f, "parse error: {err}"), - Self::Run(err) => write!(f, "run error: {err}"), - } - } -} - -impl std::error::Error for RunFromJsonError { - fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { - match self { - Self::Parse(err) => Some(err), - Self::Run(err) => Some(err), - } - } -} diff --git a/crates/glog-runner/src/main.rs b/crates/glog-runner/src/main.rs deleted file mode 100644 index 2c121ec..0000000 --- a/crates/glog-runner/src/main.rs +++ /dev/null @@ -1,59 +0,0 @@ -//! `glog-run` CLI: read a JSON plan from a file (or stdin if `-`), execute -//! it against a fresh in-memory store, and print the resulting binding -//! relation as JSON on stdout. 
- -use std::io::{self, Read}; -use std::process::ExitCode; - -fn main() -> ExitCode { - let mut args = std::env::args().skip(1); - let Some(path) = args.next() else { - eprintln!("usage: glog-run "); - return ExitCode::from(2); - }; - - let input = match read_input(&path) { - Ok(s) => s, - Err(err) => { - eprintln!("failed to read {path}: {err}"); - return ExitCode::from(1); - } - }; - - let relation = match glog_runner::run_json(&input) { - Ok(r) => r, - Err(err) => { - eprintln!("{err}"); - return ExitCode::from(1); - } - }; - - let payload = serde_json::json!({ - "columns": relation.columns, - "rows": relation - .rows - .iter() - .map(|row| row.iter().map(value_to_json).collect::>()) - .collect::>(), - }); - println!("{payload}"); - ExitCode::SUCCESS -} - -fn read_input(path: &str) -> io::Result { - if path == "-" { - let mut buf = String::new(); - io::stdin().read_to_string(&mut buf)?; - Ok(buf) - } else { - std::fs::read_to_string(path) - } -} - -fn value_to_json(value: &storage::value::Value) -> serde_json::Value { - match value { - storage::value::Value::Int(n) => serde_json::Value::Number((*n).into()), - storage::value::Value::Str(s) => serde_json::Value::String(s.clone()), - storage::value::Value::Id(id) => serde_json::Value::String(id.to_string()), - } -} diff --git a/crates/glog-runner/tests/three_atom_chain.rs b/crates/glog-runner/tests/three_atom_chain.rs deleted file mode 100644 index 9a318ea..0000000 --- a/crates/glog-runner/tests/three_atom_chain.rs +++ /dev/null @@ -1,73 +0,0 @@ -//! End-to-end check: run the JSON fixture and verify the resulting bindings -//! match the `DB.InMemoryTest` "matches evalConjunction on three-atom chain" -//! case from `external/geolog/geolog-lang/test/DB/InMemoryTest.hs`. -//! -//! For `node = {e1, e2, e3}` and `edge = {(e1,e2,ee1), (e2,e3,ee2)}` the -//! conjunction `node(a), edge(a, b, _), edge(b, c, _)` has exactly one -//! solution: `(a=e1, b=e2, c=e3)`. 
- -use std::collections::BTreeMap; - -use glog_runner::run_json; -use storage::value::Value; - -fn fixture() -> &'static str { - include_str!("../fixtures/three_atom_chain.json") -} - -fn ent(path: &str, id: u32) -> Value { - Value::Str(format!("{path}:{id}")) -} - -fn project<'a>( - columns: &'a [String], - row: &'a [Value], - keep: &'a [&'a str], -) -> BTreeMap<&'a str, &'a Value> { - keep.iter() - .map(|name| { - let pos = columns - .iter() - .position(|c| c == name) - .expect("column missing"); - (*name, &row[pos]) - }) - .collect() -} - -#[test] -fn three_atom_chain_matches_haskell_oracle() { - let result = run_json(fixture()).expect("fixture should execute"); - - // The plan's root keeps every variable, including the per-atom wildcards - // `_r1` and `_r2`. The oracle only asserts the (a, b, c) projection. - let keep = ["a", "b", "c"]; - let mut projected: Vec> = result - .rows - .iter() - .map(|row| project(&result.columns, row, &keep)) - .collect(); - projected.sort_by_key(|m| format!("{m:?}")); - - let e1 = ent("node", 1); - let e2 = ent("node", 2); - let e3 = ent("node", 3); - let expected = vec![BTreeMap::from([("a", &e1), ("b", &e2), ("c", &e3)])]; - - assert_eq!(projected, expected); -} - -#[test] -fn root_columns_cover_a_b_c_plus_two_wildcards() { - // The exporter emits unique wildcard variable names for the entity-id - // column of each edge atom (e.g. `_w0_2`, `_w1_2`); their exact spelling - // is an implementation detail of the exporter, so this test only checks - // that the named variables are all present and that the total column - // count is the three named ones plus two anonymous wildcards. 
- let result = run_json(fixture()).expect("fixture should execute"); - let cols: std::collections::HashSet<&str> = result.columns.iter().map(String::as_str).collect(); - for expected in ["a", "b", "c"] { - assert!(cols.contains(expected), "missing column {expected}"); - } - assert_eq!(result.columns.len(), 5, "expected 3 named + 2 wildcards"); -} diff --git a/crates/glog-runner/Cargo.toml b/crates/plan-runner/Cargo.toml similarity index 62% rename from crates/glog-runner/Cargo.toml rename to crates/plan-runner/Cargo.toml index 29b75d5..5663f3f 100644 --- a/crates/glog-runner/Cargo.toml +++ b/crates/plan-runner/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "glog-runner" +name = "plan-runner" version = "0.1.0" edition.workspace = true license.workspace = true @@ -9,11 +9,18 @@ rust-version.workspace = true workspace = true [dependencies] -storage = { path = "../storage" } query-ops = { path = "../query-ops" } +storage = { path = "../storage", features = [ + "lmdb", + "redb", + "fjall", + "sqlite", + "geomerge", +] } serde = { version = "1", features = ["derive"] } serde_json = "1" +tempfile = "3" [[bin]] -name = "glog-run" +name = "plan-run" path = "src/main.rs" diff --git a/crates/plan-runner/README.md b/crates/plan-runner/README.md new file mode 100644 index 0000000..5c9c387 --- /dev/null +++ b/crates/plan-runner/README.md @@ -0,0 +1,101 @@ +## Plan Runner + +This crate implements an executor for (conjunctive) query plans. +The implementation is a CLI tool. +It reads a JSON plan (which currently is a DAG of scan and join nodes plus the input facts), +walks the DAG using the operators from [`query-ops`](../query-ops), +and prints the resulting relation as JSON to stdout. 
+ +### Pipeline + +End-to-end, scenarios become runner output through three stages: + +```text +tools/exporter/examples/*.scenario.json + └── (Haskell exporter; runs Geolog.DB.Plan.planConjunction + and Geolog.DB.InMemory.evalConjunctionPlanned as a self-check) + └── crates/plan-runner/fixtures/*.json (JSON IR; checked in) + └── (plan-runner; this crate) + └── stdout JSON, with row-for-row oracle check +``` + +The exporter (`tools/exporter`) is the only producer of runner IR today; +it's where atoms are planned and rejected if they don't fit the supported subset. +Fixtures are regenerated with `make export-fixtures`, and the full loop is `make examples`. + +What happens inside the runner once a JSON plan arrives: + +
+ + Workflow + +
+ +### Storage Backends + +The CLI takes a `--backend` flag. +The `memory` backend is the pure in-memory path; +every other backend routes facts through the [`Storage`](../storage) trait +via `build_tables_via_storage`, then scans tables back out before executing. + +| Backend | Storage | Location | +|------------------|-------------------|-----------------------| +| `memory` | none | n/a | +| `memory-storage` | `MemoryStorage` | in-process | +| `lmdb` | `LmdbStorage` | fresh tempdir per run | +| `redb` | `RedbStorage` | fresh tempdir per run | +| `fjall` | `FjallStorage` | fresh tempdir per run | +| `sqlite` | `SqliteStorage` | fresh tempdir per run | +| `geomerge` | `GeomergeStorage` | in-process | + +### Execute a Query Plan + +```sh +# Run a plan with the default backend (no storage) +cargo run -p plan-runner -- crates/plan-runner/fixtures/two_atom_join.json + +# Run the same plan with every supported backend +cargo run -p plan-runner -- --backend memory-storage crates/plan-runner/fixtures/two_atom_join.json +cargo run -p plan-runner -- --backend lmdb crates/plan-runner/fixtures/two_atom_join.json +cargo run -p plan-runner -- --backend redb crates/plan-runner/fixtures/two_atom_join.json +cargo run -p plan-runner -- --backend fjall crates/plan-runner/fixtures/two_atom_join.json +cargo run -p plan-runner -- --backend sqlite crates/plan-runner/fixtures/two_atom_join.json +cargo run -p plan-runner -- --backend geomerge crates/plan-runner/fixtures/two_atom_join.json +``` + +A sample run: + +```sh +$ plan-run crates/plan-runner/fixtures/two_atom_join.json +{"columns":["a","b","_w0_2"],"rows":[["node:1","node:2","edge:1"],["node:2","node:1","edge:2"]]} +``` + +The `_w<i>_<j>` columns (such as `_w0_2` above) are wildcards the exporter named so the runner can bind them. +The scenario's `expected_bindings` block names only the variables the test cares about, +and `verify` projects the runner output to that subset before comparing as a multiset. 
+ +### Run the Tests + +```sh +cargo test -p plan-runner +``` + +### Notes + +- **IR contract.** + The runner is backend-agnostic and frontend-agnostic. + It consumes JSON in the shape documented in `src/lib.rs` and produces a binding relation. + Anything that emits the same JSON can drive it. +- **No optimizer.** + Plans are executed as written. + Node ordering, join shape, and antijoin scheduling are all the producer's responsibility. + This crate's job ends at faithful execution of the IR. +- **Wildcard columns survive.** + `scan_atom` keeps every distinct variable that appears in the pattern, + including the exporter's synthetic `_w<i>_<j>` names (for example `_w0_2`). + The runner does not project them out; + oracle verification handles that on the comparison side. +- **Bulk, not streaming.** + Each node materializes its full output as a `Relation`. + This matches `query-ops`' execution model; + it's not designed for incremental or maintained-view workloads. diff --git a/crates/plan-runner/docs/diagrams/make_figures.sh b/crates/plan-runner/docs/diagrams/make_figures.sh new file mode 100755 index 0000000..6d30150 --- /dev/null +++ b/crates/plan-runner/docs/diagrams/make_figures.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +# You need to have Graphviz installed to run this script +# On Debian-based OSes, you can install it using: sudo apt-get install graphviz + +# Directory containing .dot files. Defaults to the script's own directory so the +# script works regardless of the caller's working directory. 
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +ASSET_DIR=${1:-"${SCRIPT_DIR}"} + +# Make figures from .dot files +for f in "${ASSET_DIR}"/*.dot; do + dot -Tsvg "$f" -o "${f%.dot}.svg" +done diff --git a/crates/plan-runner/docs/diagrams/workflow.dot b/crates/plan-runner/docs/diagrams/workflow.dot new file mode 100644 index 0000000..831c94b --- /dev/null +++ b/crates/plan-runner/docs/diagrams/workflow.dot @@ -0,0 +1,136 @@ +digraph PlanRunnerWorkflow { +fontname = "Helvetica,Arial,sans-serif" +layout = dot +rankdir = LR +ranksep = 0.9; +nodesep = 0.7; +splines = true; +compound = true; +bgcolor = "white" + +node [ +fontname = "Helvetica,Arial,sans-serif", +shape = box, +style = "filled,rounded", +color = "#555555", +fillcolor = "white", +penwidth = 1.5 +] +edge [ +fontname = "Helvetica,Arial,sans-serif", +color = "#333333", +fontsize = 9, +fontcolor = "#555555", +labeldistance = 2.0, +penwidth = 1.2 +] + +subgraph cluster_input { +label = "Input" +style = "dashed" +color = "#888888" +fontcolor = "#555555" +margin = 18 +json_plan [label = < + + + + + +
JSON Plan
• schema: name -> arity
• facts: name -> rows
• query: { root, nodes }
• expected_bindings (optional oracle)
>, fillcolor = "#E8F4FD", color = "#2196F3"] +} + +subgraph cluster_parse { +label = "Parse" +style = "dashed" +color = "#9C27B0" +fontcolor = "#7B1FA2" +margin = 14 +parse_plan [label = "parse_plan(json)\n-> Plan", fillcolor = "#F3E5F5", color = "#9C27B0"] +} + +subgraph cluster_load { +label = "Load Tables (--backend selects the path)" +style = "dashed" +color = "#4CAF50" +fontcolor = "#388E3C" +margin = 14 +build_pure [label = < + + + +
build_tables(plan)
--backend memory
direct from plan.facts
>, fillcolor = "#E8F5E9", color = "#4CAF50"] +build_storage [label = < + + + + + +
build_tables_via_storage<S: Storage>
--backend memory-storage
--backend lmdb / redb / fjall
--backend sqlite / geomerge
create_relation → tx.insert → scan_as_table
>, fillcolor = "#E8F5E9", color = "#4CAF50"] +tables_map [label = < + + +
HashMap<String, Table>
positional rows per relation
>, fillcolor = "#E8F4FD", color = "#2196F3"] +} + +subgraph cluster_execute { +label = "Execute (walk node DAG in id order)" +style = "dashed" +color = "#FF9800" +fontcolor = "#F57C00" +margin = 14 +execute_node [label = < + + + + + + +
execute(tables, query)
Action::Scan → scan_atom
Action::Join Left → semijoin(l, r)
Action::Join Right → semijoin(r, l)
Action::Join Natural → natural_join(l, r)
cache per-node Relation; return root
>, fillcolor = "#FFF3E0", color = "#FF9800"] +relation_out [label = < + + + +
Relation
columns: variables + wildcards
rows: bindings
>, fillcolor = "#FFF3E0", color = "#FF9800"] +} + +subgraph cluster_verify { +label = "Verify (when expected_bindings is present)" +style = "dashed" +color = "#9C27B0" +fontcolor = "#7B1FA2" +margin = 14 +verify_node [label = < + + + +
verify(plan, relation)
project to expected.columns
multiset compare against expected.rows
>, fillcolor = "#F3E5F5", color = "#9C27B0"] +} + +subgraph cluster_output { +label = "Output" +style = "dashed" +color = "#888888" +fontcolor = "#555555" +margin = 18 +stdout_json [label = < + + +
stdout JSON
{ columns, rows }
>, fillcolor = "#ECEFF1", color = "#607D8B"] +oracle_pass [label = "Ok(true) / VerifyError\n(used by tests/examples.rs)", fillcolor = "#ECEFF1", color = "#607D8B"] +} + +// Pipeline edges +json_plan -> parse_plan [color = "#2196F3"] +parse_plan -> build_pure [label = "Backend::Memory", color = "#9C27B0"] +parse_plan -> build_storage [label = "Backend::*Storage", color = "#9C27B0"] +build_pure -> tables_map [color = "#4CAF50"] +build_storage -> tables_map [color = "#4CAF50"] +tables_map -> execute_node [color = "#2196F3"] +parse_plan -> execute_node [style = "dashed", label = "plan.query", color = "#9C27B0"] +execute_node -> relation_out [color = "#FF9800"] +relation_out -> stdout_json [color = "#607D8B"] +relation_out -> verify_node [style = "dashed", color = "#FF9800"] +parse_plan -> verify_node [style = "dashed", label = "plan.expected_bindings", color = "#9C27B0"] +verify_node -> oracle_pass [color = "#9C27B0"] +} diff --git a/crates/plan-runner/docs/diagrams/workflow.svg b/crates/plan-runner/docs/diagrams/workflow.svg new file mode 100644 index 0000000..11a0ee4 --- /dev/null +++ b/crates/plan-runner/docs/diagrams/workflow.svg @@ -0,0 +1,202 @@ + + + + + + +PlanRunnerWorkflow + + +cluster_input + +Input + + +cluster_parse + +Parse + + +cluster_load + +Load Tables  (--backend selects the path) + + +cluster_execute + +Execute  (walk node DAG in id order) + + +cluster_verify + +Verify  (when expected_bindings is present) + + +cluster_output + +Output + + + +json_plan + +JSON Plan +• schema: name -> arity +• facts: name -> rows +• query: { root, nodes } +• expected_bindings (optional oracle) + + + +parse_plan + +parse_plan(json) +-> Plan + + + +json_plan->parse_plan + + + + + +build_pure + +build_tables(plan) +--backend memory +direct from plan.facts + + + +parse_plan->build_pure + + +Backend::Memory + + + +build_storage + +build_tables_via_storage<S: Storage> +--backend memory-storage +--backend lmdb / redb / fjall +--backend sqlite / geomerge +create_relation → 
tx.insert → scan_as_table + + + +parse_plan->build_storage + + +Backend::*Storage + + + +execute_node + +execute(tables, query) +Action::Scan  → scan_atom +Action::Join Left  → semijoin(l, r) +Action::Join Right → semijoin(r, l) +Action::Join Natural → natural_join(l, r) +cache per-node Relation; return root + + + +parse_plan->execute_node + + +plan.query + + + +verify_node + +verify(plan, relation) +project to expected.columns +multiset compare against expected.rows + + + +parse_plan->verify_node + + +plan.expected_bindings + + + +tables_map + +HashMap<String, Table> +positional rows per relation + + + +build_pure->tables_map + + + + + +build_storage->tables_map + + + + + +tables_map->execute_node + + + + + +relation_out + +Relation +columns: variables + wildcards +rows: bindings + + + +execute_node->relation_out + + + + + +relation_out->verify_node + + + + + +stdout_json + +stdout JSON +{ columns, rows } + + + +relation_out->stdout_json + + + + + +oracle_pass + +Ok(true)  /  VerifyError +(used by tests/examples.rs) + + + +verify_node->oracle_pass + + + + + diff --git a/crates/plan-runner/fixtures/cartesian.json b/crates/plan-runner/fixtures/cartesian.json new file mode 100644 index 0000000..ea4188e --- /dev/null +++ b/crates/plan-runner/fixtures/cartesian.json @@ -0,0 +1,114 @@ +{ + "_scenario": "cartesian", + "expected_bindings": { + "columns": [ + "a", + "b" + ], + "rows": [ + [ + { + "str": "left:1" + }, + { + "str": "right:10" + } + ], + [ + { + "str": "left:1" + }, + { + "str": "right:20" + } + ], + [ + { + "str": "left:2" + }, + { + "str": "right:10" + } + ], + [ + { + "str": "left:2" + }, + { + "str": "right:20" + } + ] + ] + }, + "facts": { + "left": [ + [ + { + "str": "left:1" + } + ], + [ + { + "str": "left:2" + } + ] + ], + "right": [ + [ + { + "str": "right:10" + } + ], + [ + { + "str": "right:20" + } + ] + ] + }, + "query": { + "nodes": [ + { + "action": { + "scan": { + "columns": [ + { + "var": "a" + } + ], + "table": "left" + } + }, + "id": 1 + }, 
+ { + "action": { + "scan": { + "columns": [ + { + "var": "b" + } + ], + "table": "right" + } + }, + "id": 2 + }, + { + "action": { + "join": { + "left": 1, + "op": "natural", + "right": 2 + } + }, + "id": 3 + } + ], + "root": 3 + }, + "schema": { + "left": 1, + "right": 1 + } +} diff --git a/crates/plan-runner/fixtures/self_loop.json b/crates/plan-runner/fixtures/self_loop.json new file mode 100644 index 0000000..f0b4fca --- /dev/null +++ b/crates/plan-runner/fixtures/self_loop.json @@ -0,0 +1,84 @@ +{ + "_scenario": "self-loop", + "expected_bindings": { + "columns": [ + "x" + ], + "rows": [ + [ + { + "str": "node:2" + } + ], + [ + { + "str": "node:3" + } + ] + ] + }, + "facts": { + "edge": [ + [ + { + "str": "node:1" + }, + { + "str": "node:2" + }, + { + "str": "edge:1" + } + ], + [ + { + "str": "node:2" + }, + { + "str": "node:2" + }, + { + "str": "edge:2" + } + ], + [ + { + "str": "node:3" + }, + { + "str": "node:3" + }, + { + "str": "edge:3" + } + ] + ] + }, + "query": { + "nodes": [ + { + "action": { + "scan": { + "columns": [ + { + "var": "x" + }, + { + "var": "x" + }, + { + "var": "_w0_2" + } + ], + "table": "edge" + } + }, + "id": 1 + } + ], + "root": 1 + }, + "schema": { + "edge": 3 + } +} diff --git a/crates/plan-runner/fixtures/three_atom_chain.json b/crates/plan-runner/fixtures/three_atom_chain.json new file mode 100644 index 0000000..4d0812a --- /dev/null +++ b/crates/plan-runner/fixtures/three_atom_chain.json @@ -0,0 +1,186 @@ +{ + "_scenario": "three-atom-chain", + "expected_bindings": { + "columns": [ + "a", + "b", + "c" + ], + "rows": [ + [ + { + "str": "node:1" + }, + { + "str": "node:2" + }, + { + "str": "node:3" + } + ] + ] + }, + "facts": { + "edge": [ + [ + { + "str": "node:1" + }, + { + "str": "node:2" + }, + { + "str": "edge:1" + } + ], + [ + { + "str": "node:2" + }, + { + "str": "node:3" + }, + { + "str": "edge:2" + } + ] + ], + "node": [ + [ + { + "str": "node:1" + } + ], + [ + { + "str": "node:2" + } + ], + [ + { + "str": "node:3" + } + 
] + ] + }, + "query": { + "nodes": [ + { + "action": { + "scan": { + "columns": [ + { + "var": "a" + }, + { + "var": "b" + }, + { + "var": "_w0_2" + } + ], + "table": "edge" + } + }, + "id": 1 + }, + { + "action": { + "scan": { + "columns": [ + { + "var": "b" + }, + { + "var": "c" + }, + { + "var": "_w1_2" + } + ], + "table": "edge" + } + }, + "id": 2 + }, + { + "action": { + "scan": { + "columns": [ + { + "var": "a" + } + ], + "table": "node" + } + }, + "id": 3 + }, + { + "action": { + "join": { + "left": 1, + "op": "left", + "right": 3 + } + }, + "id": 4 + }, + { + "action": { + "join": { + "left": 2, + "op": "left", + "right": 4 + } + }, + "id": 5 + }, + { + "action": { + "join": { + "left": 5, + "op": "right", + "right": 4 + } + }, + "id": 6 + }, + { + "action": { + "join": { + "left": 6, + "op": "right", + "right": 3 + } + }, + "id": 7 + }, + { + "action": { + "join": { + "left": 6, + "op": "natural", + "right": 7 + } + }, + "id": 8 + }, + { + "action": { + "join": { + "left": 5, + "op": "natural", + "right": 8 + } + }, + "id": 9 + } + ], + "root": 9 + }, + "schema": { + "edge": 3, + "node": 1 + } +} diff --git a/crates/plan-runner/fixtures/two_atom_join.json b/crates/plan-runner/fixtures/two_atom_join.json new file mode 100644 index 0000000..a3e4ab4 --- /dev/null +++ b/crates/plan-runner/fixtures/two_atom_join.json @@ -0,0 +1,136 @@ +{ + "_scenario": "two-atom-join", + "expected_bindings": { + "columns": [ + "a", + "b" + ], + "rows": [ + [ + { + "str": "node:1" + }, + { + "str": "node:2" + } + ], + [ + { + "str": "node:2" + }, + { + "str": "node:1" + } + ] + ] + }, + "facts": { + "edge": [ + [ + { + "str": "node:1" + }, + { + "str": "node:2" + }, + { + "str": "edge:1" + } + ], + [ + { + "str": "node:2" + }, + { + "str": "node:1" + }, + { + "str": "edge:2" + } + ] + ], + "node": [ + [ + { + "str": "node:1" + } + ], + [ + { + "str": "node:2" + } + ] + ] + }, + "query": { + "nodes": [ + { + "action": { + "scan": { + "columns": [ + { + "var": "a" + }, + { + 
"var": "b" + }, + { + "var": "_w0_2" + } + ], + "table": "edge" + } + }, + "id": 1 + }, + { + "action": { + "scan": { + "columns": [ + { + "var": "a" + } + ], + "table": "node" + } + }, + "id": 2 + }, + { + "action": { + "join": { + "left": 1, + "op": "left", + "right": 2 + } + }, + "id": 3 + }, + { + "action": { + "join": { + "left": 3, + "op": "right", + "right": 2 + } + }, + "id": 4 + }, + { + "action": { + "join": { + "left": 3, + "op": "natural", + "right": 4 + } + }, + "id": 5 + } + ], + "root": 5 + }, + "schema": { + "edge": 3, + "node": 1 + } +} diff --git a/crates/plan-runner/src/lib.rs b/crates/plan-runner/src/lib.rs new file mode 100644 index 0000000..7cdcf28 --- /dev/null +++ b/crates/plan-runner/src/lib.rs @@ -0,0 +1,540 @@ +//! Snapshot executor for conjunctive-query plans. +//! +//! Takes a structural plan (a DAG of `Scan` and `Join` nodes), the input +//! tables it scans, and walks the DAG via [`query_ops::atom::scan_atom`], +//! [`query_ops::join::semijoin`], and [`query_ops::join::natural_join`]. +//! The result is a binding [`Relation`](query_ops::relation::Relation) over +//! the query's variables. +//! +//! The runner is intentionally backend-agnostic: it depends only on +//! `query-ops`, and the planner that emits the JSON IR is decoupled from +//! the storage backend that produced the facts. To execute a plan against +//! a [`Storage`](storage::Storage) backend, materialize each input table +//! with [`storage::scan_as_table`] and call [`execute`] with the resulting +//! map. The in-tree `tests/storage_roundtrip.rs` is the canonical example. +//! +//! The JSON IR mirrors `Geolog.DB.Plan.PlanGraph` and +//! `Geolog.DB.InMemory.QAtom` from the `external/geolog` submodule, but the +//! shape is the contract: any frontend that emits this JSON can use the +//! runner. +//! +//! Operator mapping from the Haskell planner: +//! +//! | `Geolog.DB.Plan` | this crate | +//! 
|-----------------------------|-----------------------------------------------| +//! | `PlanEvalAtom` | [`Action::Scan`] → `scan_atom` | +//! | `PlanJoin LeftJoin a b` | [`Action::Join`] with [`JoinOp::Left`] → `semijoin(rel[a], rel[b])` | +//! | `PlanJoin RightJoin a b` | [`Action::Join`] with [`JoinOp::Right`] → `semijoin(rel[b], rel[a])` | +//! | `PlanJoin NaturalJoin a b` | [`Action::Join`] with [`JoinOp::Natural`] → `natural_join(rel[a], rel[b])` | +//! +//! The atom side covers `evalAtom` (`Geolog.DB.InMemory`): a [`Term::Var`] +//! repeated across positions enforces equality, [`Term::Lit`] filters by +//! constant, and distinct variables project in first-occurrence order. + +use std::collections::HashMap; + +use serde::Deserialize; + +use query_ops::atom::{AtomPattern, Term, scan_atom}; +use query_ops::join::{natural_join, semijoin}; +use query_ops::relation::Relation; +use storage::table::Table; +use storage::value::Value; +use storage::{Storage, StorageError, scan_as_table}; + +/// A single fixture: schema, ground facts, and a query plan to execute. +#[derive(Debug, Clone, Deserialize)] +pub struct Plan { + /// Relation name → arity (column count). + pub schema: HashMap, + /// Relation name → list of ground tuples to insert before execution. + pub facts: HashMap>>, + /// The join DAG itself. + pub query: Query, + /// Optional oracle: if present, [`verify`] cross-checks an executed + /// [`Relation`] against this projection. The exporter lifts the + /// scenario's `expected_bindings` block into this field. + #[serde(default)] + pub expected_bindings: Option, +} + +/// Expected query result, projected to a named subset of variables. The +/// columns named here must all appear in the runner's output; any extra +/// columns (typically per-atom wildcards) are ignored. 
+#[derive(Debug, Clone, Deserialize)] +pub struct ExpectedBindings { + pub columns: Vec, + pub rows: Vec>, +} + +/// Mirrors `Geolog.DB.Plan.PlanGraph`: a set of nodes plus the id of the +/// rooted result node (the last node in topological order). +#[derive(Debug, Clone, Deserialize)] +pub struct Query { + pub root: u32, + pub nodes: Vec, +} + +/// One node of the plan DAG. `id`s don't need to start at any particular +/// value, mirroring the Haskell `PlanNodeId`. +#[derive(Debug, Clone, Deserialize)] +pub struct Node { + pub id: u32, + pub action: Action, +} + +/// What to compute at a node. Tagged externally so JSON reads as +/// `{"action": {"scan": {...}}}` or `{"action": {"join": {...}}}`. +#[derive(Debug, Clone, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum Action { + Scan(Atom), + Join(Join), +} + +/// A flat atom pattern, one entry per column of the target relation. +/// Matches the `toFlatArgs` view used by `Geolog.DB.InMemory.evalAtom`: +/// `qaValues` positions are filled in directly, and the entity-id column +/// (if any) appears at the last position. Wildcard positions in the +/// Haskell `QAtom` (a `Map Int QVal` with a missing key) translate to a +/// fresh, unique variable name on this side, which the operator binds but +/// never joins against. +#[derive(Debug, Clone, Deserialize)] +pub struct Atom { + pub table: String, + pub columns: Vec, +} + +#[derive(Debug, Clone, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum JsonTerm { + Var(String), + Lit(JsonValue), +} + +/// Wire-level value tag. Restricted to what +/// [`storage::value::Value`](storage::value::Value) carries. Entity identities from +/// the Haskell side (`ValEntity path id`) round-trip through `Str` using a +/// `"path:id"` convention; that's a fixture concern, not a runner concern. 
+#[derive(Debug, Clone, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum JsonValue { + Int(i64), + Str(String), +} + +#[derive(Debug, Clone, Deserialize)] +pub struct Join { + pub op: JoinOp, + pub left: u32, + pub right: u32, +} + +#[derive(Debug, Clone, Copy, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum JoinOp { + /// `Geolog.DB.Plan.LeftJoin`: result is `left` rows whose shared columns + /// appear in `right`. Lowered to `semijoin(left, right)`. + Left, + /// `Geolog.DB.Plan.RightJoin`: result is `right` rows whose shared + /// columns appear in `left`. Lowered to `semijoin(right, left)`. + Right, + /// `Geolog.DB.Plan.NaturalJoin`. Lowered to `natural_join(left, right)`. + Natural, +} + +/// Errors produced by [`verify`] when actual bindings don't match the +/// scenario's `expected_bindings` projection. +#[derive(Debug)] +pub enum VerifyError { + /// An expected column wasn't produced by the plan. + MissingColumn(String), + /// An expected row's width didn't match the column count. + ExpectedRowArity { expected: usize, got: usize }, + /// The expected and actual rows (after projection) differ as multisets. + BindingsMismatch { + expected: Vec>, + actual: Vec>, + }, +} + +impl std::fmt::Display for VerifyError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::MissingColumn(name) => { + write!(f, "expected column {name:?} not in plan output") + } + Self::ExpectedRowArity { expected, got } => write!( + f, + "expected row has {got} cells but columns has {expected} entries" + ), + Self::BindingsMismatch { expected, actual } => write!( + f, + "bindings mismatch:\n expected: {expected:?}\n actual: {actual:?}" + ), + } + } +} + +impl std::error::Error for VerifyError {} + +/// Cross-check an executed [`Relation`] against a [`Plan`]'s +/// `expected_bindings`. 
Projects `actual` to the expected columns (so the +/// runner is free to surface wildcard columns the oracle doesn't name) and +/// compares as a multiset. +/// +/// Returns `Ok(true)` if the plan carried an oracle and it matched, +/// `Ok(false)` if there was no oracle (caller decides whether that's an +/// error). Returns [`VerifyError`] on mismatch. +/// +/// # Errors +/// See [`VerifyError`]. +pub fn verify(plan: &Plan, actual: &Relation) -> Result { + let Some(expected) = &plan.expected_bindings else { + return Ok(false); + }; + let mut projection: Vec = Vec::with_capacity(expected.columns.len()); + for col in &expected.columns { + let idx = actual + .columns + .iter() + .position(|c| c == col) + .ok_or_else(|| VerifyError::MissingColumn(col.clone()))?; + projection.push(idx); + } + let mut actual_proj: Vec> = actual + .rows + .iter() + .map(|row| projection.iter().map(|&i| row[i].clone()).collect()) + .collect(); + let mut expected_proj: Vec> = Vec::with_capacity(expected.rows.len()); + for row in &expected.rows { + if row.len() != expected.columns.len() { + return Err(VerifyError::ExpectedRowArity { + expected: expected.columns.len(), + got: row.len(), + }); + } + expected_proj.push(row.iter().cloned().map(Value::from).collect()); + } + // Value is not Ord; use Debug-derived sort keys to compare as a multiset. + let key = |row: &[Value]| -> String { format!("{row:?}") }; + actual_proj.sort_by_key(|r| key(r)); + expected_proj.sort_by_key(|r| key(r)); + if actual_proj == expected_proj { + Ok(true) + } else { + Err(VerifyError::BindingsMismatch { + expected: expected_proj, + actual: actual_proj, + }) + } +} + +/// Errors a runner can produce during plan validation and execution. +#[derive(Debug)] +pub enum RunError { + /// A fact or scan references a relation that isn't declared in `schema`. + UnknownRelation(String), + /// A scan refers to a table that wasn't supplied in the input map. 
+ MissingTable(String), + /// A fact row's length doesn't match the schema's declared arity. + ArityMismatch { + relation: String, + expected: usize, + got: usize, + }, + /// A scan's atom pattern doesn't match the table's arity. + PatternArityMismatch { + table: String, + table_arity: usize, + pattern_arity: usize, + }, + /// A join node references a node id that doesn't exist. + MissingNode(u32), + /// `Query.root` doesn't match any node in `nodes`. + MissingRoot(u32), + /// Two nodes share the same id. + DuplicateNode(u32), + /// A join node references its left or right side before that side has + /// been computed: the DAG isn't actually topologically sorted by id, or + /// it has a cycle. + UnresolvedDependency { node: u32, depends_on: u32 }, + /// A [`Storage`] backend used to materialize tables returned an error. + Storage(StorageError), +} + +impl From for RunError { + fn from(err: StorageError) -> Self { + Self::Storage(err) + } +} + +impl std::fmt::Display for RunError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::UnknownRelation(name) => { + write!(f, "facts reference relation {name:?} not in schema") + } + Self::MissingTable(name) => write!(f, "scan references missing table {name:?}"), + Self::ArityMismatch { + relation, + expected, + got, + } => write!( + f, + "relation {relation:?}: row arity {got} differs from schema arity {expected}" + ), + Self::PatternArityMismatch { + table, + table_arity, + pattern_arity, + } => write!( + f, + "scan of {table:?}: pattern has {pattern_arity} columns, table has {table_arity}" + ), + Self::MissingNode(id) => write!(f, "plan references missing node id {id}"), + Self::MissingRoot(id) => write!(f, "plan root id {id} matches no node"), + Self::DuplicateNode(id) => write!(f, "duplicate node id {id} in plan"), + Self::UnresolvedDependency { node, depends_on } => write!( + f, + "node {node} depends on {depends_on}, which has not been computed yet" + ), + Self::Storage(err) 
=> write!(f, "storage backend error: {err}"), + } + } +} + +impl std::error::Error for RunError {} + +impl From for Value { + fn from(jv: JsonValue) -> Self { + match jv { + JsonValue::Int(n) => Self::Int(n), + JsonValue::Str(s) => Self::Str(s), + } + } +} + +impl From for Term { + fn from(t: JsonTerm) -> Self { + match t { + JsonTerm::Var(name) => Self::Var(name), + JsonTerm::Lit(value) => Self::Lit(value.into()), + } + } +} + +/// Parse a [`Plan`] from a JSON string. +/// +/// # Errors +/// Returns a [`serde_json::Error`] if the input isn't valid JSON in the +/// expected shape. +pub fn parse_plan(json: &str) -> Result { + serde_json::from_str(json) +} + +/// Build the input [`Table`] for each relation declared in a [`Plan`]'s +/// schema, populating rows from the plan's `facts` map. Relations with no +/// facts get an empty table at the declared arity. +/// +/// # Errors +/// Returns [`RunError::UnknownRelation`] if `facts` mentions a relation +/// not in `schema`, or [`RunError::ArityMismatch`] if a row's width doesn't +/// match the declared arity. +pub fn build_tables(plan: &Plan) -> Result, RunError> { + let mut tables: HashMap = plan + .schema + .iter() + .map(|(name, arity)| (name.clone(), Table::new(*arity))) + .collect(); + for (name, rows) in &plan.facts { + let Some(table) = tables.get_mut(name) else { + return Err(RunError::UnknownRelation(name.clone())); + }; + for row in rows { + if row.len() != table.arity { + return Err(RunError::ArityMismatch { + relation: name.clone(), + expected: table.arity, + got: row.len(), + }); + } + let cells: Vec = row.iter().cloned().map(Value::from).collect(); + table.push(cells); + } + } + Ok(tables) +} + +/// Populate a [`Storage`] backend from a [`Plan`]'s schema and facts, then +/// materialize each declared relation back into an in-memory [`Table`] via +/// [`scan_as_table`]. The returned map is the same shape [`execute`] +/// consumes, so this is the storage-backed analogue of [`build_tables`]. 
+/// +/// Adding a new backend means constructing a different `S` at the call +/// site; the body here doesn't need to change. +/// +/// # Errors +/// Returns [`RunError::UnknownRelation`] or [`RunError::ArityMismatch`] on +/// the same conditions as [`build_tables`], or [`RunError::Storage`] when +/// the backend itself rejects an operation. +pub fn build_tables_via_storage( + plan: &Plan, + storage: &mut S, +) -> Result, RunError> { + for (name, arity) in &plan.schema { + storage.create_relation(name, *arity)?; + } + { + let mut tx = storage.transaction()?; + for (name, rows) in &plan.facts { + let Some(&arity) = plan.schema.get(name) else { + return Err(RunError::UnknownRelation(name.clone())); + }; + for row in rows { + if row.len() != arity { + return Err(RunError::ArityMismatch { + relation: name.clone(), + expected: arity, + got: row.len(), + }); + } + let cells: Vec = row.iter().cloned().map(Value::from).collect(); + tx.insert(name, cells)?; + } + } + tx.commit()?; + } + let mut tables: HashMap = HashMap::with_capacity(plan.schema.len()); + for name in plan.schema.keys() { + let table = scan_as_table(storage as &dyn Storage, name)?; + tables.insert(name.clone(), table); + } + Ok(tables) +} + +/// Execute a query DAG against the supplied input tables, returning the +/// bindings [`Relation`] for the rooted plan node. +/// +/// Nodes are executed in ascending `id` order. For a Yannakakis plan as +/// emitted by `Geolog.DB.Plan` this is equivalent to a topological sort, +/// since `insertJoin` only references node ids that have already been +/// allocated. A non-monotone id ordering is rejected with +/// [`RunError::UnresolvedDependency`]. 
+/// +/// # Errors +/// Returns [`RunError::DuplicateNode`] for repeated ids, +/// [`RunError::MissingNode`] for join references to unknown ids, +/// [`RunError::MissingRoot`] if `query.root` isn't present, +/// [`RunError::MissingTable`] if a scan references a table not in the map, +/// or [`RunError::PatternArityMismatch`] if a scan's pattern doesn't match +/// the table's arity. +pub fn execute( + tables: &HashMap, + query: &Query, +) -> Result { + let mut seen_ids: std::collections::HashSet = + std::collections::HashSet::with_capacity(query.nodes.len()); + for node in &query.nodes { + if !seen_ids.insert(node.id) { + return Err(RunError::DuplicateNode(node.id)); + } + } + if !seen_ids.contains(&query.root) { + return Err(RunError::MissingRoot(query.root)); + } + + let mut ordered: Vec<&Node> = query.nodes.iter().collect(); + ordered.sort_by_key(|n| n.id); + + let mut results: HashMap = HashMap::with_capacity(ordered.len()); + for node in ordered { + let computed = match &node.action { + Action::Scan(atom) => { + let table = tables + .get(&atom.table) + .ok_or_else(|| RunError::MissingTable(atom.table.clone()))?; + if atom.columns.len() != table.arity { + return Err(RunError::PatternArityMismatch { + table: atom.table.clone(), + table_arity: table.arity, + pattern_arity: atom.columns.len(), + }); + } + let pattern = AtomPattern { + columns: atom.columns.iter().cloned().map(Term::from).collect(), + }; + scan_atom(table, &pattern) + } + Action::Join(join) => { + let left = require_dep(&results, &seen_ids, node.id, join.left)?; + let right = require_dep(&results, &seen_ids, node.id, join.right)?; + match join.op { + JoinOp::Left => semijoin(left, right), + JoinOp::Right => semijoin(right, left), + JoinOp::Natural => natural_join(left, right), + } + } + }; + results.insert(node.id, computed); + } + + results + .remove(&query.root) + .ok_or(RunError::MissingRoot(query.root)) +} + +fn require_dep<'a>( + results: &'a HashMap, + seen: &std::collections::HashSet, + node: 
u32, + depends_on: u32, +) -> Result<&'a Relation, RunError> { + if let Some(rel) = results.get(&depends_on) { + Ok(rel) + } else if seen.contains(&depends_on) { + Err(RunError::UnresolvedDependency { node, depends_on }) + } else { + Err(RunError::MissingNode(depends_on)) + } +} + +/// Convenience: parse JSON, build tables from the embedded facts, and +/// execute, returning the root binding relation. Equivalent to +/// `parse_plan` + [`build_tables`] + [`execute`]. +/// +/// # Errors +/// Returns a JSON parse error if the input is malformed, or a [`RunError`] +/// for any later step. +pub fn run_json(json: &str) -> Result { + let plan = parse_plan(json).map_err(RunFromJsonError::Parse)?; + let tables = build_tables(&plan).map_err(RunFromJsonError::Run)?; + let bindings = execute(&tables, &plan.query).map_err(RunFromJsonError::Run)?; + Ok(bindings) +} + +/// Combined error from [`run_json`]. +#[derive(Debug)] +pub enum RunFromJsonError { + Parse(serde_json::Error), + Run(RunError), +} + +impl std::fmt::Display for RunFromJsonError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Parse(err) => write!(f, "parse error: {err}"), + Self::Run(err) => write!(f, "run error: {err}"), + } + } +} + +impl std::error::Error for RunFromJsonError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match self { + Self::Parse(err) => Some(err), + Self::Run(err) => Some(err), + } + } +} diff --git a/crates/plan-runner/src/main.rs b/crates/plan-runner/src/main.rs new file mode 100644 index 0000000..c76b440 --- /dev/null +++ b/crates/plan-runner/src/main.rs @@ -0,0 +1,225 @@ +//! `plan-run` CLI: read a JSON plan from a file (or stdin if `-`), execute +//! it against the chosen backend, and print the resulting binding relation +//! as JSON on stdout. +//! +//! Backends: +//! +//! - `memory` (default): build tables straight from the plan's `facts` +//! block, no `Storage` trait involved. Pure in-memory path. +//! 
- `memory-storage`: load the same facts through `storage::MemoryStorage` +//! via the `Storage` trait, then materialize tables back out with +//! `scan_as_table` before executing. +//! - `lmdb`, `redb`, `fjall`, `sqlite`: file-backed `Storage` adapters. +//! Each invocation creates a fresh tempdir for the store and drops it on +//! exit; the runner is one-shot, so persistent paths aren't needed. +//! - `geomerge`: CRDT-backed adapter. Constructed in-memory; alpha-status +//! upstream. + +use std::collections::HashMap; +use std::io::{self, Read}; +use std::process::ExitCode; + +use plan_runner::{JsonValue, Plan, build_tables, build_tables_via_storage, execute, parse_plan}; +use storage::MemoryStorage; +use storage::adapters::fjall::FjallStorage; +use storage::adapters::geomerge::{ColumnKind, GeomergeStorage}; +use storage::adapters::lmdb::LmdbStorage; +use storage::adapters::redb::RedbStorage; +use storage::adapters::sqlite::SqliteStorage; +use storage::table::Table; +use storage::value::Value; +use tempfile::TempDir; + +#[derive(Debug, Clone, Copy)] +enum Backend { + Memory, + MemoryStorage, + Lmdb, + Redb, + Fjall, + Sqlite, + Geomerge, +} + +impl Backend { + fn parse(s: &str) -> Option { + match s { + "memory" => Some(Self::Memory), + "memory-storage" => Some(Self::MemoryStorage), + "lmdb" => Some(Self::Lmdb), + "redb" => Some(Self::Redb), + "fjall" => Some(Self::Fjall), + "sqlite" => Some(Self::Sqlite), + "geomerge" => Some(Self::Geomerge), + _ => None, + } + } +} + +const BACKENDS_HELP: &str = "memory|memory-storage|lmdb|redb|fjall|sqlite|geomerge"; + +fn main() -> ExitCode { + let mut backend = Backend::Memory; + let mut input_path: Option = None; + let mut args = std::env::args().skip(1); + while let Some(arg) = args.next() { + match arg.as_str() { + "--backend" => { + let Some(value) = args.next() else { + eprintln!("--backend requires a value ({BACKENDS_HELP})"); + return ExitCode::from(2); + }; + let Some(parsed) = Backend::parse(&value) else { + 
eprintln!("unknown backend {value:?} (try {BACKENDS_HELP})"); + return ExitCode::from(2); + }; + backend = parsed; + } + other if input_path.is_none() => input_path = Some(other.to_string()), + other => { + eprintln!("unexpected argument: {other}"); + return ExitCode::from(2); + } + } + } + let Some(path) = input_path else { + eprintln!("usage: plan-run [--backend {BACKENDS_HELP}] "); + return ExitCode::from(2); + }; + + let input = match read_input(&path) { + Ok(s) => s, + Err(err) => { + eprintln!("failed to read {path}: {err}"); + return ExitCode::from(1); + } + }; + + let plan = match parse_plan(&input) { + Ok(p) => p, + Err(err) => { + eprintln!("parse error: {err}"); + return ExitCode::from(1); + } + }; + + let tables = match build_tables_for(&plan, backend) { + Ok(t) => t, + Err(err) => { + eprintln!("{err}"); + return ExitCode::from(1); + } + }; + + let relation = match execute(&tables, &plan.query) { + Ok(r) => r, + Err(err) => { + eprintln!("execute error: {err}"); + return ExitCode::from(1); + } + }; + + let payload = serde_json::json!({ + "columns": relation.columns, + "rows": relation + .rows + .iter() + .map(|row| row.iter().map(value_to_json).collect::>()) + .collect::>(), + }); + println!("{payload}"); + ExitCode::SUCCESS +} + +/// Build the input tables for `plan` using `backend`. Path-based adapters +/// allocate a fresh tempdir; it drops at the end of this function, which is +/// safe because `build_tables_via_storage` fully materializes the tables +/// into owned `Vec` before returning. 
+fn build_tables_for(plan: &Plan, backend: Backend) -> Result<HashMap<String, Table>, String> {
+    // Every error is flattened to a `String` that names the backend, because
+    // `main` only prints it and exits.
+    match backend {
+        Backend::Memory => build_tables(plan).map_err(|e| format!("build error: {e}")),
+        Backend::MemoryStorage => {
+            let mut storage = MemoryStorage::default();
+            build_tables_via_storage(plan, &mut storage)
+                .map_err(|e| format!("build error (memory-storage): {e}"))
+        }
+        Backend::Lmdb => {
+            let dir = TempDir::new().map_err(|e| format!("tempdir: {e}"))?;
+            let mut storage = LmdbStorage::open(dir.path())
+                .map_err(|e| format!("failed to open lmdb backend: {e}"))?;
+            build_tables_via_storage(plan, &mut storage)
+                .map_err(|e| format!("build error (lmdb): {e}"))
+        }
+        Backend::Redb => {
+            let dir = TempDir::new().map_err(|e| format!("tempdir: {e}"))?;
+            let mut storage = RedbStorage::open(dir.path().join("data.redb"))
+                .map_err(|e| format!("failed to open redb backend: {e}"))?;
+            build_tables_via_storage(plan, &mut storage)
+                .map_err(|e| format!("build error (redb): {e}"))
+        }
+        Backend::Fjall => {
+            let dir = TempDir::new().map_err(|e| format!("tempdir: {e}"))?;
+            let mut storage = FjallStorage::open(dir.path())
+                .map_err(|e| format!("failed to open fjall backend: {e}"))?;
+            build_tables_via_storage(plan, &mut storage)
+                .map_err(|e| format!("build error (fjall): {e}"))
+        }
+        Backend::Sqlite => {
+            let dir = TempDir::new().map_err(|e| format!("tempdir: {e}"))?;
+            let mut storage = SqliteStorage::open(dir.path().join("data.sqlite"))
+                .map_err(|e| format!("failed to open sqlite backend: {e}"))?;
+            build_tables_via_storage(plan, &mut storage)
+                .map_err(|e| format!("build error (sqlite): {e}"))
+        }
+        Backend::Geomerge => {
+            // No tempdir here: the store is built from a theory synthesized
+            // out of the plan's schema plus per-column kind guesses.
+            let relations = plan
+                .schema
+                .iter()
+                .map(|(name, &arity)| (name.clone(), infer_column_kinds(plan, name, arity)));
+            let mut storage = GeomergeStorage::with_relations(relations)
+                .map_err(|e| format!("failed to open geomerge backend: {e}"))?;
+            build_tables_via_storage(plan, &mut storage)
+                .map_err(|e| format!("build error (geomerge): {e}"))
+        }
+    }
+}
+
+/// Best-effort column type inference for `geomerge`'s synthesized theory.
+/// The runner IR carries only arity, so we peek at the first fact row of
+/// the relation. Columns without a sample default to `String`, which
+/// matches every checked-in fixture (entity identities are encoded as
+/// strings by the exporter).
+fn infer_column_kinds(plan: &Plan, name: &str, arity: usize) -> Vec<ColumnKind> {
+    let mut kinds = vec![ColumnKind::String; arity];
+    let Some(rows) = plan.facts.get(name) else {
+        return kinds;
+    };
+    let Some(first) = rows.first() else {
+        return kinds;
+    };
+    // `zip` stops at the shorter side: cells beyond `arity` are ignored and
+    // columns the sample row lacks keep the `String` default.
+    for (kind, cell) in kinds.iter_mut().zip(first.iter()) {
+        *kind = match cell {
+            JsonValue::Int(_) => ColumnKind::Int,
+            JsonValue::Str(_) => ColumnKind::String,
+        };
+    }
+    kinds
+}
+
+/// Read the whole plan text from `path`, or from stdin when `path` is `-`.
+fn read_input(path: &str) -> io::Result<String> {
+    if path == "-" {
+        let mut buf = String::new();
+        io::stdin().read_to_string(&mut buf)?;
+        Ok(buf)
+    } else {
+        std::fs::read_to_string(path)
+    }
+}
+
+/// Convert a storage cell to JSON; `Id` is rendered through `to_string`.
+fn value_to_json(value: &Value) -> serde_json::Value {
+    match value {
+        Value::Int(n) => serde_json::Value::Number((*n).into()),
+        Value::Str(s) => serde_json::Value::String(s.clone()),
+        Value::Id(id) => serde_json::Value::String(id.to_string()),
+    }
+}
diff --git a/crates/plan-runner/tests/examples.rs b/crates/plan-runner/tests/examples.rs
new file mode 100644
index 0000000..1128431
--- /dev/null
+++ b/crates/plan-runner/tests/examples.rs
@@ -0,0 +1,77 @@
+//! Walks every JSON fixture under `crates/plan-runner/fixtures/` and
+//! verifies it against the `expected_bindings` the exporter lifted from
+//! the matching `tools/exporter/examples/*.scenario.json`. A fixture without an oracle
+//! is reported as a failure (every checked-in fixture is expected to
+//! carry one).
+
+use std::collections::BTreeMap;
+use std::fs;
+use std::path::PathBuf;
+
+use plan_runner::{parse_plan, run_json, verify};
+
+/// Directory holding the checked-in fixtures, resolved against the crate
+/// manifest directory.
+fn fixtures_dir() -> PathBuf {
+    PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("fixtures")
+}
+
+/// Load every `*.json` file in [`fixtures_dir`], keyed by file stem. A
+/// `BTreeMap` keeps iteration, and therefore failure output, name-ordered.
+fn collect_fixtures() -> BTreeMap<String, String> {
+    let mut out = BTreeMap::new();
+    for entry in fs::read_dir(fixtures_dir()).expect("read fixtures/") {
+        let path = entry.expect("dir entry").path();
+        if path.extension().and_then(|e| e.to_str()) != Some("json") {
+            continue;
+        }
+        let name = path
+            .file_stem()
+            .and_then(|s| s.to_str())
+            // `to_str` fails on non-UTF-8 names, not merely non-ASCII ones.
+            .expect("utf-8 fixture name")
+            .to_string();
+        let contents = fs::read_to_string(&path).expect("read fixture");
+        out.insert(name, contents);
+    }
+    out
+}
+
+#[test]
+fn every_fixture_runs_and_matches_its_oracle() {
+    let fixtures = collect_fixtures();
+    assert!(
+        !fixtures.is_empty(),
+        "no fixtures found in {}",
+        fixtures_dir().display()
+    );
+
+    // Collect every failure instead of stopping at the first, so a single
+    // run reports all broken fixtures.
+    let mut failures: Vec<String> = Vec::new();
+    for (name, json) in &fixtures {
+        let plan = match parse_plan(json) {
+            Ok(p) => p,
+            Err(err) => {
+                failures.push(format!("{name}: parse error: {err}"));
+                continue;
+            }
+        };
+        if plan.expected_bindings.is_none() {
+            failures.push(format!("{name}: fixture has no expected_bindings"));
+            continue;
+        }
+        let relation = match run_json(json) {
+            Ok(r) => r,
+            Err(err) => {
+                failures.push(format!("{name}: run error: {err}"));
+                continue;
+            }
+        };
+        match verify(&plan, &relation) {
+            Ok(true) => {}
+            Ok(false) => failures.push(format!("{name}: verify returned no-oracle unexpectedly")),
+            Err(err) => failures.push(format!("{name}: {err}")),
+        }
+    }
+
+    assert!(
+        failures.is_empty(),
+        "{} fixture(s) failed:\n {}",
+        failures.len(),
+        failures.join("\n ")
+    );
+}
diff --git a/crates/plan-runner/tests/storage_roundtrip.rs b/crates/plan-runner/tests/storage_roundtrip.rs
new file mode 100644
index 0000000..ba2b37a
--- /dev/null
+++ b/crates/plan-runner/tests/storage_roundtrip.rs
@@ -0,0 +1,52 @@
+//!
Cross-checks the two paths [`plan-runner`] exposes for materializing +//! input tables: the pure [`build_tables`] path and the [`Storage`]-routed +//! [`build_tables_via_storage`] path. Same fixture, same plan, must agree +//! row-for-row. +//! +//! This is the visible proof of the layer boundary: any new `Storage` +//! backend (LMDB, fjall, geomerge) keeps this test honest by re-running it +//! with a different `S`. + +use plan_runner::{build_tables, build_tables_via_storage, execute, parse_plan, run_json}; +use storage::MemoryStorage; +use storage::value::Value; + +const FIXTURE: &str = include_str!("../fixtures/three_atom_chain.json"); + +#[test] +fn storage_backed_execution_matches_pure_path() { + let plan = parse_plan(FIXTURE).expect("parse plan"); + + let pure_tables = build_tables(&plan).expect("build_tables"); + let pure = execute(&pure_tables, &plan.query).expect("pure execute"); + + let mut storage = MemoryStorage::default(); + let storage_tables = + build_tables_via_storage(&plan, &mut storage).expect("build_tables_via_storage"); + let via_storage = execute(&storage_tables, &plan.query).expect("storage execute"); + + assert_eq!(pure.columns, via_storage.columns); + // Scan order between MemoryStorage and the direct-from-JSON path isn't + // required to match; compare rows as a multiset. `Value` is not `Ord` + // (it carries `RowId` and `String`), so use a Debug-derived sort key. 
+ assert_eq!(sorted_rows(&pure.rows), sorted_rows(&via_storage.rows)); +} + +#[test] +fn storage_backed_execution_matches_run_json_oracle() { + let plan = parse_plan(FIXTURE).expect("parse plan"); + let oracle = run_json(FIXTURE).expect("run_json"); + + let mut storage = MemoryStorage::default(); + let tables = build_tables_via_storage(&plan, &mut storage).expect("build_tables_via_storage"); + let via_storage = execute(&tables, &plan.query).expect("storage execute"); + + assert_eq!(oracle.columns, via_storage.columns); + assert_eq!(sorted_rows(&oracle.rows), sorted_rows(&via_storage.rows)); +} + +fn sorted_rows(rows: &[Vec]) -> Vec { + let mut keys: Vec = rows.iter().map(|r| format!("{r:?}")).collect(); + keys.sort(); + keys +} diff --git a/crates/query-ops/README.md b/crates/query-ops/README.md index bde49be..35494b3 100644 --- a/crates/query-ops/README.md +++ b/crates/query-ops/README.md @@ -121,7 +121,7 @@ How it works (logically):
- Types + Workflow
diff --git a/crates/query-ops/docs/diagrams/types.dot b/crates/query-ops/docs/diagrams/types.dot index 6b983a8..6a0b909 100644 --- a/crates/query-ops/docs/diagrams/types.dot +++ b/crates/query-ops/docs/diagrams/types.dot @@ -24,7 +24,7 @@ penwidth = 1.2 ] table_node [label = < - +
Table (struct)
Table (struct, from storage)
arity: usize
rows: Vec<Vec<Value>>
>, fillcolor = "#E8F4FD", color = "#2196F3"] @@ -47,7 +47,7 @@ term_node [label = <>, fillcolor = "#F3E5F5", color = "#9C27B0"] value_node [label = <
- + diff --git a/crates/query-ops/docs/diagrams/types.svg b/crates/query-ops/docs/diagrams/types.svg index 2a08f7d..ea3177c 100644 --- a/crates/query-ops/docs/diagrams/types.svg +++ b/crates/query-ops/docs/diagrams/types.svg @@ -1,147 +1,86 @@ + "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"> - - - QueryOpsTypes - - - - table_node - - Table - - -  (struct) - - - arity: usize - - rows: - Vec<Vec<Value>> - - - - - value_node - - Value - - -  (enum) - - - Int(i64) - - - Str(String) - - - Id(RowId) - - - - - table_node->value_node - - - Vec<Vec<Value>> - - - - - relation_node - - Relation - - -  (struct) - - - columns: Vec<String> - - - rows: Vec<Vec<Value>> - - - - - relation_node->value_node - - - Vec<Vec<Value>> - - - - - atom_pattern_node - - AtomPattern - - -  (struct) - - - columns: Vec<Term> - - - - - term_node - - Term - - -  (enum) - - - Var(String) - - - Lit(Value) - - - - - atom_pattern_node->term_node - - - Vec<Term> - - - - - term_node->value_node - - - Lit(Value) - - - + + +QueryOpsTypes + + + +table_node + +Table +  (struct, from storage) +arity: usize +rows: Vec<Vec<Value>> + + + +value_node + +Value +  (enum, from storage) +Int(i64) +Str(String) +Id(RowId) + + + +table_node->value_node + + +Vec<Vec<Value>> + + + +relation_node + +Relation +  (struct) +columns: Vec<String> +rows: Vec<Vec<Value>> + + + +relation_node->value_node + + +Vec<Vec<Value>> + + + +atom_pattern_node + +AtomPattern +  (struct) +columns: Vec<Term> + + + +term_node + +Term +  (enum) +Var(String) +Lit(Value) + + + +atom_pattern_node->term_node + + +Vec<Term> + + + +term_node->value_node + + +Lit(Value) + + diff --git a/crates/query-ops/docs/diagrams/workflow.svg b/crates/query-ops/docs/diagrams/workflow.svg index 56c7759..f02b646 100644 --- a/crates/query-ops/docs/diagrams/workflow.svg +++ b/crates/query-ops/docs/diagrams/workflow.svg @@ -1,257 +1,159 @@ + "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"> - - QueryOpsHandPlan - - - cluster_inputs - - Inputs (positional tables) 
- - - - cluster_atoms - - Atom Scans  (scan_atom: Table × AtomPattern → Relation) - - - - cluster_joins - - Joins  (shared cols = matching column names) - - - - cluster_output - - Output (binding relation) - - - - - author_table - - Table: author - - • - arity 2 - - • - rows: (name, book) - - - - - author_rel - - author_rel - - - pattern: [Var name, Var book] - - - cols: [name, book] - - - - - author_table->author_rel - - - - - - bestseller_table - - Table: bestseller - - • - arity 1 - - • - rows: (book) - - - - - bestseller_rel - - bestseller_rel - - - pattern: [Var book] - - - cols: [book] - - - - - bestseller_table->bestseller_rel - - - - - - price_table - - Table: price - - • - arity 2 - - • - rows: (book, dollars) - - - - - price_rel - - price_rel - - - pattern: [Var book, Var dollars] - - - cols: [book, dollars] - - - - - price_table->price_rel - - - - - - semijoin_step - - semijoin - - - authors of bestsellers - - - shared: book - - - cols: [name, book] - - - - - author_rel->semijoin_step - - - left - - - - - bestseller_rel->semijoin_step - - - right - - - - - natural_join_step - - natural_join - - - attach each book's price - - - shared: book - - - cols: [name, book, dollars] - - - - - price_rel->natural_join_step - - - right - - - - - semijoin_step->natural_join_step - - - left - - - - - result - - Q result - - - authors of bestsellers with each book's price - - - cols: [name, book, dollars] - - - - - natural_join_step->result - - - - + viewBox="0.00 0.00 1481.75 471.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"> + +QueryOpsHandPlan + + +cluster_inputs + +Inputs (positional tables) + + +cluster_atoms + +Atom Scans  (scan_atom: Table × AtomPattern → Relation) + + +cluster_joins + +Joins  (shared cols = matching column names) + + +cluster_output + +Output (binding relation) + + + +author_table + +Table: author +• arity 2 +• rows: (name, book) + + + +author_rel + +author_rel +pattern: [Var name, Var book] +cols: [name, 
book] + + + +author_table->author_rel + + + + + +bestseller_table + +Table: bestseller +• arity 1 +• rows: (book) + + + +bestseller_rel + +bestseller_rel +pattern: [Var book] +cols: [book] + + + +bestseller_table->bestseller_rel + + + + + +price_table + +Table: price +• arity 2 +• rows: (book, dollars) + + + +price_rel + +price_rel +pattern: [Var book, Var dollars] +cols: [book, dollars] + + + +price_table->price_rel + + + + + +semijoin_step + +semijoin +authors of bestsellers +shared: book +cols: [name, book] + + + +author_rel->semijoin_step + + +left + + + +bestseller_rel->semijoin_step + + +right + + + +natural_join_step + +natural_join +attach each book's price +shared: book +cols: [name, book, dollars] + + + +price_rel->natural_join_step + + +right + + + +semijoin_step->natural_join_step + + +left + + + +result + +Q result +authors of bestsellers with each book's price +cols: [name, book, dollars] + + + +natural_join_step->result + + + + diff --git a/crates/query-ops/src/lib.rs b/crates/query-ops/src/lib.rs index f06b4b1..ffee8b2 100644 --- a/crates/query-ops/src/lib.rs +++ b/crates/query-ops/src/lib.rs @@ -2,9 +2,9 @@ //! //! Three operators are in scope: //! -//! - [`atom::scan_atom`] scans a [`Table`](storage::table::Table) under -//! an [`atom::AtomPattern`], filtering for repeated-variable equality and -//! literal equality, and outputs a binding [`relation::Relation`]. +//! - [`atom::scan_atom`] scans a [`Table`] under an [`atom::AtomPattern`], +//! filtering for repeated-variable equality and literal equality, and +//! outputs a binding [`relation::Relation`]. //! - [`join::semijoin`] keeps rows of one relation whose shared-column values //! appear in another. //! - [`join::natural_join`] combines rows that agree on shared columns, @@ -14,10 +14,8 @@ //! is just an expression like //! `natural_join(&semijoin(&a, &b), &scan_atom(&t, &p))`. //! -//! Foundational types [`Value`](storage::value::Value) and -//! 
[`Table`](storage::table::Table) live in `storage`, the -//! storage-layer crate this crate is built on; storage backends produce -//! `Table`s that operators here consume. +//! `Value` and `Table` live in the `storage` crate; consumers that build +//! inputs depend on `storage` directly. pub mod atom; pub mod join; diff --git a/crates/storage/README.md b/crates/storage/README.md index ae8354d..c968acf 100644 --- a/crates/storage/README.md +++ b/crates/storage/README.md @@ -7,24 +7,25 @@ This crates helps with decoupling the query execution logic from the underlying ### Public API -| Item | Kind | Description | -|--------------------------------------------------------------------|---------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `Storage` | trait | Backend-agnostic interface for storing and retrieving rows. Required methods: `create_relation`, `arity`, `scan_iter`, and `transaction`. The rest (`scan`, `scan_where`, `insert`, `delete`) have default implementations. | -| `Transaction` | trait | Atomic batch of inserts and deletes against a `Storage`. `insert` returns a pending `RowId`; `commit` consumes the boxed transaction and returns a `CommittedTx`; dropping without committing rolls back. | -| `CommittedTx` | struct | Result of a successful `Transaction::commit`. Resolves pending `RowId`s returned during the transaction to their post-commit form via `resolve`. Empty for KV adapters where pending equals real; populated for `geomerge`. | -| `StorageError` | enum | Error type returned by every fallible method. Variants: `RelationNotFound`, `RelationExists`, `ArityMismatch`, `Validation`, `Decode`, `Unsupported`, and `Backend`. | -| `CodecError` | enum | Wire-format failure reported as `StorageError::Decode`. 
Variants describe truncation, unknown tags, length overruns, and UTF-8 errors. | -| `RowStream<'a>` | type alias | `Box), StorageError>> + 'a>`. The value yielded by `Storage::scan_iter` and `Storage::scan_where`. | -| `RowId` | struct | Opaque, backend-assigned row identifier. Bytes are inline up to 36 bytes (covers every encoding the workspace produces today) and spill to the heap otherwise. Construct with `RowId::new(bytes)` or `RowId::from(u64)`. | -| `Value` | enum | Single cell value. Variants: `Int(i64)`, `Str(String)`, and `Id(RowId)`. `Value::Id` is the foreign-key reference used by `geomerge` and any future referencing backend. | -| `Table` | struct | Positional input relation with fixed arity. Produced from a backend scan by `scan_as_table`. Consumed by `query-ops` operators. | -| `scan_as_table(&dyn Storage, &str) -> Result` | function | Materialize a relation from a `Storage` backend into a `Table` for query-language operators. Row IDs are dropped; only cell values remain. | -| `MemoryStorage` | struct | In-process backend kept in `HashMap`. Always available; useful for tests and snapshot oracles. | -| `adapters::sqlite::SqliteStorage` | struct (feat) | `SQLite`-backed `Storage`, behind the `sqlite` feature. Uses `rusqlite` with bundled libsqlite3; supports a single connection with native write transactions. | -| `adapters::redb::RedbStorage` | struct (feat) | Single-file B-tree backed `Storage`, behind the `redb` feature. Wraps `redb::WriteTransaction` for native atomic commits. | -| `adapters::fjall::FjallStorage` | struct (feat) | LSM-tree backed `Storage`, behind the `fjall` feature. Each relation gets a partition; transactions buffer inserts and apply them on commit. | -| `adapters::lmdb::LmdbStorage` | struct (feat) | mmap'd B-tree backed `Storage`, behind the `lmdb` feature. Wraps `heed`'s `RwTxn` for native atomic commits. 
| -| `adapters::geomerge::GeomergeStorage` | struct (feat) | CRDT-backed `Storage` over the workspace's `geomerge` crate, behind the `geomerge` feature. Wraps `geomerge::Transaction` and resolves pending row IDs via `CommittedTx`. Deletion is not supported (append-only log). | +| Item | Kind | Description | +|--------------------------------------------------------------------|---------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `Storage` | trait | Backend-agnostic interface for storing and retrieving rows. Required methods: `create_relation`, `arity`, `scan_iter`, and `transaction`. The rest (`scan`, `scan_where`, `insert`, `delete`) have default implementations. | +| `Transaction` | trait | Atomic batch of inserts and deletes against a `Storage`. `insert` returns a pending `RowId`; `commit` consumes the boxed transaction and returns a `CommittedTx`; dropping without committing rolls back. | +| `CommittedTx` | struct | Result of a successful `Transaction::commit`. Resolves pending `RowId`s returned during the transaction to their post-commit form via `resolve`. Empty for KV adapters where pending equals real; populated for `geomerge`. | +| `StorageError` | enum | Error type returned by every fallible method. Variants: `RelationNotFound`, `RelationExists`, `ArityMismatch`, `Validation`, `Decode`, `Unsupported`, and `Backend`. | +| `CodecError` | enum | Wire-format failure reported as `StorageError::Decode`. Variants describe truncation, unknown tags, length overruns, and UTF-8 errors. | +| `RowStream<'a>` | type alias | `Box), StorageError>> + 'a>`. The value yielded by `Storage::scan_iter` and `Storage::scan_where`. 
| +| `RowId` | struct | Opaque, backend-assigned row identifier. Bytes are inline up to 36 bytes (covers every encoding the workspace produces today) and spill to the heap otherwise. Construct with `RowId::new(bytes)` or `RowId::from(u64)`. | +| `Value` | enum | Single cell value. Variants: `Int(i64)`, `Str(String)`, and `Id(RowId)`. `Value::Id` is the foreign-key reference used by `geomerge` and any future referencing backend. | +| `Table` | struct | Positional input relation with fixed arity. Produced from a backend scan by `scan_as_table`. Consumed by `query-ops` operators. | +| `scan_as_table(&dyn Storage, &str) -> Result` | function | Materialize a relation from a `Storage` backend into a `Table` for query-language operators. Row IDs are dropped; only cell values remain. | +| `MemoryStorage` | struct | In-process backend kept in `HashMap`. Always available; useful for tests and snapshot oracles. | +| `adapters::sqlite::SqliteStorage` | struct (feat) | `SQLite`-backed `Storage`, behind the `sqlite` feature. Uses `rusqlite` with bundled libsqlite3; supports a single connection with native write transactions. | +| `adapters::redb::RedbStorage` | struct (feat) | Single-file B-tree backed `Storage`, behind the `redb` feature. Wraps `redb::WriteTransaction` for native atomic commits. | +| `adapters::fjall::FjallStorage` | struct (feat) | LSM-tree backed `Storage`, behind the `fjall` feature. Each relation gets a partition; transactions buffer inserts and apply them on commit. | +| `adapters::lmdb::LmdbStorage` | struct (feat) | mmap'd B-tree backed `Storage`, behind the `lmdb` feature. Wraps `heed`'s `RwTxn` for native atomic commits. | +| `adapters::geomerge::GeomergeStorage` | struct (feat) | CRDT-backed `Storage` over the workspace's `geomerge` crate, behind the `geomerge` feature. Wraps `geomerge::Transaction` and resolves pending row IDs via `CommittedTx`. Deletion is not supported (append-only log). 
Construct with `from_theory`, `from_store`, or `with_relations` (synthesizes a theory from `(name, Vec)` for callers that lack a typed schema). | +| `adapters::geomerge::ColumnKind` | enum (feat) | Primitive column type fed to `GeomergeStorage::with_relations`: `Int` maps to geomerge `PrimInt`, `String` maps to `PrimString`. Exists so callers can synthesize a theory without depending on `geolog-lang::ir` directly. | Data types and their relationships: @@ -99,7 +100,8 @@ cargo test -p storage --all-features iterators. - **Atomic transactions.** For storage backends with write transactions support (LMDB, Redb, SQLite, and geomerge) we use their transaction API directly. - Adapters without native transaction support (MemoryStorage and Fjall) implement `Transaction` with an internal buffer of pending operations that are applied on `commit`. + Adapters without native transaction support (MemoryStorage and Fjall) implement `Transaction` with an internal buffer of pending operations that are + applied on `commit`. Note that dropping a transaction without calling `commit` rolls back any pending operations. - **Deletion support.** Most adapters implement `delete`. 
diff --git a/crates/storage/docs/diagrams/types.svg b/crates/storage/docs/diagrams/types.svg index 3177d12..3ff93a5 100644 --- a/crates/storage/docs/diagrams/types.svg +++ b/crates/storage/docs/diagrams/types.svg @@ -1,374 +1,200 @@ + "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"> - - StorageTypes - - - - storage_node - - Storage - - -  (trait) - - - create_relation(name, arity) - - - arity(name) - - - scan_iter(name) -> RowStream - - - scan(name) -> Vec<(RowId, Vec<Value>)> - - - scan_where(name, col, value) - - - transaction() -> Box<dyn Transaction> - - - insert(name, row) -> RowId - - - delete(name, id) - - - - - transaction_node - - Transaction - - -  (trait) - - - insert(name, row) -> RowId - - - delete(name, id) - - - commit() -> CommittedTx - - - - - storage_node->transaction_node - - - transaction() yields - - - - - row_stream_node - - RowStream<'a> - - -  (type alias) - - - Box<dyn Iterator<Item = - - -  Result<(RowId, Vec<Value>), StorageError> - - - > + 'a> - - - - - storage_node->row_stream_node - - - scan_iter yields - - - - - committed_tx_node - - CommittedTx - - -  (struct) - - - resolutions: HashMap<RowId, RowId> - - - resolve(pending) -> RowId - - - - - transaction_node->committed_tx_node - - - commit() yields - - - - - row_id_node - - RowId - - -  (struct) - - - SmallVec<[u8; 36]> (opaque) - - - new(bytes), as_bytes(), from(u64) - - - - - transaction_node->row_id_node - - - insert() yields - - - - - committed_tx_node->row_id_node - - - resolve() yields - - - - - row_stream_node->row_id_node - - - Item = (RowId, _) - - - - - value_node - - Value - - -  (enum) - - - Int(i64) - - - Str(String) - - - Id(RowId) - - - - - row_stream_node->value_node - - - Item = (_, Vec<Value>) - - - - - value_node->row_id_node - - - Id(RowId) - - - - - table_node - - Table - - -  (struct) - - - arity: usize - - - rows: Vec<Vec<Value>> - - - - - table_node->value_node - - - Vec<Vec<Value>> - - - - - storage_error_node - - StorageError - - -  (enum) - - - 
RelationNotFound(String) - - - RelationExists(String) - - - ArityMismatch { expected, got } - - - Validation(String) - - - Decode(CodecError) - - - Unsupported(&'static str) - - - Backend(Box<dyn Error>) - - - - - codec_error_node - - CodecError - - -  (enum) - - - UnexpectedEof - - - UnknownTag(u8) - - - LengthOverrun { declared, available } - - - InvalidUtf8 - - - - - storage_error_node->codec_error_node - - - Decode(CodecError) - - - - - adapters_node - - Adapters - - -  (impl Storage) - - - MemoryStorage - - - SqliteStorage  (feat sqlite) - - - RedbStorage  (feat redb) - - - FjallStorage  (feat fjall) - - - LmdbStorage  (feat lmdb) - - - GeomergeStorage  (feat geomerge) - - - - - adapters_node->storage_node - - - impl - - - + viewBox="0.00 0.00 835.88 1114.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"> + +StorageTypes + + + +storage_node + +Storage +  (trait) +create_relation(name, arity) +arity(name) +scan_iter(name) -> RowStream +scan(name) -> Vec<(RowId, Vec<Value>)> +scan_where(name, col, value) +transaction() -> Box<dyn Transaction> +insert(name, row) -> RowId +delete(name, id) + + + +transaction_node + +Transaction +  (trait) +insert(name, row) -> RowId +delete(name, id) +commit() -> CommittedTx + + + +storage_node->transaction_node + + +transaction() yields + + + +row_stream_node + +RowStream<'a> +  (type alias) +Box<dyn Iterator<Item = +  Result<(RowId, Vec<Value>), StorageError> +> + 'a> + + + +storage_node->row_stream_node + + +scan_iter yields + + + +committed_tx_node + +CommittedTx +  (struct) +resolutions: HashMap<RowId, RowId> +resolve(pending) -> RowId + + + +transaction_node->committed_tx_node + + +commit() yields + + + +row_id_node + +RowId +  (struct) +SmallVec<[u8; 36]> (opaque) +new(bytes), as_bytes(), from(u64) + + + +transaction_node->row_id_node + + +insert() yields + + + +committed_tx_node->row_id_node + + +resolve() yields + + + +row_stream_node->row_id_node + + +Item = (RowId, _) + + + +value_node + 
+Value +  (enum) +Int(i64) +Str(String) +Id(RowId) + + + +row_stream_node->value_node + + +Item = (_, Vec<Value>) + + + +value_node->row_id_node + + +Id(RowId) + + + +table_node + +Table +  (struct) +arity: usize +rows: Vec<Vec<Value>> + + + +table_node->value_node + + +Vec<Vec<Value>> + + + +storage_error_node + +StorageError +  (enum) +RelationNotFound(String) +RelationExists(String) +ArityMismatch { expected, got } +Validation(String) +Decode(CodecError) +Unsupported(&'static str) +Backend(Box<dyn Error>) + + + +codec_error_node + +CodecError +  (enum) +UnexpectedEof +UnknownTag(u8) +LengthOverrun { declared, available } +InvalidUtf8 + + + +storage_error_node->codec_error_node + + +Decode(CodecError) + + + +adapters_node + +Adapters +  (impl Storage) +MemoryStorage +SqliteStorage  (feat sqlite) +RedbStorage  (feat redb) +FjallStorage  (feat fjall) +LmdbStorage  (feat lmdb) +GeomergeStorage  (feat geomerge) + + + +adapters_node->storage_node + + +impl + + diff --git a/crates/storage/docs/diagrams/workflow.svg b/crates/storage/docs/diagrams/workflow.svg index c7f3f41..c40cae3 100644 --- a/crates/storage/docs/diagrams/workflow.svg +++ b/crates/storage/docs/diagrams/workflow.svg @@ -1,359 +1,226 @@ + "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"> - - StorageWorkflow - - - cluster_inputs - - Inputs - - - - cluster_setup - - Setup  (open backend, declare relations) - - - - cluster_write - - Write  (atomic batch via Transaction) - - - - cluster_read - - Read - - - - cluster_output - - Output - - - - - schema - - Schema - - • - relation name - - • - arity (column count) - - - - - create_relation - - - storage.create_relation(name, arity) - - - - - schema->create_relation - - - - - - row_data - - Row Data - - • - Vec<Value> - - • - Int / Str / Id(RowId) - - - - - tx_ops - - tx.insert / tx.delete - - - • insert yields pending RowId - - - • pending RowIds reused as FKs - - - • delete by RowId - - - - - row_data->tx_ops - - - - - - open_backend - - Open Backend - - - 
MemoryStorage::new() / - - - SqliteStorage::open(path) / - - - FjallStorage::open(path) / ... - - - - - open_backend->create_relation - - - - - - begin_tx - - - storage.transaction() - - - -> Box<dyn Transaction> - - - - - create_relation->begin_tx - - - - - - scan_iter - - - storage.scan_iter(name) - - - -> RowStream - - - - - create_relation->scan_iter - - - - - - scan_where - - - storage.scan_where(name, col, value) - - - -> RowStream  (filtered) - - - - - create_relation->scan_where - - - - - - scan_full - - - storage.scan(name) - - - -> Vec<(RowId, Vec<Value>)> - - - - - create_relation->scan_full - - - - - - begin_tx->tx_ops - - - - - - commit - - tx.commit() - - • native commit (LMDB, redb, SQLite, geomerge) - - • buffered apply (memory, fjall) - - • law validation (geomerge) - - • yields CommittedTx - - - - - tx_ops->commit - - - - - - resolve_ids - - CommittedTx::resolve - - - • KV: pending == real - - - • geomerge: pending counter → (commit, counter) - - - - - commit->resolve_ids - - - CommittedTx - - - - - commit->scan_iter - - - after commit - - - - - rows_out - - Rows - - • (RowId, Vec<Value>) - - • consumed by query-ops - -  via scan_as_table - - - - - resolve_ids->rows_out - - - real RowIds - - - - - scan_iter->rows_out - - - - - - scan_where->rows_out - - - - - - scan_full->rows_out - - - - + viewBox="0.00 0.00 2195.75 573.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"> + +StorageWorkflow + + +cluster_inputs + +Inputs + + +cluster_setup + +Setup  (open backend, declare relations) + + +cluster_write + +Write  (atomic batch via Transaction) + + +cluster_read + +Read + + +cluster_output + +Output + + + +schema + +Schema +• relation name +• arity (column count) + + + +create_relation + +storage.create_relation(name, arity) + + + +schema->create_relation + + + + + +row_data + +Row Data +• Vec<Value> +• Int / Str / Id(RowId) + + + +tx_ops + +tx.insert / tx.delete +• insert yields pending RowId +• pending RowIds reused as 
FKs +• delete by RowId + + + +row_data->tx_ops + + + + + +open_backend + +Open Backend +MemoryStorage::new() / +SqliteStorage::open(path) / +FjallStorage::open(path) / ... + + + +open_backend->create_relation + + + + + +begin_tx + +storage.transaction() +-> Box<dyn Transaction> + + + +create_relation->begin_tx + + + + + +scan_iter + +storage.scan_iter(name) +-> RowStream + + + +create_relation->scan_iter + + + + + +scan_where + +storage.scan_where(name, col, value) +-> RowStream  (filtered) + + + +create_relation->scan_where + + + + + +scan_full + +storage.scan(name) +-> Vec<(RowId, Vec<Value>)> + + + +create_relation->scan_full + + + + + +begin_tx->tx_ops + + + + + +commit + +tx.commit() +• native commit (LMDB, redb, SQLite, geomerge) +• buffered apply (memory, fjall) +• law validation (geomerge) +• yields CommittedTx + + + +tx_ops->commit + + + + + +resolve_ids + +CommittedTx::resolve +• KV: pending == real +• geomerge: pending counter → (commit, counter) + + + +commit->resolve_ids + + +CommittedTx + + + +commit->scan_iter + + +after commit + + + +rows_out + +Rows +• (RowId, Vec<Value>) +• consumed by query-ops +  via scan_as_table + + + +resolve_ids->rows_out + + +real RowIds + + + +scan_iter->rows_out + + + + + +scan_where->rows_out + + + + + +scan_full->rows_out + + + + diff --git a/crates/storage/src/adapters/geomerge.rs b/crates/storage/src/adapters/geomerge.rs index 7539105..6109feb 100644 --- a/crates/storage/src/adapters/geomerge.rs +++ b/crates/storage/src/adapters/geomerge.rs @@ -93,6 +93,17 @@ fn decode_pending_row_id(bytes: &[u8]) -> Result { } /// Geomerge-backed [`Storage`] implementation. +/// Primitive column type used by [`GeomergeStorage::with_relations`] to +/// synthesize a theory from an untyped `(name, arity)` schema. Geomerge +/// supports `PrimInt`, `PrimString`, and entity types; only the two +/// primitives are exposed here, since callers using this constructor by +/// definition don't carry entity-target information. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ColumnKind { + Int, + String, +} + pub struct GeomergeStorage { store: Store, declared: HashSet, @@ -138,6 +149,52 @@ impl GeomergeStorage { } } + /// Build a store with a theory synthesized from a flat list of + /// `(relation_name, column_kinds)`. Each `ColumnKind` is mapped to the + /// matching `PrimType`. No entity columns and no laws are declared. + /// + /// This is the convenience constructor for callers (e.g., the + /// `plan-runner` CLI) whose schema only carries arity plus a + /// column-by-column primitive-type guess taken from a sample row. It exists so + /// those callers don't have to depend on `geolog-lang::ir` directly. + /// + /// # Errors + /// Returns [`StorageError::Backend`] if geomerge rejects the synthesized + /// theory. + pub fn with_relations(relations: I) -> Result + where + I: IntoIterator)>, + S: Into, + { + let tables: Vec = relations + .into_iter() + .map(|(name, kinds)| { + let columns = kinds + .into_iter() + .map(|k| ir::ColType::PrimType { + prim: match k { + ColumnKind::Int => ir::PrimType::PrimInt, + ColumnKind::String => ir::PrimType::PrimString, + }, + }) + .collect(); + let name: String = name.into(); + ir::TableEntry { + path: name.into(), + table: ir::Schema { + columns, + primary_key: None, + }, + } + }) + .collect(); + let theory = ir::FlatTheory { + tables, + laws: Vec::new(), + }; + Self::from_theory(theory) + } + /// Borrow the underlying geomerge store (for backend-specific operations /// like persistence, dump, or law inspection that aren't on the trait).
#[must_use] diff --git a/crates/storage/src/adapters/lmdb.rs b/crates/storage/src/adapters/lmdb.rs index 5fd8353..dd44267 100644 --- a/crates/storage/src/adapters/lmdb.rs +++ b/crates/storage/src/adapters/lmdb.rs @@ -154,12 +154,12 @@ impl Transaction for LmdbTx<'_> { let Some(wtxn) = self.wtxn.as_ref() else { unreachable!("transaction was already committed") }; - let raw = self + let encoded = self .meta .get(wtxn, name.as_bytes()) .map_err(backend)? .ok_or_else(|| StorageError::RelationNotFound(name.to_string()))?; - let entry = decode_meta(raw)?; + let entry = decode_meta(encoded)?; self.next_ids.insert(name.to_string(), entry); entry }; diff --git a/crates/storage/src/adapters/sqlite.rs b/crates/storage/src/adapters/sqlite.rs index a561000..5646048 100644 --- a/crates/storage/src/adapters/sqlite.rs +++ b/crates/storage/src/adapters/sqlite.rs @@ -1,4 +1,4 @@ -//! SQLite adapter via the `rusqlite` crate (bundled libsqlite3). +//! `SQLite` adapter via the `rusqlite` crate (bundled libsqlite3). //! //! Storage layout: //! @@ -35,13 +35,13 @@ CREATE TABLE IF NOT EXISTS __rows ( ); "; -/// SQLite-backed [`Storage`] implementation. +/// `SQLite`-backed [`Storage`] implementation. pub struct SqliteStorage { conn: Connection, } impl SqliteStorage { - /// Open or create a SQLite database at `path`. Pass `":memory:"` for + /// Open or create a `SQLite` database at `path`. Pass `":memory:"` for /// an in-process database (useful in tests). /// /// # Errors diff --git a/notes/adapter/01-geolog-branches.md b/notes/adapter/01-geolog-branches.md deleted file mode 100644 index 21cefb1..0000000 --- a/notes/adapter/01-geolog-branches.md +++ /dev/null @@ -1,60 +0,0 @@ -# Geolog Branch Focus - -Generated from the local `tmp/geolog` repository. Focus is inferred from branch names, tip commit messages, commits unique relative to `origin/main`, -and touched files. Branches are sorted by latest tip commit date, newest first. 
Git does not store branch creator metadata in refs, so -`First Unique Commit Author` is the author of the oldest commit unique to that branch relative to `origin/main`. - -| Branch | Latest Update | Last Change Author | First Unique Commit Author | Unique Commits Versus `origin/main` | Focus | -|---------------------------|---------------|--------------------|----------------------------|------------------------------------:|----------------------------------------------------------------------------------------------------------------------------------------------| -| `may-spring-cleaning` | 2026-05-21 | James Deikun | Owen Lynch | 4 | Core cleanup around conversion checking, evaluation parameters, readback, value syntax, and printing. | -| `wasm-diagrams-demo` | 2026-05-21 | George Thomas | Owen Lynch | 136 | Wasm diagrams demo with interactive query-plan diagrams, Graphviz and diagrams vendoring, web layout fixes, and live frontend work. | -| `query-plan-ir-draft-1` | 2026-05-21 | Patrick Aldis | Owen Lynch | 81 | Query plan IR draft with evaluation passing tests, diagram rendering, join orientation visualization, and REPL query diagrams. | -| `main` | 2026-05-21 | Owen Lynch | Unknown, no unique commits | 0 | Current mainline. Tip is a merge of `manual-deploy-fixup`. | -| `reports-2026-05-19` | 2026-05-19 | George Thomas | Owen Lynch | 8 | Team report files through May 19, 2026. | -| `query-plan-ir` | 2026-05-18 | Patrick Aldis | Owen Lynch | 64 | Query planning IR on top of the in-memory and IR integration branch, including register information and plan rendering. | -| `wasm` | 2026-05-15 | George Thomas | Owen Lynch | 69 | Wasm and web frontend work, including JSON output, Geolog Wasm library, Nix/Haskell build updates, and diagrams dependencies. | -| `post-cale-in-memory-ir2` | 2026-05-13 | Patrick Aldis | Owen Lynch | 60 | Integration branch combining in-memory and REPL cleanup with `ir-draft2`. 
| -| `wasm-dist` | 2026-05-12 | George Thomas | Owen Lynch | 68 | Wasm distribution branch for built assets and frontend output, based on the Wasm and in-memory work. | -| `reports-2026-05-12` | 2026-05-12 | felix | Owen Lynch | 7 | Team report files for late April and May 12, 2026. | -| `felix-db-2` | 2026-05-12 | felix | felix | 8 | Experimental Datalog database backend work, including in-memory and Orville SQL APIs, `applyRule`, and chase tests. | -| `energy` | 2026-05-05 | James Deikun | James Deikun | 3 | Energy or potential semantics in core operations, elaboration, notation, pretty printing, and related golden tests. | -| `ir-draft2-bands` | 2026-05-01 | felix | Owen Lynch | 21 | `ir-draft2` plus a band example and golden output showing lowering behavior. | -| `hexane` | 2026-05-01 | George Thomas | George Thomas | 9 | Hexane integration, Haskell FFI and Rust tooling, Geomerge Haskell wrapper, JSON tests, and IR cherry-picks. | -| `in-memory-devibed` | 2026-04-30 | Patrick Aldis | Owen Lynch | 57 | Cleanup of in-memory and REPL code, structured REPL errors, command and state modules, comments, and property tests. | -| `lowering-via-eval-quote` | 2026-04-29 | Owen Lynch | Owen Lynch | 7 | Alternative lowering approach via eval and quote, with SSA and graded-rig tests. | -| `in-memory` | 2026-04-28 | Patrick Aldis | Owen Lynch | 48 | In-memory database execution, DB plan modules, IR lowering, REPL support, and formatting cleanup. | -| `lower-to-js` | 2026-04-26 | Owen Lynch | Owen Lynch | 8 | TypeScript and JavaScript runtime lowering experiments, including graph-of-graphs compilation and relation validators. | -| `george/wip` | 2026-04-20 | Cale Gibbard | Owen Lynch | 96 | Broad work-in-progress branch covering Nix flakes, Reflex, examples, data partitioning, diagnostics, FNotation, and Geolog language changes. | -| `latex-dates` | 2026-04-20 | George Thomas | George Thomas | 2 | Nix shell and Shake fixes for LaTeX date generation in documentation builds. 
| -| `config-default` | 2026-04-17 | Shuntian Liu | Unknown, no unique commits | 0 | Configuration default fix, likely already merged or superseded by `main`. | -| `geomerge` | 2026-04-17 | Shuntian Liu | Unknown, no unique commits | 0 | Geomerge import or merge branch, likely already merged or superseded by `main`. | -| `warmup-ci` | 2026-04-17 | Owen Lynch | Owen Lynch | 8 | CI warmup work, including Cabal update steps and a freeze file for Shake. | -| `mcp` | 2026-04-09 | George Thomas | Owen Lynch | 56 | MCP server or tool experiment with REPL-like functionality, LaTeX source-reading tools, transcripts, and in-memory dependencies. | -| `ir-draft2` | 2026-04-09 | James Deikun | Owen Lynch | 20 | Second IR draft with equality-type support and updates across core, elaborator, IR, lowering, diagnostics, and tests. | -| `golden-tests-fix` | 2026-04-09 | Patrick Aldis | Patrick Aldis | 2 | Golden-test path fix so `tasty-golden` looks in the `test` directory for FNotation and Geolog. | -| `in-memory-demo` | 2026-04-07 | Felix Dilke | Owen Lynch | 38 | In-memory demo and REPL workflow, including `:schema`, dotted table names for insert and query, and REPL test scripts. | -| `ir-draft1-clean` | 2026-04-07 | Cale Gibbard | Owen Lynch | 22 | Cleaned and rebased version of `ir-draft1`, including explicit lowering context and local environment handling. | -| `lsp-demo` | 2026-04-02 | Patrick Aldis | Patrick Aldis | 2 | Demo-only LSP behavior change to hide fatal errors. | -| `ir-draft1` | 2026-04-01 | James Deikun | Owen Lynch | 16 | First IR lowering draft, with support for columns, totality-rule cleanup, core lowering, and golden tests. | -| `lsp` | 2026-03-31 | Patrick Aldis | Patrick Aldis | 28 | Geolog LSP development, diagnostics integration, analyzed buffer result types, and `lsp-test` coverage. | -| `benchmarks` | 2026-03-31 | felix | Unknown, no unique commits | 0 | Benchmark work that appears already merged or superseded by `main`. 
| -| `why-geolog` | 2026-03-31 | Martin Kleppmann | Unknown, no unique commits | 0 | Why-Geolog documentation or positioning notes, likely already merged or superseded by `main`. | -| `fnotation-release` | 2026-03-30 | Owen Lynch | Unknown, no unique commits | 0 | Documentation for an FNotation release, likely already merged or superseded by `main`. | -| `repl` | 2026-03-27 | George Thomas | George Thomas | 18 | REPL usability changes, including command renames, completions, declaration display, and load-to-source naming. | -| `reporting-refactor` | 2026-03-27 | Patrick Aldis | Patrick Aldis | 4 | Reporter abstraction cleanup, including contrafunctor support and lifted `reportIO`. | -| `parser-exceptions` | 2026-03-25 | Patrick Aldis | Patrick Aldis | 8 | Parser, lexer, and elaborator refactor to use `ExceptT` and pure reporting. | -| `fb3-fail` | 2026-03-24 | felix | Patrick Aldis | 6 | Failure-case fixtures and Nix setup around `fb3` parser and elaborator golden tests. | -| `edits-from-london` | 2026-03-24 | Owen Lynch | Owen Lynch | 2 | London meeting edits across core syntax, operations, elaboration, lexer, pretty printing, and linear-order tests. | -| `fix-off-by-one` | 2026-03-20 | Patrick Aldis | Patrick Aldis | 2 | Diagnostic source-position fix for an off-by-one error in `lineOf`. | -| `felix-db-2-cale-edits` | 2026-03-11 | Felix Dilke | Felix Dilke | 2 | Incomplete Cale edits making `InMDB` monadic via state in `felix-db`. | -| `cg/query-planning` | 2026-03-10 | Cale Gibbard | Unknown, no unique commits | 0 | Query planning work that appears already merged or superseded by `main`. | -| `ir-design-2` | 2026-03-06 | James Deikun | Unknown, no unique commits | 0 | Follow-up IR design document cleanup, likely already merged or superseded by `main`. | -| `observational` | 2026-02-24 | Owen Lynch | Owen Lynch | 16 | Observational checking and diagnostics refactor, including lexer, parser, elaborator diagnostics, and related documentation. 
| -| `ir-design-1` | 2026-02-23 | James Deikun | Unknown, no unique commits | 0 | IR vision-document draft, likely already merged or superseded by `main`. | -| `haddock-ci` | 2026-01-15 | Owen Lynch | Unknown, no unique commits | 0 | Haddock CI experiment, likely already merged or superseded by `main`. | -| `notation-refactor` | 2025-12-24 | Owen Lynch | Unknown, no unique commits | 0 | Notation refactor and output-name updates, likely already merged or superseded by `main`. | -| `tests` | 2025-12-19 | Owen Lynch | Owen Lynch | 2 | Prospectus documentation change clarifying theorem expectations. | - -## Change Log - -- 2026-05-22: Updated branch table with latest update dates, last-change authors, first unique commit authors, unique commit counts, and inferred focus notes. diff --git a/notes/adapter/02-geolog-lang-findings.md b/notes/adapter/02-geolog-lang-findings.md deleted file mode 100644 index b533c92..0000000 --- a/notes/adapter/02-geolog-lang-findings.md +++ /dev/null @@ -1,222 +0,0 @@ -# Geolog Lang Findings - -Source inspected: local reference checkout at `tmp/geolog/geolog-lang`. - -## Summary - -`geolog-lang` is a Haskell prototype for a Geolog source language. It parses `.glog` theory files, elaborates them into a typed core language, lowers a selected `Main` theory into a flat relational representation, and includes an in-memory relation engine with law checking and a conjunctive-query planner. - -For this repository, the useful reference is not the Haskell implementation itself. 
The useful reference is the shape of the pipeline: - -```text -Geolog theory --> typed core elaboration --> flat relational theory --> tables, generated laws, and validation queries --> in-memory execution and query planning experiments -``` - -## Package Shape - -The package exposes modules for: - -- core syntax, values, evaluation, quoting, and equality checking -- notation, lexer, parser configuration, and diagnostics -- elaboration from parsed notation into typed core entries -- lowering from the typed `Main` theory into a flat relational theory -- relational IR for tables, atoms, propositions, and laws -- an in-memory database and a conjunctive-query planner - -The test suite combines golden tests for `.glog` elaboration and lowering with unit and property tests for the in-memory database and planner. - -## Source Language Shape - -The examples use declarations such as: - -```text -theory Main := sig - Node : Set - Edge : Node -> Node -> Set -end -``` - -The notation supports `theory`, `def`, `set`, `let`, `open`, `import`, `Set`, `Prop`, `Int`, `String`, `Inductive`, `pure`, `init`, `->`, `*->`, and `=>`. - -The source language is closer to a dependent typed theory language than to ordinary Datalog. Relations, functions, records, equality, and inductive constructions are represented in the source and then selectively lowered. - -## Lowered Relational IR - -The lowered IR has these main concepts: - -- `Path`: dotted table or law names represented as path components -- `Table`: column types and an optional primary key -- `Atom`: table reference, optional row identity term, and positional value terms -- `Prop`: atom, equality, conjunction, or disjunction -- `Law`: universally quantified variables, an antecedent proposition, and a consequent proposition -- `FlatTheory`: maps of tables and laws - -For the graph example above, lowering produces a `Node` table, an `Edge` table, and an `Edge.foreignKeys` law. 
The law says that every edge row implies the existence of both endpoint node rows. - -This is directly relevant to Geomerge-style validation experiments. A high-level declaration can compile into maintained violation checks by turning generated laws into queries: - -```text -required_consequent(x) :- antecedent(x). -violation(x) :- required_consequent(x), not consequent(x). -``` - -## Generated Law Patterns - -Two generated law shapes matter for this playground: - -- foreign-key laws: rows in one table require referenced rows in another table -- totality laws: functional relations require an output for each valid input - -Foreign-key laws are generated from dependent arguments that reference prior sets or relations. Totality laws are generated for functional table shapes. - -The current lowering appears intentionally narrow. Unsupported shapes are reported or panic in prototype code rather than being treated as complete semantics. That is useful for this repo because early adapters should also make supported and unsupported cases explicit. - -## In-Memory Execution - -The in-memory database stores: - -- ground values: integers, text, and entity identities -- relations: set-valued tuples with secondary indexes by column and value -- bindings: rows of variable assignments - -Query evaluation supports atoms and conjunctions. Conjunctions are evaluated with natural join. Law checking evaluates a law antecedent to produce witness bindings, then checks the consequent for each witness. Violations carry the law path and witness values. - -This gives a simple snapshot oracle shape for validation experiments: - -```text -facts + generated law --> antecedent witnesses --> consequent checks --> violation rows -``` - -## Storage Engine - -`geolog-lang` uses a custom in-memory storage engine, not SQLite, Postgres, or another external database. The storage module is `Geolog.DB.InMemory`. 
- -The core storage structures are: - -- `DB`: schema, relations, and next entity IDs -- `Relation`: fixed-arity tuple set plus a secondary index -- `Val`: stored values, including integers, text, and entity identities -- `Bindings`: query result rows as variable-to-value maps - -The relation storage shape is: - -```text -Relation { - arity: tuple width, - members: set of tuples, - index: map from (column position, value) to matching tuples -} -``` - -Inserts add a tuple to the `members` set and update the secondary index for every `(column, value)` pair in the tuple. Atom evaluation uses literal terms to probe those secondary indexes, intersects candidate tuple sets when multiple literal constraints are present, and then produces variable bindings. - -Conjunction evaluation has two paths: - -- naive evaluation: evaluate each atom, then natural-join the resulting bindings -- planned evaluation: use `Geolog.DB.Plan` to build a semijoin-based join plan, then evaluate that plan against the same in-memory relations - -The checked insert path inserts into a temporary updated database and checks generated laws such as `foreignKeys`. If the law check finds violations, the updated database is not returned. This gives rollback-like behavior at the API boundary, but it is not a transactional storage engine with durable logs or multi-step rollback. - -## Storage API - -`geolog-lang` does not define a storage abstraction or backend trait. It directly uses the concrete in-memory API in `Geolog.DB.InMemory`. 
- -The minimal storage API it needs is: - -```text -schema registration: - fromTheory :: FlatTheory -> DB - -entity allocation: - freshEntity :: Path -> DB -> (Val, DB) - -writes: - insertRow :: Path -> [Val] -> DB -> DB - checkedInsertRow :: FlatTheory -> Path -> [Val] -> DB -> Either InsertError DB - -reads: - evalAtom :: DB -> QAtom -> Either DBError Bindings - evalConjunction :: DB -> [QAtom] -> Either DBError Bindings - -validation: - checkLaw :: DB -> Path -> Law -> Either DBError [LawViolation] -``` - -The planner also depends on planned conjunction evaluation: - -```text -planConjunction :: [QAtom] -> JoinPlan -evalRootedPlan :: DB -> JoinPlan -> Either DBError Bindings -evalConjunctionPlanned :: DB -> [QAtom] -> Either DBError Bindings -``` - -For this repository, an adapter-facing storage interface should probably provide: - -1. flat schema registration from lowered tables, -2. fixed-arity tuple insertion into named relations, -3. stable entity ID allocation per table path, -4. tuple lookup by table and positional constraints, -5. variable binding output for query atoms, -6. natural join and semijoin support, or enough scans and index probes for the planner to implement them, -7. generated-law checking with structured violations, and -8. checked insert behavior that rejects or rolls back invalid writes. - -This API is snapshot-oriented. It is not yet a durable transaction API, an incremental DBSP API, or a general SQL-like storage layer. - -## Query Planning - -The planner targets conjunctive queries over the in-memory database. It: - -1. evaluates each atom independently, -2. builds an atom intersection graph weighted by shared variables, -3. computes a maximum spanning forest, -4. performs bottom-up and top-down semijoin reduction, and -5. performs full joins along the forest. - -The tests compare planned conjunction evaluation against naive conjunction evaluation on generated graph workloads. 
This is a useful test pattern for this repository: every planned execution path should have a simple snapshot or naive oracle while the planner is still experimental. - -## Adapter Takeaways - -The practical adapter direction is: - -1. Model a small `FlatTheory`-like IR in Rust before implementing a full Geolog frontend. -2. Start with generated foreign-key and totality laws because they have clear violation semantics. -3. Represent violation outputs with law identity, violation kind, relation or consequent identity, and bound variable values. -4. Keep source elaboration, relational lowering, planning, and execution as separate modules. -5. Add textual plan output early so planner behavior is inspectable. -6. Use a naive snapshot evaluator as the oracle for planned or maintained execution. - -## Open Questions - -- How much of Geolog source syntax should this repo parse directly versus using hand-built fixtures first? -- Should equality in consequents be supported before disjunction, existential witnesses, or recursive laws? -- Should totality validation use explicit missing-output violations or a generated required-output relation? -- How should entity identities map onto this repo's storage transaction and rollback model? -- Which law subset is enough to test Geomerge-style maintained violation relations without building a full chase engine? - -## Next Experiment Shape - -A small vertical slice for this repository could be: - -```text -input: - table node(id) - table edge(src, dst, id) - law edge_foreign_keys: - edge(src, dst, id) -> node(src) and node(dst) - -snapshot output: - violation rows for missing src or dst nodes - -planned output: - textual antijoin plan for edge against node(src) and node(dst) -``` - -That would test catalog construction, generated-law representation, antijoin planning, deterministic violation rows, and a snapshot oracle without requiring the rest of Geolog. 
diff --git a/notes/backend/01-cozo-and-lmdb-findings.md b/notes/backend/01-cozo-and-lmdb-findings.md deleted file mode 100644 index 9f5ad62..0000000 --- a/notes/backend/01-cozo-and-lmdb-findings.md +++ /dev/null @@ -1,135 +0,0 @@ -## Cozo and LMDB Findings - -Sources inspected: the Cozo source tree at `github.com/cozodb/cozo`, the LMDB source tree at `github.com/LMDB/lmdb`, and the `heed` Rust binding at `github.com/meilisearch/heed`. -File paths in this note are relative to the root of the named project's source tree. -The aim was to understand how a working Datalog engine (Cozo) implements joins and what a low-level key-value substrate (LMDB) provides that makes those joins cheap. -This note summarizes the design lessons and the practical implications for the `query-ops` crate in this playground. - -### Summary - -Cozo is an embedded Datalog database written in Rust. -It does not have a separate semijoin operator. -Instead, it has one inner-join operator that picks between two strategies based on how each relation is stored: an index-nested-loop strategy that uses ordered range scans over the substrate, and a fallback that materializes one side into a sorted vector and probes it. -Semijoin behavior, when needed, emerges from a separate rewrite step called the magic-sets transformation, which converts semijoin-shaped pruning into regular inner joins against derived relations. - -LMDB is a memory-mapped, ordered key-value store with a B+ tree on disk. -It exposes a small set of cursor primitives that support prefix iteration, range iteration, and exact-key lookup. -These primitives are exactly what an index-nested-loop join needs: seek to a key prefix, then iterate forward while the prefix matches. - -The combined lesson is that a good join does not require a clever operator. -It requires the relation to be stored with the join columns at the front of the key, so that the substrate's ordered iteration can do the join itself. 
- -### Cozo - -#### What It Is - -Cozo is a Datalog database with multiple swappable storage backends, including an in-memory store, SQLite, RocksDB, sled, and TiKV. -The execution engine speaks a single narrow storage trait whose surface is essentially `get`, `put`, `range_iter`, and `prefix_iter` over byte keys. -Each backend implements that trait. -The trait definition lives at `cozo-core/src/storage/mod.rs` in the Cozo source tree. - -#### Join Behavior - -The relational algebra at `cozo-core/src/query/ra.rs` in the Cozo source tree defines a single join operator named `InnerJoin`. -At execution time it chooses between two strategies based on a check called `join_is_prefix`: - -- prefix join: for each tuple from the left side, the engine builds a byte prefix from the join columns and calls `prefix_iter` on the right relation. - The substrate yields all matching tuples in key order. - No hash table is built. - This path is taken whenever the right side's join columns are stored as the prefix of its key. -- materialized join: used when the join columns are not a key prefix. - The right side is read fully into a sorted, deduplicated vector, reordered so the join columns come first, then walked with a `starts_with(prefix)` check. - This is the build-and-probe family, but with a sorted vector instead of a hash map. - -The choice is made entirely on whether the join columns sit at the front of the stored key. - -#### No Semijoin Operator - -A search of the Cozo source for `semijoin` or `semi_join` returns nothing. -Semijoin behavior comes from the magic-sets transformation at `cozo-core/src/query/magic.rs` in the Cozo source tree. -This pass rewrites each rule so that body atoms get joined against an auxiliary "magic" relation whose contents encode the binding patterns supplied by the rule's callers. -The net effect is the same as semijoining body atoms against caller-supplied filters, but the implementation is a logical rewrite, not a runtime operator. 
- -#### No Auto-Maintained Secondary Indexes - -Cozo does not maintain secondary indexes automatically. -If you want to query a relation by a column order different from how it was declared, you declare a second relation with the columns reordered and keep its contents synchronized at insert time. -A covering index is just another stored relation. -The decision of which column order to store comes from how you expect to query the data, not from the engine. - -### LMDB - -#### What It Is - -LMDB is a single-file, memory-mapped, ordered key-value store. -It uses a B+ tree on disk and exposes reads as zero-copy byte slices that point directly into the mmap. -It supports a single writer at a time and many concurrent readers, and it uses shadow paging for MVCC, which means commits are atomic without a write-ahead log. - -#### Cursor Primitives - -A cursor in LMDB is a position inside the B+ tree. -The full set of cursor operations is defined by the `MDB_cursor_op` enum in `libraries/liblmdb/lmdb.h` in the LMDB source tree. -The operations relevant to join work are: - -- `MDB_SET_RANGE`: position at the first key greater than or equal to a given key. - This is the seek primitive that makes prefix scans possible. -- `MDB_NEXT`: advance one step forward in key order. - Combined with `MDB_SET_RANGE` and a per-step prefix check, this gives you ordered range iteration. -- `MDB_SET` and `MDB_SET_KEY`: exact-key positioning, used for point lookups. -- `MDB_FIRST` and `MDB_LAST`: positional endpoints. - -For databases opened with the `MDB_DUPSORT` flag, one key can carry multiple sorted values, and additional operations apply: `MDB_GET_BOTH`, `MDB_NEXT_DUP`, `MDB_FIRST_DUP`. -This is useful when a relation is encoded as "key = join columns, duplicate values = remaining columns": the set of duplicates is itself a secondary index over the join key. - -#### Rust Binding - -`heed` is the idiomatic Rust binding for LMDB. 
-It wraps the cursor operations as `RoCursor` and `RwCursor` and returns key and value byte slices tied to the transaction lifetime, so reads remain zero-copy. -Meilisearch uses `heed` in production, so the binding is well exercised. - -### LMDB Versus RocksDB - -Both LMDB and RocksDB are ordered key-value stores with prefix and range scans, but their internal designs lead to different operational profiles. - -LMDB highlights: - -- B+ tree on disk, memory mapped -- Single writer at a time, many concurrent readers -- Zero-copy reads from the mmap -- Append-only on-disk format; deletes leave reclaimable free pages -- File size grows up to a configured `mapsize` -- No background compaction -- Manual reclaim with `mdb_copy --compact` - -RocksDB highlights: - -- Log-structured merge tree -- Multiple concurrent writers -- Background compaction -- Higher write throughput at the cost of write amplification -- Reads may traverse multiple levels with bloom-filter checks -- Engine manages its own disk layout - -For a read-heavy prototype with batch inserts, LMDB is the closer fit: predictable read costs, cheap range scans, and zero-copy probes. -RocksDB earns its overhead when sustained write throughput is the bottleneck. - -### Practical Implications - -The current `query-ops` crate works on in-memory `Vec` values and will implement semijoin and natural join with a transient hash on one side. -The Cozo design suggests a clear upgrade path once a real substrate is added. - -Short term: keep the in-memory operator and build a transient hash on the smaller side. -This is correct, easy to test, and easy to reason about. - -Medium term: when relations move into a substrate like LMDB, encode each relation so that the join columns sit at the prefix of the key, or use a `DUPSORT` database where the duplicate values carry the remaining columns. 
-At that point the join operator becomes a cursor pattern (`MDB_SET_RANGE` followed by `MDB_NEXT` while the prefix matches), and the separate hash-building step disappears. - -Index discipline: if a relation needs to be joined two different ways, store it twice with different prefix orders. -There is no clever-indexing shortcut in either Cozo or LMDB, and trying to invent one is unlikely to be worth the cost. - -The takeaway is that the operator surface in `query-ops` is fine for an in-memory prototype, but the substrate decision is the load-bearing one for performance. -We do not need to design around it now, but the natural successor to the current operators is a key-encoding discipline rather than a more elaborate operator implementation. - -### Changelog - -- **June 2, 2026** -- The first version of this document was made. diff --git a/notes/flowlog/.gitkeep b/notes/flowlog/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/tools/exporter/README.md b/tools/exporter/README.md new file mode 100644 index 0000000..1dccef0 --- /dev/null +++ b/tools/exporter/README.md @@ -0,0 +1,75 @@ +## Plan Exporter + +Haskell tool that turns a hand-authored scenario into the JSON IR consumed by +[`crates/plan-runner`](../../crates/plan-runner). +Each scenario carries a small schema, a set of ground facts, a list of conjunctive-query atoms, and an expected-bindings oracle. +The exporter parses the scenario, runs `Geolog.DB.Plan.planConjunction` from +[`external/geolog`](../../external/geolog) to produce a Yannakakis-style plan, +self-checks the planned bindings against the oracle, then writes the runner IR to stdout. + +This is the only producer of runner IR today; +any other frontend that emits the same JSON shape can drive the runner instead. 
+ +### Layout + +```text +tools/exporter/ +├── plan-exporter.cabal # cabal package: executable `plan-export` +├── cabal.project # points at geolog-lang and its siblings in external/geolog +├── src/Main.hs # the exporter itself; Haddock header explains the IR +└── examples/ # hand-authored scenario inputs + ├── cartesian.scenario.json + ├── self_loop.scenario.json + ├── three_atom_chain.scenario.json + └── two_atom_join.scenario.json +``` + +### Run It + +The exporter needs GHC 9.12 and Cabal. +The repository's Nix dev shell provides both; +enter it with `make shell` (or `nix develop`) before running the commands below. + +```sh +# Build the executable: +cd tools/exporter && cabal build plan-export + +# Export one scenario to runner IR JSON: +cabal run -v0 plan-export -- examples/two_atom_join.scenario.json + +# Regenerate every fixture in crates/plan-runner/fixtures/ from these scenarios: +make export-fixtures # run from the repository root +``` + +### Scenario Format + +Each `.scenario.json` carries four blocks: + +| Block | Maps to | +|---------------------|----------------------------------------------------------------------| +| `schema` | `FlatTheory` from `geolog-lang`: relation paths and column types. | +| `facts` | Ground tuples per relation, in the shape `InMemory` ingests. | +| `atoms` | The conjunctive-query body, as a list of `QAtom`. | +| `expected_bindings` | Variables and rows the planner is expected to produce. Used as a self-check; lifted through to the runner-side oracle. | + +See the four committed examples for the exact JSON shape. +The runner-side IR shape (what the exporter emits) is documented in +[`crates/plan-runner/src/lib.rs`](../../crates/plan-runner/src/lib.rs). + +### Self-Check + +Before emitting JSON, the exporter runs the planned query through +`Geolog.DB.InMemory.evalConjunctionPlanned` and asserts the bindings match +the scenario's `expected_bindings`.
+A mismatched scenario fails at export time, so the Rust side never sees a broken fixture. + +### Notes + +- **Cross-language contract.** + The JSON shape is the contract between this tool and `plan-runner`. + Changing it means editing both sides. +- **Subset of the source language.** + The exporter only handles conjunctive queries that fit `planConjunction`. + Negation, recursion, and laws are out of scope here; they belong to a different planner pass. +- **No durable references to ignored paths.** + This tool depends on `external/geolog` (a submodule), not on any ad-hoc clone or generated copy in an ignored path. diff --git a/tools/exporter/cabal.project b/tools/exporter/cabal.project index d2d5aa3..35a9fe7 100644 --- a/tools/exporter/cabal.project +++ b/tools/exporter/cabal.project @@ -7,7 +7,7 @@ -- against. packages: - glog-exporter.cabal + plan-exporter.cabal ../../external/geolog/geolog-lang/geolog-lang.cabal ../../external/geolog/data-partition/data-partition.cabal ../../external/geolog/diagnostician/diagnostician.cabal diff --git a/tools/exporter/examples/cartesian.scenario.json b/tools/exporter/examples/cartesian.scenario.json new file mode 100644 index 0000000..eec917a --- /dev/null +++ b/tools/exporter/examples/cartesian.scenario.json @@ -0,0 +1,140 @@ +{ + "name": "cartesian", + "_description": "Two disjoint atoms over different tables. 
Exercises the 'stray' branch of planConjunction's spanning forest (no shared variables = no edge in the intersection graph) and the linear chain of natural-joins that fullJoinForest emits over disconnected components.", + "schema": { + "left": { + "columns": [ + { + "entity": "left" + } + ] + }, + "right": { + "columns": [ + { + "entity": "right" + } + ] + } + }, + "facts": { + "left": [ + [ + { + "entity": [ + "left", + 1 + ] + } + ], + [ + { + "entity": [ + "left", + 2 + ] + } + ] + ], + "right": [ + [ + { + "entity": [ + "right", + 10 + ] + } + ], + [ + { + "entity": [ + "right", + 20 + ] + } + ] + ] + }, + "atoms": [ + { + "table": "left", + "values": { + "0": { + "var": "a" + } + } + }, + { + "table": "right", + "values": { + "0": { + "var": "b" + } + } + } + ], + "expected_bindings": { + "columns": [ + "a", + "b" + ], + "rows": [ + [ + { + "entity": [ + "left", + 1 + ] + }, + { + "entity": [ + "right", + 10 + ] + } + ], + [ + { + "entity": [ + "left", + 1 + ] + }, + { + "entity": [ + "right", + 20 + ] + } + ], + [ + { + "entity": [ + "left", + 2 + ] + }, + { + "entity": [ + "right", + 10 + ] + } + ], + [ + { + "entity": [ + "left", + 2 + ] + }, + { + "entity": [ + "right", + 20 + ] + } + ] + ] + } +} diff --git a/tools/exporter/examples/self_loop.scenario.json b/tools/exporter/examples/self_loop.scenario.json new file mode 100644 index 0000000..735ffeb --- /dev/null +++ b/tools/exporter/examples/self_loop.scenario.json @@ -0,0 +1,119 @@ +{ + "name": "self-loop", + "_description": "Single-atom query with a repeated variable across two columns: edge(x, x, _). 
Exercises evalAtom's equality-enforcement path; the planner emits one PlanEvalAtom node and no joins.", + "schema": { + "edge": { + "columns": [ + { + "entity": "node" + }, + { + "entity": "node" + }, + { + "entity": "edge" + } + ] + } + }, + "facts": { + "edge": [ + [ + { + "entity": [ + "node", + 1 + ] + }, + { + "entity": [ + "node", + 2 + ] + }, + { + "entity": [ + "edge", + 1 + ] + } + ], + [ + { + "entity": [ + "node", + 2 + ] + }, + { + "entity": [ + "node", + 2 + ] + }, + { + "entity": [ + "edge", + 2 + ] + } + ], + [ + { + "entity": [ + "node", + 3 + ] + }, + { + "entity": [ + "node", + 3 + ] + }, + { + "entity": [ + "edge", + 3 + ] + } + ] + ] + }, + "atoms": [ + { + "table": "edge", + "values": { + "0": { + "var": "x" + }, + "1": { + "var": "x" + } + } + } + ], + "expected_bindings": { + "columns": [ + "x" + ], + "rows": [ + [ + { + "entity": [ + "node", + 2 + ] + } + ], + [ + { + "entity": [ + "node", + 3 + ] + } + ] + ] + } +} diff --git a/tools/exporter/examples/three_atom_chain.scenario.json b/tools/exporter/examples/three_atom_chain.scenario.json new file mode 100644 index 0000000..fb6485a --- /dev/null +++ b/tools/exporter/examples/three_atom_chain.scenario.json @@ -0,0 +1,156 @@ +{ + "name": "three-atom-chain", + "schema": { + "node": { + "columns": [ + { + "entity": "node" + } + ] + }, + "edge": { + "columns": [ + { + "entity": "node" + }, + { + "entity": "node" + }, + { + "entity": "edge" + } + ] + } + }, + "facts": { + "node": [ + [ + { + "entity": [ + "node", + 1 + ] + } + ], + [ + { + "entity": [ + "node", + 2 + ] + } + ], + [ + { + "entity": [ + "node", + 3 + ] + } + ] + ], + "edge": [ + [ + { + "entity": [ + "node", + 1 + ] + }, + { + "entity": [ + "node", + 2 + ] + }, + { + "entity": [ + "edge", + 1 + ] + } + ], + [ + { + "entity": [ + "node", + 2 + ] + }, + { + "entity": [ + "node", + 3 + ] + }, + { + "entity": [ + "edge", + 2 + ] + } + ] + ] + }, + "atoms": [ + { + "table": "node", + "values": { + "0": { + "var": "a" + } + } + }, + { + 
"table": "edge", + "values": { + "0": { + "var": "a" + }, + "1": { + "var": "b" + } + } + }, + { + "table": "edge", + "values": { + "0": { + "var": "b" + }, + "1": { + "var": "c" + } + } + } + ], + "expected_bindings": { + "columns": [ + "a", + "b", + "c" + ], + "rows": [ + [ + { + "entity": [ + "node", + 1 + ] + }, + { + "entity": [ + "node", + 2 + ] + }, + { + "entity": [ + "node", + 3 + ] + } + ] + ] + } +} diff --git a/tools/exporter/examples/two_atom_join.scenario.json b/tools/exporter/examples/two_atom_join.scenario.json new file mode 100644 index 0000000..9a316ef --- /dev/null +++ b/tools/exporter/examples/two_atom_join.scenario.json @@ -0,0 +1,144 @@ +{ + "name": "two-atom-join", + "schema": { + "node": { + "columns": [ + { + "entity": "node" + } + ] + }, + "edge": { + "columns": [ + { + "entity": "node" + }, + { + "entity": "node" + }, + { + "entity": "edge" + } + ] + } + }, + "facts": { + "node": [ + [ + { + "entity": [ + "node", + 1 + ] + } + ], + [ + { + "entity": [ + "node", + 2 + ] + } + ] + ], + "edge": [ + [ + { + "entity": [ + "node", + 1 + ] + }, + { + "entity": [ + "node", + 2 + ] + }, + { + "entity": [ + "edge", + 1 + ] + } + ], + [ + { + "entity": [ + "node", + 2 + ] + }, + { + "entity": [ + "node", + 1 + ] + }, + { + "entity": [ + "edge", + 2 + ] + } + ] + ] + }, + "atoms": [ + { + "table": "node", + "values": { + "0": { + "var": "a" + } + } + }, + { + "table": "edge", + "values": { + "0": { + "var": "a" + }, + "1": { + "var": "b" + } + } + } + ], + "expected_bindings": { + "columns": [ + "a", + "b" + ], + "rows": [ + [ + { + "entity": [ + "node", + 1 + ] + }, + { + "entity": [ + "node", + 2 + ] + } + ], + [ + { + "entity": [ + "node", + 2 + ] + }, + { + "entity": [ + "node", + 1 + ] + } + ] + ] + } +} diff --git a/tools/exporter/glog-exporter.cabal b/tools/exporter/plan-exporter.cabal similarity index 51% rename from tools/exporter/glog-exporter.cabal rename to tools/exporter/plan-exporter.cabal index 8dd2aae..95ae2f3 100644 --- 
a/tools/exporter/glog-exporter.cabal +++ b/tools/exporter/plan-exporter.cabal @@ -1,19 +1,20 @@ cabal-version: 3.4 -name: glog-exporter +name: plan-exporter version: 0.1.0.0 license: MIT OR Apache-2.0 author: storage-engine-playground -synopsis: Export geolog-lang join plans as JSON for the Rust runner. +synopsis: Export conjunctive-query plans as JSON for the Rust plan-runner. description: - Builds a FlatTheory + facts + a list of QAtoms for a named scenario, - runs Geolog.DB.Plan.planConjunction, and emits a JSON document that - crates/glog-runner consumes. This allows the playground use query-ops and - storage end-to-end with a real Yannakakis plan produced by the geolog - frontend, not a hand-written fixture. + Reads a scenario (FlatTheory + facts + a list of QAtoms) from JSON, + runs Geolog.DB.Plan.planConjunction, and emits a plan IR JSON document + that crates/plan-runner consumes. The IR is the contract between the + Haskell frontend and the Rust executor; this tool is currently the only + producer, but any frontend that emits the same JSON shape can drive the + runner. build-type: Simple -executable glog-export +executable plan-export main-is: Main.hs hs-source-dirs: src default-language: GHC2024 @@ -32,5 +33,6 @@ executable glog-export , base , bytestring , containers + , fnotation , geolog-lang , text diff --git a/tools/exporter/src/Main.hs b/tools/exporter/src/Main.hs index d9d204d..8276f14 100644 --- a/tools/exporter/src/Main.hs +++ b/tools/exporter/src/Main.hs @@ -1,31 +1,41 @@ --- | Exports a geolog-lang join plan as JSON for the Rust runner in --- @crates/glog-runner@. +-- | Reads a @.scenario.json@ example, plans its conjunction with +-- @Geolog.DB.Plan.planConjunction@, and writes a runner-IR JSON plan that +-- @crates\/plan-runner@ consumes. -- -- Invocation: -- -- @ --- cabal run glog-export -- > plan.json +-- cabal run plan-export -- -- @ -- --- Available scenarios: @three-atom-chain@. 
+-- The scenario format is documented in @tools\/exporter\/README.md@ or by example +-- (@examples\/*.scenario.json@); the output shape is documented in +-- @crates\/plan-runner\/src\/lib.rs@. -- --- The output shape is documented in @crates\/glog-runner\/src\/lib.rs@. --- This program is the canonical producer: any change to the IR should --- start here, with the Rust runner updated to match. +-- The exporter is also a self-check: before emitting, it runs the planned +-- query through @evalConjunctionPlanned@ and verifies the bindings match +-- the scenario's @expected_bindings@. A mismatched scenario fails loudly +-- here rather than handing a bad fixture to the Rust runner. module Main (main) where import Algebra.Graph qualified as AG -import Data.Aeson ((.=)) +import Control.Monad (unless) +import Data.Aeson ((.!=), (.:), (.:?), (.=)) import Data.Aeson qualified as Aeson import Data.Aeson.Encode.Pretty qualified as AesonPretty import Data.Aeson.Key qualified as Key +import Data.Aeson.KeyMap qualified as KM +import Data.Aeson.Types (Parser) import Data.ByteString.Lazy.Char8 qualified as LBS8 +import Data.Foldable (toList) import Data.List (sortOn) import Data.Map.Strict (Map) import Data.Map.Strict qualified as Map import Data.Set qualified as Set import Data.Text (Text) import Data.Text qualified as T +import Data.String (fromString) +import FNotation.Names (Name) import Geolog.DB.InMemory import Geolog.DB.Plan import Geolog.IR qualified as IR @@ -33,74 +43,142 @@ import System.Environment (getArgs) import System.Exit (die) import System.IO (hPutStrLn, stderr) --- * Scenario plumbing +-- * Scenario file format -- --- A scenario fixes a schema, a set of ground facts, and a conjunction of --- query atoms. The exporter is intentionally code-driven (not @.glog@ --- driven): @.glog@ files declare theories, not queries, so the query --- side has to live in Haskell either way. +-- Mirrors @Geolog.IR.FlatTheory@ + @[(Path, [Val])]@ + @[QAtom]@.
The +-- 'Expected' block is optional but, when present, the exporter cross- +-- checks it against the planner's own evaluation before emitting. data Scenario = Scenario - { scName :: String - , scTheory :: IR.FlatTheory + { scName :: Text + , scSchema :: Map IR.Path SchemaEntry , scFacts :: [(IR.Path, [Val])] , scAtoms :: [QAtom] + , scExpected :: Maybe Expected } + deriving (Show) --- * three-atom-chain --- --- Mirrors @DB.InMemoryTest@ "matches evalConjunction on three-atom chain". --- node = {e1, e2, e3}, edge = {(e1,e2,ee1), (e2,e3,ee2)}. --- Conjunction: node(a), edge(a, b, _), edge(b, c, _). +data SchemaEntry = SchemaEntry + { seColumns :: [IR.ColType] + , sePrimaryKey :: Maybe [Int] + } + deriving (Show) -nodePath, edgePath :: IR.Path -nodePath = ["node"] -edgePath = ["edge"] +data Expected = Expected + { exColumns :: [Text] + , exRows :: [[Val]] + } + deriving (Show) -threeAtomChain :: Scenario -threeAtomChain = - Scenario - { scName = "three-atom-chain" - , scTheory = - IR.FlatTheory - { tables = - Map.fromList - [ (nodePath, IR.Table {columns = [IR.EntityType nodePath], primaryKey = Nothing}) - , (edgePath, IR.Table {columns = [IR.EntityType nodePath, IR.EntityType nodePath, IR.EntityType edgePath], primaryKey = Nothing}) - ] - , laws = Map.empty - } - , scFacts = - [ (nodePath, [ValEntity nodePath 1]) - , (nodePath, [ValEntity nodePath 2]) - , (nodePath, [ValEntity nodePath 3]) - , (edgePath, [ValEntity nodePath 1, ValEntity nodePath 2, ValEntity edgePath 1]) - , (edgePath, [ValEntity nodePath 2, ValEntity nodePath 3, ValEntity edgePath 2]) - ] - , scAtoms = - [ QAtom {qaTable = nodePath, qaRowId = Nothing, qaValues = Map.singleton 0 (QVar (Var "a"))} - , QAtom {qaTable = edgePath, qaRowId = Nothing, qaValues = Map.fromList [(0, QVar (Var "a")), (1, QVar (Var "b"))]} - , QAtom {qaTable = edgePath, qaRowId = Nothing, qaValues = Map.fromList [(0, QVar (Var "b")), (1, QVar (Var "c"))]} - ] +-- ** JSON parsers + +parsePath :: Aeson.Value -> Parser IR.Path 
+parsePath = Aeson.withText "path" \t -> pure [nameFromText t] + +-- | Build a single-segment 'Name' from text. Multi-segment names (which +-- would carry a non-empty 'init' field) aren't needed by any current +-- example; if a scenario wants @"a/b"@-style paths, extend this helper. +nameFromText :: Text -> Name +nameFromText = fromString . T.unpack + +instance Aeson.FromJSON SchemaEntry where + parseJSON = Aeson.withObject "SchemaEntry" \o -> + SchemaEntry <$> o .: "columns" <*> o .:? "primaryKey" + +instance Aeson.FromJSON IR.ColType where + parseJSON = Aeson.withObject "ColType" \o -> do + case KM.toList o of + [("entity", v)] -> IR.EntityType <$> parsePath v + [("prim", v)] -> IR.PrimType <$> parsePrim v + _ -> fail "ColType: expected {\"entity\": } or {\"prim\": \"int\"|\"string\"}" + +parsePrim :: Aeson.Value -> Parser IR.PrimType +parsePrim = Aeson.withText "prim type" \case + "int" -> pure IR.PrimInt + "string" -> pure IR.PrimString + other -> fail ("unknown primitive type: " <> T.unpack other) + +parseVal :: Aeson.Value -> Parser Val +parseVal = Aeson.withObject "Val" \o -> + case KM.toList o of + [("int", v)] -> ValInt <$> Aeson.parseJSON v + [("str", v)] -> ValText <$> Aeson.parseJSON v + [("entity", v)] -> parseEntity v + _ -> fail "Val: expected {\"int\": ..} | {\"str\": ..} | {\"entity\": [, ]}" + where + parseEntity = Aeson.withArray "entity" \arr -> case toList arr of + [pv, nv] -> do + p <- parsePath pv + n <- Aeson.parseJSON nv + pure (ValEntity p n) + _ -> fail "entity: expected [, ]" + +parseQVal :: Aeson.Value -> Parser QVal +parseQVal = Aeson.withObject "QVal" \o -> + case KM.toList o of + [("var", v)] -> QVar . Var <$> Aeson.parseJSON v + [("lit", v)] -> QLit <$> parseVal v + _ -> fail "QVal: expected {\"var\": \"name\"} or {\"lit\": }" + +parseAtom :: Aeson.Value -> Parser QAtom +parseAtom = Aeson.withObject "QAtom" \o -> do + qaTable <- o .: "table" >>= parsePath + qaRowId <- o .:? 
"rowId" >>= traverse parseQVal + values <- o .: "values" :: Parser (Map Text Aeson.Value) + qaValues <- + Map.fromList + <$> traverse + ( \(k, v) -> case reads (T.unpack k) of + [(i, "")] -> (i,) <$> parseQVal v + _ -> fail ("non-integer key in atom values: " <> T.unpack k) + ) + (Map.toList values) + pure QAtom {qaTable, qaRowId, qaValues} + +parseExpected :: Aeson.Value -> Parser Expected +parseExpected = Aeson.withObject "Expected" \o -> do + exColumns <- o .: "columns" + rawRows <- o .: "rows" :: Parser [[Aeson.Value]] + exRows <- traverse (traverse parseVal) rawRows + pure Expected {exColumns, exRows} + +instance Aeson.FromJSON Scenario where + parseJSON = Aeson.withObject "Scenario" \o -> do + scName <- o .:? "name" .!= "unnamed" + rawSchema <- o .: "schema" :: Parser (Map Text SchemaEntry) + let scSchema = Map.fromList [([nameFromText k], v) | (k, v) <- Map.toList rawSchema] + rawFacts <- o .:? "facts" .!= mempty :: Parser (Map Text [[Aeson.Value]]) + scFacts <- + concat + <$> traverse + ( \(name, rows) -> do + let path = [nameFromText name] + parsedRows <- traverse (traverse parseVal) rows + pure [(path, row) | row <- parsedRows] + ) + (Map.toList rawFacts) + rawAtoms <- o .: "atoms" :: Parser [Aeson.Value] + scAtoms <- traverse parseAtom rawAtoms + scExpected <- o .:? 
"expected_bindings" >>= traverse parseExpected + pure Scenario {scName, scSchema, scFacts, scAtoms, scExpected} + +-- * Scenario → FlatTheory + DB + atoms + +toFlatTheory :: Scenario -> IR.FlatTheory +toFlatTheory sc = + IR.FlatTheory + { tables = Map.map (\e -> IR.Table {columns = seColumns e, primaryKey = sePrimaryKey e}) sc.scSchema + , laws = Map.empty } -scenarios :: [Scenario] -scenarios = [threeAtomChain] +populateDB :: Scenario -> DB +populateDB sc = foldl (\d (p, row) -> insertRow p row d) (fromTheory (toFlatTheory sc)) sc.scFacts --- * JSON encoding +-- * JSON encoding for the plan-runner IR -- --- The shape mirrors the IR in @crates/glog-runner/src/lib.rs@: --- --- > { --- > "schema": {: , ...}, --- > "facts": {: [[, ...], ...], ...}, --- > "query": {"root": , "nodes": [{"id": , "action": }, ...]} --- > } +-- The shape is the same one we settled on earlier; see +-- @crates/plan-runner/src/lib.rs@. --- | Render a 'Geolog.IR.Path' (a list of 'FNotation.Names.Name') as a flat --- string for use as a relation name on the Rust side. Each 'Name' is --- already shown with @\/@ between its own init segments and last, so we --- reuse 'show' and join Names with @\/@ too. pathText :: IR.Path -> Text pathText = T.intercalate "/" . map (T.pack . show) @@ -119,10 +197,6 @@ encodeTerm = \case QVar (Var name) -> Aeson.object ["var" .= name] QLit v -> Aeson.object ["lit" .= encodeValue v] --- | Flatten an atom into one term per stored column, mirroring --- @Geolog.DB.InMemory.toFlatArgs@: @qaValues@ keys map to positions --- @0..n-2@, @qaRowId@ (if present) maps to position @n-1@, and any --- missing positions become wildcard variables with locally-unique names. 
flattenAtom :: Int -> Int -> QAtom -> [Aeson.Value] flattenAtom atomIdx arity qa = [ encodeTerm (Map.findWithDefault (wildcard atomIdx pos) pos merged) @@ -145,9 +219,6 @@ encodeAtom tables atomIdx qa = Just t -> length t.columns Nothing -> error ("encodeAtom: unknown table " <> show qa.qaTable) --- | Stable atom indexing keyed by atom identity, so the wildcard names in --- @flattenAtom@ are deterministic across runs even if the planner's node --- ordering changes. atomIndex :: [QAtom] -> Map QAtom Int atomIndex atoms = Map.fromList (zip (Set.toList (Set.fromList atoms)) [0 ..]) @@ -176,9 +247,6 @@ encodeNode tables idx n = ] ] --- | Render a 'PlanGraph' as the JSON the runner consumes. Empty graphs --- produce @{"root": 0, "nodes": []}@, which the runner treats as a --- well-formed but empty query. encodeQuery :: Map IR.Path IR.Table -> Map QAtom Int -> PlanGraph -> Aeson.Value encodeQuery tables idx (PlanGraph g) | null nodes = @@ -192,24 +260,30 @@ encodeQuery tables idx (PlanGraph g) nodes = sortOn (.graphId.unPlanNodeId) (AG.vertexList g) rootId = case graphRoot (PlanGraph g) of Just (PlanNodeId i) -> i - -- Non-empty graph with no topological root means a cycle, which - -- planConjunction never produces. Fall back to the last id rather - -- than crashing so a bug here is still inspectable. 
Nothing -> (.graphId.unPlanNodeId) (last nodes) +encodeExpected :: Expected -> Aeson.Value +encodeExpected ex = + Aeson.object + [ "columns" .= exColumns ex + , "rows" .= map (map encodeValue) (exRows ex) + ] + encodePlan :: Scenario -> Aeson.Value encodePlan sc = Aeson.object - [ "_scenario" .= sc.scName - , "schema" .= Aeson.object - [pathKey p .= length t.columns | (p, t) <- Map.toList sc.scTheory.tables] - , "facts" .= Aeson.object - [pathKey p .= map (map encodeValue) rows | (p, rows) <- groupedFacts sc.scFacts] - , "query" .= encodeQuery sc.scTheory.tables (atomIndex sc.scAtoms) (planConjunction sc.scAtoms) - ] + ( [ "_scenario" .= scName sc + , "schema" .= Aeson.object [pathKey p .= length (seColumns t) | (p, t) <- Map.toList sc.scSchema] + , "facts" + .= Aeson.object + [ pathKey p .= map (map encodeValue) rows + | (p, rows) <- groupedFacts sc.scFacts + ] + , "query" .= encodeQuery (toFlatTheory sc).tables (atomIndex sc.scAtoms) (planConjunction sc.scAtoms) + ] + ++ maybe [] (\e -> ["expected_bindings" .= encodeExpected e]) sc.scExpected + ) --- | Group facts by table while preserving table-first-seen order and --- per-table insertion order. groupedFacts :: [(IR.Path, [Val])] -> [(IR.Path, [[Val]])] groupedFacts = go [] where @@ -222,17 +296,45 @@ groupedFacts = go [] -- * Self-check -- --- Run the planner's @evalConjunctionPlanned@ against the scenario's DB --- to confirm the plan we're about to emit is well-formed and produces --- non-error output. Catches malformed scenarios before they hand a bad --- plan to the Rust runner. +-- Cross-check the planned bindings against any user-supplied +-- 'expected_bindings'. Detects two classes of bug before they reach the +-- Rust side: a scenario whose 'expected' is wrong, and a planner output +-- that disagrees with 'evalConjunction'. 
selfCheck :: Scenario -> IO () selfCheck sc = do - let db = foldl (\d (p, row) -> insertRow p row d) (fromTheory sc.scTheory) sc.scFacts + let db = populateDB sc case evalConjunctionPlanned db sc.scAtoms of - Left err -> die ("self-check failed for " <> sc.scName <> ": " <> show err) - Right _ -> pure () + Left err -> die ("self-check failed for " <> T.unpack sc.scName <> ": " <> show err) + Right actual -> case sc.scExpected of + Nothing -> pure () + Just expected -> verifyAgainstExpected sc.scName expected actual + +verifyAgainstExpected :: Text -> Expected -> Bindings -> IO () +verifyAgainstExpected name expected actual = do + let actualCols = actual.cols + expectedCols = Set.fromList (map Var (exColumns expected)) + unless (Set.isSubsetOf expectedCols actualCols) $ + die $ + "self-check failed for " + <> T.unpack name + <> ": expected_bindings names columns not produced by the plan: " + <> show (Set.difference expectedCols actualCols) + let projectedActual = Set.map (`projectOn` exColumns expected) actual.table + expectedProjected = Set.fromList (map (zip (exColumns expected)) (exRows expected)) + expectedSet = Set.map (Map.fromList . 
map (\(v, x) -> (Var v, x))) expectedProjected + unless (projectedActual == expectedSet) $ + die $ + "self-check failed for " + <> T.unpack name + <> ":\n expected: " + <> show expectedSet + <> "\n actual: " + <> show projectedActual + +projectOn :: Map Var Val -> [Text] -> Map Var Val +projectOn row keys = + Map.fromList [(Var k, v) | k <- keys, Just v <- [Map.lookup (Var k) row]] -- * Entry point @@ -240,13 +342,13 @@ main :: IO () main = do args <- getArgs case args of - [name] -> case lookup name [(s.scName, s) | s <- scenarios] of - Just sc -> do - selfCheck sc - LBS8.putStrLn (AesonPretty.encodePretty (encodePlan sc)) - Nothing -> - die ("unknown scenario: " <> name <> "\navailable: " <> unwords (map (.scName) scenarios)) + [path] -> do + raw <- LBS8.readFile path + sc <- case Aeson.eitherDecode raw of + Left err -> die ("failed to parse " <> path <> ": " <> err) + Right sc -> pure sc + selfCheck sc + LBS8.putStrLn (AesonPretty.encodePretty (encodePlan sc)) _ -> do - hPutStrLn stderr "usage: glog-export " - hPutStrLn stderr ("scenarios: " <> unwords (map (.scName) scenarios)) + hPutStrLn stderr "usage: plan-export " die ""
Value (enum)
Value (enum, from storage)
Int(i64)
Str(String)
Id(RowId)