diff --git a/.gitignore b/.gitignore index 65ea1aa..74570ed 100644 --- a/.gitignore +++ b/.gitignore @@ -81,3 +81,4 @@ tarpaulin-report.html .claude/ .codex .agents/ +dist-newstyle/ diff --git a/.gitmodules b/.gitmodules index 19b695e..8d5a604 100644 --- a/.gitmodules +++ b/.gitmodules @@ -5,4 +5,4 @@ [submodule "external/geolog"] path = external/geolog url = gitlab@git.sgai.uk:creators/geolog.git - branch = query-plan-ir-draft-1 + branch = query-plan-algebraic diff --git a/Makefile b/Makefile index 4b4fefc..bb4bb46 100644 --- a/Makefile +++ b/Makefile @@ -76,6 +76,23 @@ clean: ## Remove build output cargo clean; \ fi +EXPORTER_DIR := tools/exporter +EXPORTER_FIXTURES := crates/glog-runner/fixtures +EXPORTER_SCENARIOS := three-atom-chain + +.PHONY: export-fixtures +export-fixtures: ## Regenerate JSON plan fixtures from the Haskell exporter (needs Cabal and GHC; use `make shell` first). + @if ! command -v cabal >/dev/null 2>&1; then \ + echo "cabal not found. Enter the dev shell with 'make shell' (or 'nix develop') first."; \ + exit 1; \ + fi + @cd $(EXPORTER_DIR) && cabal build glog-export + @for sc in $(EXPORTER_SCENARIOS); do \ + out=$(EXPORTER_FIXTURES)/$$(echo $$sc | tr '-' '_').json; \ + echo "exporting $$sc -> $$out"; \ + (cd $(EXPORTER_DIR) && cabal run -v0 glog-export -- $$sc) > $$out; \ + done + .PHONY: shell shell: ## Enter the Nix dev shell defined in flake.nix @nix develop diff --git a/crates/glog-runner/Cargo.toml b/crates/glog-runner/Cargo.toml new file mode 100644 index 0000000..29b75d5 --- /dev/null +++ b/crates/glog-runner/Cargo.toml @@ -0,0 +1,19 @@ +[package] +name = "glog-runner" +version = "0.1.0" +edition.workspace = true +license.workspace = true +rust-version.workspace = true + +[lints] +workspace = true + +[dependencies] +storage = { path = "../storage" } +query-ops = { path = "../query-ops" } +serde = { version = "1", features = ["derive"] } +serde_json = "1" + +[[bin]] +name = "glog-run" +path = "src/main.rs" diff --git a/crates/glog-runner/fixtures/three_atom_chain.json b/crates/glog-runner/fixtures/three_atom_chain.json new file mode 100644 index 0000000..c8d3f7e --- /dev/null +++ b/crates/glog-runner/fixtures/three_atom_chain.json @@ -0,0 +1,166 @@ +{ + "_scenario": "three-atom-chain", + "facts": { + "edge": [ + [ + { + "str": "node:1" + }, + { + "str": "node:2" + }, + { + "str": "edge:1" + } + ], + [ + { + "str": "node:2" + }, + { + "str": "node:3" + }, + { + "str": "edge:2" + } + ] + ], + "node": [ + [ + { + "str": "node:1" + } + ], + [ + { + "str": "node:2" + } + ], + [ + { + "str": "node:3" + } + ] + ] + }, + "query": { + "nodes": [ + { + "action": { + "scan": { + "columns": [ + { + "var": "a" + }, + { + "var": "b" + }, + { + "var": "_w0_2" + } + ], + "table": "edge" + } + }, + "id": 1 + }, + { + "action": { + "scan": { + "columns": [ + { + "var": "b" + }, + { + "var": "c" + }, + { + "var": "_w1_2" + } + ], + "table": "edge" + } + }, + "id": 2 + }, + { + "action": { + "scan": { + "columns": [ + { + "var": "a" + } + ], + "table": "node" + } + }, + "id": 3 + }, + { + "action": { + "join": { + "left": 1, + "op": "left", + "right": 3 + } + }, + "id": 4 + }, + { + "action": { + "join": { + "left": 2, + "op": "left", + "right": 4 + } + }, + "id": 5 + }, + { + "action": { + "join": { + "left": 5, + "op": "right", + "right": 4 + } + }, + "id": 6 + }, + { + "action": { + "join": { + "left": 6, + "op": "right", + "right": 3 + } + }, + "id": 7 + }, + { + "action": { + "join": { + "left": 6, + "op": "natural", + "right": 7 + } + }, + "id": 8 + }, + { + "action": { + "join": { + "left": 5, + "op": "natural", + "right": 8 + } + }, + "id": 9 + } + ], + "root": 9 + }, + "schema": { + "edge": 3, + "node": 1 + } +} diff --git a/crates/glog-runner/src/lib.rs b/crates/glog-runner/src/lib.rs new file mode 100644 index 0000000..c7da6e4 --- /dev/null +++ b/crates/glog-runner/src/lib.rs @@ -0,0 +1,344 @@ +//! End-to-end runner that executes a `geolog-lang` conjunctive-query plan +//! against this workspace's storage and `query-ops` operators. +//! +//! The upstream Haskell planner in `external/geolog/geolog-lang` +//! (`Geolog.DB.Plan`) builds a Yannakakis-style join DAG over `QAtom`s. This +//! crate accepts that DAG as JSON, materializes the input relations through +//! the [`Storage`] trait, and walks the DAG using +//! [`query_ops::atom::scan_atom`], [`query_ops::join::semijoin`], and +//! [`query_ops::join::natural_join`]. The result is a binding +//! [`Relation`](query_ops::relation::Relation) over the query's variables. +//! +//! The JSON IR mirrors `Geolog.DB.Plan.JoinPlan` and `Geolog.DB.InMemory.QAtom` +//! without depending on the Haskell side at build time. A Haskell exporter +//! that dumps `(schema, facts, JoinPlan)` to this shape is the planned +//! follow-up that completes the round trip; the IR is the contract. +//! +//! Mapping from the Haskell planner: +//! +//! | `Geolog.DB.Plan` | this crate | +//! |-----------------------------|-----------------------------------------------| +//! | `PlanEvalAtom` | [`Action::Scan`] → `scan_atom` | +//! | `PlanJoin LeftJoin a b` | [`Action::Join`] with [`JoinOp::Left`] → `semijoin(rel[a], rel[b])` | +//! | `PlanJoin RightJoin a b` | [`Action::Join`] with [`JoinOp::Right`] → `semijoin(rel[b], rel[a])` | +//! | `PlanJoin NaturalJoin a b` | [`Action::Join`] with [`JoinOp::Natural`] → `natural_join(rel[a], rel[b])` | +//! +//! The atom side covers `evalAtom` (`Geolog.DB.InMemory`): a [`Term::Var`] +//! repeated across positions enforces equality, [`Term::Lit`] filters by +//! constant, and distinct variables project in first-occurrence order. + +use std::collections::HashMap; + +use serde::Deserialize; + +use query_ops::atom::{AtomPattern, Term, scan_atom}; +use query_ops::join::{natural_join, semijoin}; +use query_ops::relation::Relation; +use storage::value::Value; +use storage::{MemoryStorage, Storage, StorageError, scan_as_table}; + +/// A single fixture: schema, ground facts, and a query plan to execute. +#[derive(Debug, Clone, Deserialize)] +pub struct Plan { + /// Relation name → arity (column count). + pub schema: HashMap, + /// Relation name → list of ground tuples to insert before execution. + pub facts: HashMap>>, + /// The join DAG itself. + pub query: Query, +} + +/// Mirrors `Geolog.DB.Plan.JoinPlan`: a set of nodes plus the id of the +/// rooted result node. +#[derive(Debug, Clone, Deserialize)] +pub struct Query { + pub root: u32, + pub nodes: Vec, +} + +/// One node of the plan DAG. `id`s are dense within a `Query` but don't need +/// to start at any particular value, mirroring the Haskell `PlanNodeId`. +#[derive(Debug, Clone, Deserialize)] +pub struct Node { + pub id: u32, + pub action: Action, +} + +/// What to compute at a node. Tagged externally so JSON reads as +/// `{"action": {"scan": {...}}}` or `{"action": {"join": {...}}}`. +#[derive(Debug, Clone, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum Action { + Scan(Atom), + Join(Join), +} + +/// A flat atom pattern, one entry per column of the target relation. +/// Matches the `toFlatArgs` view used by `Geolog.DB.InMemory.evalAtom`: +/// `qaValues` positions are filled in directly, and the entity-id column +/// (if any) appears at the last position. Wildcard positions in the +/// Haskell `QAtom` (a `Map Int QVal` with a missing key) translate to a +/// fresh, unique variable name on this side, which the operator binds but +/// never joins against. +#[derive(Debug, Clone, Deserialize)] +pub struct Atom { + pub table: String, + pub columns: Vec, +} + +#[derive(Debug, Clone, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum JsonTerm { + Var(String), + Lit(JsonValue), +} + +/// Wire-level value tag. Restricted to what `storage::value::Value` carries. +/// Entity identities from the Haskell side (`ValEntity path id`) round-trip +/// through `Str` for now using a `"path:id"` convention; that's a fixture +/// concern, not a runner concern. +#[derive(Debug, Clone, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum JsonValue { + Int(i64), + Str(String), +} + +#[derive(Debug, Clone, Deserialize)] +pub struct Join { + pub op: JoinOp, + pub left: u32, + pub right: u32, +} + +#[derive(Debug, Clone, Copy, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum JoinOp { + /// `Geolog.DB.Plan.LeftJoin`: result is `left` rows whose shared columns + /// appear in `right`. Lowered to `semijoin(left, right)`. + Left, + /// `Geolog.DB.Plan.RightJoin`: result is `right` rows whose shared + /// columns appear in `left`. Lowered to `semijoin(right, left)`. + Right, + /// `Geolog.DB.Plan.NaturalJoin`. Lowered to `natural_join(left, right)`. + Natural, +} + +/// Errors a runner can produce in addition to storage failures. +#[derive(Debug)] +pub enum RunError { + /// A fact references a relation that isn't declared in `schema`. + UnknownRelation(String), + /// A node id appears in a `Join` action but no node with that id exists. + MissingNode(u32), + /// `Query.root` doesn't match any node in `nodes`. + MissingRoot(u32), + /// Two nodes share the same id. + DuplicateNode(u32), + /// A join node references its left or right side before that side has + /// been computed: the DAG isn't actually topologically sorted by id, or + /// it has a cycle. + UnresolvedDependency { node: u32, depends_on: u32 }, + /// Storage layer rejected an operation. + Storage(StorageError), +} + +impl std::fmt::Display for RunError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::UnknownRelation(name) => { + write!(f, "facts reference relation {name:?} not in schema") + } + Self::MissingNode(id) => write!(f, "plan references missing node id {id}"), + Self::MissingRoot(id) => write!(f, "plan root id {id} matches no node"), + Self::DuplicateNode(id) => write!(f, "duplicate node id {id} in plan"), + Self::UnresolvedDependency { node, depends_on } => write!( + f, + "node {node} depends on {depends_on}, which has not been computed yet" + ), + Self::Storage(err) => write!(f, "storage error: {err}"), + } + } +} + +impl std::error::Error for RunError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match self { + Self::Storage(err) => Some(err), + _ => None, + } + } +} + +impl From for RunError { + fn from(err: StorageError) -> Self { + Self::Storage(err) + } +} + +impl From for Value { + fn from(jv: JsonValue) -> Self { + match jv { + JsonValue::Int(n) => Self::Int(n), + JsonValue::Str(s) => Self::Str(s), + } + } +} + +impl From for Term { + fn from(t: JsonTerm) -> Self { + match t { + JsonTerm::Var(name) => Self::Var(name), + JsonTerm::Lit(value) => Self::Lit(value.into()), + } + } +} + +/// Parse a [`Plan`] from a JSON string. +/// +/// # Errors +/// Returns a [`serde_json::Error`] if the input isn't valid JSON in the +/// expected shape. +pub fn parse_plan(json: &str) -> Result { + serde_json::from_str(json) +} + +/// Load schema and facts from a [`Plan`] into a fresh [`MemoryStorage`]. +/// +/// All facts are inserted in a single transaction; commit is atomic so a +/// failure on row N leaves the storage empty. +/// +/// # Errors +/// Returns [`RunError::UnknownRelation`] if facts mention a relation not +/// declared in `schema`. Wraps storage failures (arity mismatch, transaction +/// errors) in [`RunError::Storage`]. +pub fn load_into_memory(plan: &Plan) -> Result { + let mut storage = MemoryStorage::default(); + for (name, arity) in &plan.schema { + storage.create_relation(name, *arity)?; + } + { + let mut tx = storage.transaction()?; + for (name, rows) in &plan.facts { + if !plan.schema.contains_key(name) { + return Err(RunError::UnknownRelation(name.clone())); + } + for row in rows { + let cells: Vec = row.iter().cloned().map(Value::from).collect(); + tx.insert(name, cells)?; + } + } + let _ = tx.commit()?; + } + Ok(storage) +} + +/// Execute a plan against a storage backend, returning the bindings +/// [`Relation`] for the rooted plan node. +/// +/// Nodes are executed in ascending `id` order. For a Yannakakis plan as +/// emitted by `Geolog.DB.Plan` this is equivalent to a topological sort, +/// since `insertJoin` only references node ids that have already been +/// allocated. A non-monotone id ordering is rejected with +/// [`RunError::UnresolvedDependency`]. +/// +/// # Errors +/// Returns [`RunError::DuplicateNode`] for repeated ids, +/// [`RunError::MissingNode`] for join references to unknown ids, +/// [`RunError::MissingRoot`] if `query.root` isn't present, and storage +/// errors during the per-scan `scan_as_table` call. +pub fn execute(storage: &S, query: &Query) -> Result { + let mut seen_ids: std::collections::HashSet = + std::collections::HashSet::with_capacity(query.nodes.len()); + for node in &query.nodes { + if !seen_ids.insert(node.id) { + return Err(RunError::DuplicateNode(node.id)); + } + } + if !seen_ids.contains(&query.root) { + return Err(RunError::MissingRoot(query.root)); + } + + let mut ordered: Vec<&Node> = query.nodes.iter().collect(); + ordered.sort_by_key(|n| n.id); + + let mut results: HashMap = HashMap::with_capacity(ordered.len()); + for node in ordered { + let computed = match &node.action { + Action::Scan(atom) => { + let table = scan_as_table(storage, &atom.table)?; + let pattern = AtomPattern { + columns: atom.columns.iter().cloned().map(Term::from).collect(), + }; + scan_atom(&table, &pattern) + } + Action::Join(join) => { + let left = require_dep(&results, &seen_ids, node.id, join.left)?; + let right = require_dep(&results, &seen_ids, node.id, join.right)?; + match join.op { + JoinOp::Left => semijoin(left, right), + JoinOp::Right => semijoin(right, left), + JoinOp::Natural => natural_join(left, right), + } + } + }; + results.insert(node.id, computed); + } + + results + .remove(&query.root) + .ok_or(RunError::MissingRoot(query.root)) +} + +fn require_dep<'a>( + results: &'a HashMap, + seen: &std::collections::HashSet, + node: u32, + depends_on: u32, +) -> Result<&'a Relation, RunError> { + if let Some(rel) = results.get(&depends_on) { + Ok(rel) + } else if seen.contains(&depends_on) { + Err(RunError::UnresolvedDependency { node, depends_on }) + } else { + Err(RunError::MissingNode(depends_on)) + } +} + +/// Convenience: parse JSON, load it into a fresh in-memory storage, and +/// execute, returning the root binding relation. +/// +/// # Errors +/// Returns a JSON parse error if the input is malformed, or a [`RunError`] +/// for any later step. +pub fn run_json(json: &str) -> Result { + let plan = parse_plan(json).map_err(RunFromJsonError::Parse)?; + let storage = load_into_memory(&plan).map_err(RunFromJsonError::Run)?; + let bindings = execute(&storage, &plan.query).map_err(RunFromJsonError::Run)?; + Ok(bindings) +} + +/// Combined error from [`run_json`]. +#[derive(Debug)] +pub enum RunFromJsonError { + Parse(serde_json::Error), + Run(RunError), +} + +impl std::fmt::Display for RunFromJsonError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Parse(err) => write!(f, "parse error: {err}"), + Self::Run(err) => write!(f, "run error: {err}"), + } + } +} + +impl std::error::Error for RunFromJsonError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match self { + Self::Parse(err) => Some(err), + Self::Run(err) => Some(err), + } + } +} diff --git a/crates/glog-runner/src/main.rs b/crates/glog-runner/src/main.rs new file mode 100644 index 0000000..2c121ec --- /dev/null +++ b/crates/glog-runner/src/main.rs @@ -0,0 +1,59 @@ +//! `glog-run` CLI: read a JSON plan from a file (or stdin if `-`), execute +//! it against a fresh in-memory store, and print the resulting binding +//! relation as JSON on stdout. + +use std::io::{self, Read}; +use std::process::ExitCode; + +fn main() -> ExitCode { + let mut args = std::env::args().skip(1); + let Some(path) = args.next() else { + eprintln!("usage: glog-run "); + return ExitCode::from(2); + }; + + let input = match read_input(&path) { + Ok(s) => s, + Err(err) => { + eprintln!("failed to read {path}: {err}"); + return ExitCode::from(1); + } + }; + + let relation = match glog_runner::run_json(&input) { + Ok(r) => r, + Err(err) => { + eprintln!("{err}"); + return ExitCode::from(1); + } + }; + + let payload = serde_json::json!({ + "columns": relation.columns, + "rows": relation + .rows + .iter() + .map(|row| row.iter().map(value_to_json).collect::>()) + .collect::>(), + }); + println!("{payload}"); + ExitCode::SUCCESS +} + +fn read_input(path: &str) -> io::Result { + if path == "-" { + let mut buf = String::new(); + io::stdin().read_to_string(&mut buf)?; + Ok(buf) + } else { + std::fs::read_to_string(path) + } +} + +fn value_to_json(value: &storage::value::Value) -> serde_json::Value { + match value { + storage::value::Value::Int(n) => serde_json::Value::Number((*n).into()), + storage::value::Value::Str(s) => serde_json::Value::String(s.clone()), + storage::value::Value::Id(id) => serde_json::Value::String(id.to_string()), + } +} diff --git a/crates/glog-runner/tests/three_atom_chain.rs b/crates/glog-runner/tests/three_atom_chain.rs new file mode 100644 index 0000000..9a318ea --- /dev/null +++ b/crates/glog-runner/tests/three_atom_chain.rs @@ -0,0 +1,73 @@ +//! End-to-end check: run the JSON fixture and verify the resulting bindings +//! match the `DB.InMemoryTest` "matches evalConjunction on three-atom chain" +//! case from `external/geolog/geolog-lang/test/DB/InMemoryTest.hs`. +//! +//! For `node = {e1, e2, e3}` and `edge = {(e1,e2,ee1), (e2,e3,ee2)}` the +//! conjunction `node(a), edge(a, b, _), edge(b, c, _)` has exactly one +//! solution: `(a=e1, b=e2, c=e3)`. + +use std::collections::BTreeMap; + +use glog_runner::run_json; +use storage::value::Value; + +fn fixture() -> &'static str { + include_str!("../fixtures/three_atom_chain.json") +} + +fn ent(path: &str, id: u32) -> Value { + Value::Str(format!("{path}:{id}")) +} + +fn project<'a>( + columns: &'a [String], + row: &'a [Value], + keep: &'a [&'a str], +) -> BTreeMap<&'a str, &'a Value> { + keep.iter() + .map(|name| { + let pos = columns + .iter() + .position(|c| c == name) + .expect("column missing"); + (*name, &row[pos]) + }) + .collect() +} + +#[test] +fn three_atom_chain_matches_haskell_oracle() { + let result = run_json(fixture()).expect("fixture should execute"); + + // The plan's root keeps every variable, including the per-atom wildcards + // `_r1` and `_r2`. The oracle only asserts the (a, b, c) projection. + let keep = ["a", "b", "c"]; + let mut projected: Vec> = result + .rows + .iter() + .map(|row| project(&result.columns, row, &keep)) + .collect(); + projected.sort_by_key(|m| format!("{m:?}")); + + let e1 = ent("node", 1); + let e2 = ent("node", 2); + let e3 = ent("node", 3); + let expected = vec![BTreeMap::from([("a", &e1), ("b", &e2), ("c", &e3)])]; + + assert_eq!(projected, expected); +} + +#[test] +fn root_columns_cover_a_b_c_plus_two_wildcards() { + // The exporter emits unique wildcard variable names for the entity-id + // column of each edge atom (e.g. `_w0_2`, `_w1_2`); their exact spelling + // is an implementation detail of the exporter, so this test only checks + // that the named variables are all present and that the total column + // count is the three named ones plus two anonymous wildcards. + let result = run_json(fixture()).expect("fixture should execute"); + let cols: std::collections::HashSet<&str> = result.columns.iter().map(String::as_str).collect(); + for expected in ["a", "b", "c"] { + assert!(cols.contains(expected), "missing column {expected}"); + } + assert_eq!(result.columns.len(), 5, "expected 3 named + 2 wildcards"); +} diff --git a/external/geolog b/external/geolog index 99d4006..426d4c9 160000 --- a/external/geolog +++ b/external/geolog @@ -1 +1 @@ -Subproject commit 99d4006f4655d8a6815a9156fe4d9304515f356d +Subproject commit 426d4c96d6031ccaf5e14c12c3dab496e3b4c365 diff --git a/flake.nix b/flake.nix index 416eb1f..373a991 100644 --- a/flake.nix +++ b/flake.nix @@ -1,5 +1,5 @@ { - description = "Storage engine playground: Rust workspace for FlowLog, DBSP, CRDT, and Geomerge experiments."; + description = "Storage engine playground"; inputs = { nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; @@ -29,12 +29,13 @@ packages = [ rustToolchain - # Diagram regeneration in crates/geomerge-demo/docs/diagrams. pkgs.graphviz - # Cargo helpers. pkgs.cargo-watch pkgs.cargo-nextest - # Pre-commit hooks (see .pre-commit-config.yaml, Makefile setup-hooks). + pkgs.haskell.compiler.ghc912 + pkgs.cabal-install + pkgs.pkg-config + pkgs.zlib pkgs.pre-commit pkgs.python3 ]; @@ -44,10 +45,14 @@ }; shellHook = '' - echo "storage-engine-playground dev shell" - echo " rustc: $(rustc --version)" - echo " cargo: $(cargo --version)" - echo " dot: $(dot -V 2>&1)" + # Banner goes to stderr so `nix develop --command` invocations + # that pipe stdout (e.g. tools/exporter producing JSON) stay clean. + >&2 echo "storage-engine-playground dev shell" + >&2 echo " rustc: $(rustc --version)" + >&2 echo " cargo: $(cargo --version)" + >&2 echo " ghc: $(ghc --version)" + >&2 echo " cabal: $(cabal --version | head -1)" + >&2 echo " dot: $(dot -V 2>&1)" ''; }; diff --git a/tools/exporter/cabal.project b/tools/exporter/cabal.project new file mode 100644 index 0000000..d2d5aa3 --- /dev/null +++ b/tools/exporter/cabal.project @@ -0,0 +1,24 @@ +-- cabal.project for the geolog -> Rust JSON exporter. +-- +-- This file points at the geolog-lang library inside the external/geolog +-- submodule, plus the sibling packages it depends on (data-partition, +-- diagnostician, fnotation). It mirrors the submodule's own cabal.project +-- so the exporter sees the same source set the submodule's tests build +-- against. + +packages: + glog-exporter.cabal + ../../external/geolog/geolog-lang/geolog-lang.cabal + ../../external/geolog/data-partition/data-partition.cabal + ../../external/geolog/diagnostician/diagnostician.cabal + ../../external/geolog/fnotation/fnotation.cabal + +-- geolog-lang's DB.Plan.Render module uses a patched diagrams-graphviz. +-- Same pin as external/geolog/cabal.project. +source-repository-package + type: git + location: https://github.com/georgefst/diagrams-graphviz.git + tag: 993533c564861f9d0663d719eafd56efd95f59ba + +jobs: $ncpus +semaphore: true diff --git a/tools/exporter/glog-exporter.cabal b/tools/exporter/glog-exporter.cabal new file mode 100644 index 0000000..8dd2aae --- /dev/null +++ b/tools/exporter/glog-exporter.cabal @@ -0,0 +1,36 @@ +cabal-version: 3.4 +name: glog-exporter +version: 0.1.0.0 +license: MIT OR Apache-2.0 +author: storage-engine-playground +synopsis: Export geolog-lang join plans as JSON for the Rust runner. +description: + Builds a FlatTheory + facts + a list of QAtoms for a named scenario, + runs Geolog.DB.Plan.planConjunction, and emits a JSON document that + crates/glog-runner consumes. This allows the playground use query-ops and + storage end-to-end with a real Yannakakis plan produced by the geolog + frontend, not a hand-written fixture. + +build-type: Simple + +executable glog-export + main-is: Main.hs + hs-source-dirs: src + default-language: GHC2024 + default-extensions: + BlockArguments + LambdaCase + OverloadedRecordDot + OverloadedStrings + + ghc-options: -Wall + + build-depends: + , aeson >=2.2 + , aeson-pretty >=0.8 + , algebraic-graphs >=0.7 + , base + , bytestring + , containers + , geolog-lang + , text diff --git a/tools/exporter/src/Main.hs b/tools/exporter/src/Main.hs new file mode 100644 index 0000000..d9d204d --- /dev/null +++ b/tools/exporter/src/Main.hs @@ -0,0 +1,252 @@ +-- | Exports a geolog-lang join plan as JSON for the Rust runner in +-- @crates/glog-runner@. +-- +-- Invocation: +-- +-- @ +-- cabal run glog-export -- > plan.json +-- @ +-- +-- Available scenarios: @three-atom-chain@. +-- +-- The output shape is documented in @crates\/glog-runner\/src\/lib.rs@. +-- This program is the canonical producer: any change to the IR should +-- start here, with the Rust runner updated to match. +module Main (main) where + +import Algebra.Graph qualified as AG +import Data.Aeson ((.=)) +import Data.Aeson qualified as Aeson +import Data.Aeson.Encode.Pretty qualified as AesonPretty +import Data.Aeson.Key qualified as Key +import Data.ByteString.Lazy.Char8 qualified as LBS8 +import Data.List (sortOn) +import Data.Map.Strict (Map) +import Data.Map.Strict qualified as Map +import Data.Set qualified as Set +import Data.Text (Text) +import Data.Text qualified as T +import Geolog.DB.InMemory +import Geolog.DB.Plan +import Geolog.IR qualified as IR +import System.Environment (getArgs) +import System.Exit (die) +import System.IO (hPutStrLn, stderr) + +-- * Scenario plumbing +-- +-- A scenario fixes a schema, a set of ground facts, and a conjunction of +-- query atoms. The exporter is intentionally code-driven (not @.glog@ +-- driven): @.glog@ files declare theories, not queries, so the query +-- side has to live in Haskell either way. + +data Scenario = Scenario + { scName :: String + , scTheory :: IR.FlatTheory + , scFacts :: [(IR.Path, [Val])] + , scAtoms :: [QAtom] + } + +-- * three-atom-chain +-- +-- Mirrors @DB.InMemoryTest@ "matches evalConjunction on three-atom chain". +-- node = {e1, e2, e3}, edge = {(e1,e2,ee1), (e2,e3,ee2)}. +-- Conjunction: node(a), edge(a, b, _), edge(b, c, _). + +nodePath, edgePath :: IR.Path +nodePath = ["node"] +edgePath = ["edge"] + +threeAtomChain :: Scenario +threeAtomChain = + Scenario + { scName = "three-atom-chain" + , scTheory = + IR.FlatTheory + { tables = + Map.fromList + [ (nodePath, IR.Table {columns = [IR.EntityType nodePath], primaryKey = Nothing}) + , (edgePath, IR.Table {columns = [IR.EntityType nodePath, IR.EntityType nodePath, IR.EntityType edgePath], primaryKey = Nothing}) + ] + , laws = Map.empty + } + , scFacts = + [ (nodePath, [ValEntity nodePath 1]) + , (nodePath, [ValEntity nodePath 2]) + , (nodePath, [ValEntity nodePath 3]) + , (edgePath, [ValEntity nodePath 1, ValEntity nodePath 2, ValEntity edgePath 1]) + , (edgePath, [ValEntity nodePath 2, ValEntity nodePath 3, ValEntity edgePath 2]) + ] + , scAtoms = + [ QAtom {qaTable = nodePath, qaRowId = Nothing, qaValues = Map.singleton 0 (QVar (Var "a"))} + , QAtom {qaTable = edgePath, qaRowId = Nothing, qaValues = Map.fromList [(0, QVar (Var "a")), (1, QVar (Var "b"))]} + , QAtom {qaTable = edgePath, qaRowId = Nothing, qaValues = Map.fromList [(0, QVar (Var "b")), (1, QVar (Var "c"))]} + ] + } + +scenarios :: [Scenario] +scenarios = [threeAtomChain] + +-- * JSON encoding +-- +-- The shape mirrors the IR in @crates/glog-runner/src/lib.rs@: +-- +-- > { +-- > "schema": {: , ...}, +-- > "facts": {: [[, ...], ...], ...}, +-- > "query": {"root": , "nodes": [{"id": , "action": }, ...]} +-- > } + +-- | Render a 'Geolog.IR.Path' (a list of 'FNotation.Names.Name') as a flat +-- string for use as a relation name on the Rust side. Each 'Name' is +-- already shown with @\/@ between its own init segments and last, so we +-- reuse 'show' and join Names with @\/@ too. +pathText :: IR.Path -> Text +pathText = T.intercalate "/" . map (T.pack . show) + +pathKey :: IR.Path -> Aeson.Key +pathKey = Key.fromText . pathText + +encodeValue :: Val -> Aeson.Value +encodeValue = + Aeson.object . pure . \case + ValInt n -> "int" .= n + ValText t -> "str" .= t + ValEntity p n -> "str" .= (pathText p <> ":" <> T.pack (show n)) + +encodeTerm :: QVal -> Aeson.Value +encodeTerm = \case + QVar (Var name) -> Aeson.object ["var" .= name] + QLit v -> Aeson.object ["lit" .= encodeValue v] + +-- | Flatten an atom into one term per stored column, mirroring +-- @Geolog.DB.InMemory.toFlatArgs@: @qaValues@ keys map to positions +-- @0..n-2@, @qaRowId@ (if present) maps to position @n-1@, and any +-- missing positions become wildcard variables with locally-unique names. +flattenAtom :: Int -> Int -> QAtom -> [Aeson.Value] +flattenAtom atomIdx arity qa = + [ encodeTerm (Map.findWithDefault (wildcard atomIdx pos) pos merged) + | pos <- [0 .. arity - 1] + ] + where + merged = case qa.qaRowId of + Nothing -> qa.qaValues + Just v -> Map.insert (arity - 1) v qa.qaValues + wildcard a p = QVar (Var (T.pack ("_w" <> show a <> "_" <> show p))) + +encodeAtom :: Map IR.Path IR.Table -> Int -> QAtom -> Aeson.Value +encodeAtom tables atomIdx qa = + Aeson.object + [ "table" .= pathText qa.qaTable + , "columns" .= flattenAtom atomIdx arity qa + ] + where + arity = case Map.lookup qa.qaTable tables of + Just t -> length t.columns + Nothing -> error ("encodeAtom: unknown table " <> show qa.qaTable) + +-- | Stable atom indexing keyed by atom identity, so the wildcard names in +-- @flattenAtom@ are deterministic across runs even if the planner's node +-- ordering changes. +atomIndex :: [QAtom] -> Map QAtom Int +atomIndex atoms = Map.fromList (zip (Set.toList (Set.fromList atoms)) [0 ..]) + +encodeJoinOp :: JoinType -> Aeson.Value +encodeJoinOp = \case + LeftJoin -> "left" + RightJoin -> "right" + NaturalJoin -> "natural" + +encodeNode :: Map IR.Path IR.Table -> Map QAtom Int -> PlanNode -> Aeson.Value +encodeNode tables idx n = + Aeson.object + [ "id" .= n.graphId.unPlanNodeId + , "action" .= case n.action of + PlanEvalAtom qa -> + let i = Map.findWithDefault 0 qa idx + in Aeson.object ["scan" .= encodeAtom tables i qa] + PlanJoin jt (PlanNodeId a) (PlanNodeId b) -> + Aeson.object + [ "join" + .= Aeson.object + [ "op" .= encodeJoinOp jt + , "left" .= a + , "right" .= b + ] + ] + ] + +-- | Render a 'PlanGraph' as the JSON the runner consumes. Empty graphs +-- produce @{"root": 0, "nodes": []}@, which the runner treats as a +-- well-formed but empty query. +encodeQuery :: Map IR.Path IR.Table -> Map QAtom Int -> PlanGraph -> Aeson.Value +encodeQuery tables idx (PlanGraph g) + | null nodes = + Aeson.object ["root" .= (0 :: Int), "nodes" .= ([] :: [Aeson.Value])] + | otherwise = + Aeson.object + [ "root" .= rootId + , "nodes" .= map (encodeNode tables idx) nodes + ] + where + nodes = sortOn (.graphId.unPlanNodeId) (AG.vertexList g) + rootId = case graphRoot (PlanGraph g) of + Just (PlanNodeId i) -> i + -- Non-empty graph with no topological root means a cycle, which + -- planConjunction never produces. Fall back to the last id rather + -- than crashing so a bug here is still inspectable. + Nothing -> (.graphId.unPlanNodeId) (last nodes) + +encodePlan :: Scenario -> Aeson.Value +encodePlan sc = + Aeson.object + [ "_scenario" .= sc.scName + , "schema" .= Aeson.object + [pathKey p .= length t.columns | (p, t) <- Map.toList sc.scTheory.tables] + , "facts" .= Aeson.object + [pathKey p .= map (map encodeValue) rows | (p, rows) <- groupedFacts sc.scFacts] + , "query" .= encodeQuery sc.scTheory.tables (atomIndex sc.scAtoms) (planConjunction sc.scAtoms) + ] + +-- | Group facts by table while preserving table-first-seen order and +-- per-table insertion order. +groupedFacts :: [(IR.Path, [Val])] -> [(IR.Path, [[Val]])] +groupedFacts = go [] + where + go acc [] = reverse [(p, reverse rs) | (p, rs) <- acc] + go acc ((p, row) : rest) = + let acc' = case break (\(q, _) -> q == p) acc of + (before, (q, rs) : after) -> before ++ (q, row : rs) : after + (before, []) -> before ++ [(p, [row])] + in go acc' rest + +-- * Self-check +-- +-- Run the planner's @evalConjunctionPlanned@ against the scenario's DB +-- to confirm the plan we're about to emit is well-formed and produces +-- non-error output. Catches malformed scenarios before they hand a bad +-- plan to the Rust runner. + +selfCheck :: Scenario -> IO () +selfCheck sc = do + let db = foldl (\d (p, row) -> insertRow p row d) (fromTheory sc.scTheory) sc.scFacts + case evalConjunctionPlanned db sc.scAtoms of + Left err -> die ("self-check failed for " <> sc.scName <> ": " <> show err) + Right _ -> pure () + +-- * Entry point + +main :: IO () +main = do + args <- getArgs + case args of + [name] -> case lookup name [(s.scName, s) | s <- scenarios] of + Just sc -> do + selfCheck sc + LBS8.putStrLn (AesonPretty.encodePretty (encodePlan sc)) + Nothing -> + die ("unknown scenario: " <> name <> "\navailable: " <> unwords (map (.scName) scenarios)) + _ -> do + hPutStrLn stderr "usage: glog-export " + hPutStrLn stderr ("scenarios: " <> unwords (map (.scName) scenarios)) + die ""