Add the infrastructure to run the plan on Rust

This commit is contained in:
Hassan Abedi 2026-06-05 11:20:50 +02:00
parent 693151b89e
commit 510662e7c9
13 changed files with 1006 additions and 10 deletions

1
.gitignore vendored
View File

@ -81,3 +81,4 @@ tarpaulin-report.html
.claude/ .claude/
.codex .codex
.agents/ .agents/
dist-newstyle/

2
.gitmodules vendored
View File

@ -5,4 +5,4 @@
[submodule "external/geolog"] [submodule "external/geolog"]
path = external/geolog path = external/geolog
url = gitlab@git.sgai.uk:creators/geolog.git url = gitlab@git.sgai.uk:creators/geolog.git
branch = query-plan-ir-draft-1 branch = query-plan-algebraic

View File

@ -76,6 +76,23 @@ clean: ## Remove build output
cargo clean; \ cargo clean; \
fi fi
EXPORTER_DIR := tools/exporter
EXPORTER_FIXTURES := crates/glog-runner/fixtures
EXPORTER_SCENARIOS := three-atom-chain
.PHONY: export-fixtures
export-fixtures: ## Regenerate JSON plan fixtures from the Haskell exporter (needs Cabal and GHC; use `make shell` first).
@if ! command -v cabal >/dev/null 2>&1; then \
echo "cabal not found. Enter the dev shell with 'make shell' (or 'nix develop') first."; \
exit 1; \
fi
@cd $(EXPORTER_DIR) && cabal build glog-export
@for sc in $(EXPORTER_SCENARIOS); do \
out=$(EXPORTER_FIXTURES)/$$(echo $$sc | tr '-' '_').json; \
echo "exporting $$sc -> $$out"; \
(cd $(EXPORTER_DIR) && cabal run -v0 glog-export -- $$sc) > $$out; \
done
.PHONY: shell .PHONY: shell
shell: ## Enter the Nix dev shell defined in flake.nix shell: ## Enter the Nix dev shell defined in flake.nix
@nix develop @nix develop

View File

@ -0,0 +1,19 @@
[package]
name = "glog-runner"
version = "0.1.0"
edition.workspace = true
license.workspace = true
rust-version.workspace = true
[lints]
workspace = true
[dependencies]
storage = { path = "../storage" }
query-ops = { path = "../query-ops" }
serde = { version = "1", features = ["derive"] }
serde_json = "1"
[[bin]]
name = "glog-run"
path = "src/main.rs"

View File

@ -0,0 +1,166 @@
{
"_scenario": "three-atom-chain",
"facts": {
"edge": [
[
{
"str": "node:1"
},
{
"str": "node:2"
},
{
"str": "edge:1"
}
],
[
{
"str": "node:2"
},
{
"str": "node:3"
},
{
"str": "edge:2"
}
]
],
"node": [
[
{
"str": "node:1"
}
],
[
{
"str": "node:2"
}
],
[
{
"str": "node:3"
}
]
]
},
"query": {
"nodes": [
{
"action": {
"scan": {
"columns": [
{
"var": "a"
},
{
"var": "b"
},
{
"var": "_w0_2"
}
],
"table": "edge"
}
},
"id": 1
},
{
"action": {
"scan": {
"columns": [
{
"var": "b"
},
{
"var": "c"
},
{
"var": "_w1_2"
}
],
"table": "edge"
}
},
"id": 2
},
{
"action": {
"scan": {
"columns": [
{
"var": "a"
}
],
"table": "node"
}
},
"id": 3
},
{
"action": {
"join": {
"left": 1,
"op": "left",
"right": 3
}
},
"id": 4
},
{
"action": {
"join": {
"left": 2,
"op": "left",
"right": 4
}
},
"id": 5
},
{
"action": {
"join": {
"left": 5,
"op": "right",
"right": 4
}
},
"id": 6
},
{
"action": {
"join": {
"left": 6,
"op": "right",
"right": 3
}
},
"id": 7
},
{
"action": {
"join": {
"left": 6,
"op": "natural",
"right": 7
}
},
"id": 8
},
{
"action": {
"join": {
"left": 5,
"op": "natural",
"right": 8
}
},
"id": 9
}
],
"root": 9
},
"schema": {
"edge": 3,
"node": 1
}
}

View File

@ -0,0 +1,344 @@
//! End-to-end runner that executes a `geolog-lang` conjunctive-query plan
//! against this workspace's storage and `query-ops` operators.
//!
//! The upstream Haskell planner in `external/geolog/geolog-lang`
//! (`Geolog.DB.Plan`) builds a Yannakakis-style join DAG over `QAtom`s. This
//! crate accepts that DAG as JSON, materializes the input relations through
//! the [`Storage`] trait, and walks the DAG using
//! [`query_ops::atom::scan_atom`], [`query_ops::join::semijoin`], and
//! [`query_ops::join::natural_join`]. The result is a binding
//! [`Relation`](query_ops::relation::Relation) over the query's variables.
//!
//! The JSON IR mirrors `Geolog.DB.Plan.JoinPlan` and `Geolog.DB.InMemory.QAtom`
//! without depending on the Haskell side at build time. A Haskell exporter
//! that dumps `(schema, facts, JoinPlan)` to this shape is the planned
//! follow-up that completes the round trip; the IR is the contract.
//!
//! Mapping from the Haskell planner:
//!
//! | `Geolog.DB.Plan` | this crate |
//! |-----------------------------|-----------------------------------------------|
//! | `PlanEvalAtom` | [`Action::Scan`] → `scan_atom` |
//! | `PlanJoin LeftJoin a b` | [`Action::Join`] with [`JoinOp::Left`] → `semijoin(rel[a], rel[b])` |
//! | `PlanJoin RightJoin a b` | [`Action::Join`] with [`JoinOp::Right`] → `semijoin(rel[b], rel[a])` |
//! | `PlanJoin NaturalJoin a b` | [`Action::Join`] with [`JoinOp::Natural`] → `natural_join(rel[a], rel[b])` |
//!
//! The atom side covers `evalAtom` (`Geolog.DB.InMemory`): a [`Term::Var`]
//! repeated across positions enforces equality, [`Term::Lit`] filters by
//! constant, and distinct variables project in first-occurrence order.
use std::collections::HashMap;
use serde::Deserialize;
use query_ops::atom::{AtomPattern, Term, scan_atom};
use query_ops::join::{natural_join, semijoin};
use query_ops::relation::Relation;
use storage::value::Value;
use storage::{MemoryStorage, Storage, StorageError, scan_as_table};
/// A single fixture: schema, ground facts, and a query plan to execute.
#[derive(Debug, Clone, Deserialize)]
pub struct Plan {
/// Relation name → arity (column count).
pub schema: HashMap<String, usize>,
/// Relation name → list of ground tuples to insert before execution.
pub facts: HashMap<String, Vec<Vec<JsonValue>>>,
/// The join DAG itself.
pub query: Query,
}
/// Mirrors `Geolog.DB.Plan.JoinPlan`: a set of nodes plus the id of the
/// rooted result node.
#[derive(Debug, Clone, Deserialize)]
pub struct Query {
pub root: u32,
pub nodes: Vec<Node>,
}
/// One node of the plan DAG. `id`s are dense within a `Query` but don't need
/// to start at any particular value, mirroring the Haskell `PlanNodeId`.
#[derive(Debug, Clone, Deserialize)]
pub struct Node {
pub id: u32,
pub action: Action,
}
/// What to compute at a node. Tagged externally so JSON reads as
/// `{"action": {"scan": {...}}}` or `{"action": {"join": {...}}}`.
#[derive(Debug, Clone, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum Action {
Scan(Atom),
Join(Join),
}
/// A flat atom pattern, one entry per column of the target relation.
/// Matches the `toFlatArgs` view used by `Geolog.DB.InMemory.evalAtom`:
/// `qaValues` positions are filled in directly, and the entity-id column
/// (if any) appears at the last position. Wildcard positions in the
/// Haskell `QAtom` (a `Map Int QVal` with a missing key) translate to a
/// fresh, unique variable name on this side, which the operator binds but
/// never joins against.
#[derive(Debug, Clone, Deserialize)]
pub struct Atom {
pub table: String,
pub columns: Vec<JsonTerm>,
}
#[derive(Debug, Clone, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum JsonTerm {
Var(String),
Lit(JsonValue),
}
/// Wire-level value tag. Restricted to what `storage::value::Value` carries.
/// Entity identities from the Haskell side (`ValEntity path id`) round-trip
/// through `Str` for now using a `"path:id"` convention; that's a fixture
/// concern, not a runner concern.
#[derive(Debug, Clone, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum JsonValue {
Int(i64),
Str(String),
}
#[derive(Debug, Clone, Deserialize)]
pub struct Join {
pub op: JoinOp,
pub left: u32,
pub right: u32,
}
#[derive(Debug, Clone, Copy, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum JoinOp {
/// `Geolog.DB.Plan.LeftJoin`: result is `left` rows whose shared columns
/// appear in `right`. Lowered to `semijoin(left, right)`.
Left,
/// `Geolog.DB.Plan.RightJoin`: result is `right` rows whose shared
/// columns appear in `left`. Lowered to `semijoin(right, left)`.
Right,
/// `Geolog.DB.Plan.NaturalJoin`. Lowered to `natural_join(left, right)`.
Natural,
}
/// Errors a runner can produce in addition to storage failures.
#[derive(Debug)]
pub enum RunError {
/// A fact references a relation that isn't declared in `schema`.
UnknownRelation(String),
/// A node id appears in a `Join` action but no node with that id exists.
MissingNode(u32),
/// `Query.root` doesn't match any node in `nodes`.
MissingRoot(u32),
/// Two nodes share the same id.
DuplicateNode(u32),
/// A join node references its left or right side before that side has
/// been computed: the DAG isn't actually topologically sorted by id, or
/// it has a cycle.
UnresolvedDependency { node: u32, depends_on: u32 },
/// Storage layer rejected an operation.
Storage(StorageError),
}
impl std::fmt::Display for RunError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::UnknownRelation(name) => {
write!(f, "facts reference relation {name:?} not in schema")
}
Self::MissingNode(id) => write!(f, "plan references missing node id {id}"),
Self::MissingRoot(id) => write!(f, "plan root id {id} matches no node"),
Self::DuplicateNode(id) => write!(f, "duplicate node id {id} in plan"),
Self::UnresolvedDependency { node, depends_on } => write!(
f,
"node {node} depends on {depends_on}, which has not been computed yet"
),
Self::Storage(err) => write!(f, "storage error: {err}"),
}
}
}
impl std::error::Error for RunError {
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
match self {
Self::Storage(err) => Some(err),
_ => None,
}
}
}
impl From<StorageError> for RunError {
fn from(err: StorageError) -> Self {
Self::Storage(err)
}
}
impl From<JsonValue> for Value {
fn from(jv: JsonValue) -> Self {
match jv {
JsonValue::Int(n) => Self::Int(n),
JsonValue::Str(s) => Self::Str(s),
}
}
}
impl From<JsonTerm> for Term {
fn from(t: JsonTerm) -> Self {
match t {
JsonTerm::Var(name) => Self::Var(name),
JsonTerm::Lit(value) => Self::Lit(value.into()),
}
}
}
/// Parse a [`Plan`] from a JSON string.
///
/// # Errors
/// Returns a [`serde_json::Error`] if the input isn't valid JSON in the
/// expected shape.
pub fn parse_plan(json: &str) -> Result<Plan, serde_json::Error> {
serde_json::from_str(json)
}
/// Load schema and facts from a [`Plan`] into a fresh [`MemoryStorage`].
///
/// All facts are inserted in a single transaction; commit is atomic so a
/// failure on row N leaves the storage empty.
///
/// # Errors
/// Returns [`RunError::UnknownRelation`] if facts mention a relation not
/// declared in `schema`. Wraps storage failures (arity mismatch, transaction
/// errors) in [`RunError::Storage`].
pub fn load_into_memory(plan: &Plan) -> Result<MemoryStorage, RunError> {
let mut storage = MemoryStorage::default();
for (name, arity) in &plan.schema {
storage.create_relation(name, *arity)?;
}
{
let mut tx = storage.transaction()?;
for (name, rows) in &plan.facts {
if !plan.schema.contains_key(name) {
return Err(RunError::UnknownRelation(name.clone()));
}
for row in rows {
let cells: Vec<Value> = row.iter().cloned().map(Value::from).collect();
tx.insert(name, cells)?;
}
}
let _ = tx.commit()?;
}
Ok(storage)
}
/// Execute a plan against a storage backend, returning the bindings
/// [`Relation`] for the rooted plan node.
///
/// Nodes are executed in ascending `id` order. For a Yannakakis plan as
/// emitted by `Geolog.DB.Plan` this is equivalent to a topological sort,
/// since `insertJoin` only references node ids that have already been
/// allocated. A non-monotone id ordering is rejected with
/// [`RunError::UnresolvedDependency`].
///
/// # Errors
/// Returns [`RunError::DuplicateNode`] for repeated ids,
/// [`RunError::MissingNode`] for join references to unknown ids,
/// [`RunError::MissingRoot`] if `query.root` isn't present, and storage
/// errors during the per-scan `scan_as_table` call.
pub fn execute<S: Storage>(storage: &S, query: &Query) -> Result<Relation, RunError> {
let mut seen_ids: std::collections::HashSet<u32> =
std::collections::HashSet::with_capacity(query.nodes.len());
for node in &query.nodes {
if !seen_ids.insert(node.id) {
return Err(RunError::DuplicateNode(node.id));
}
}
if !seen_ids.contains(&query.root) {
return Err(RunError::MissingRoot(query.root));
}
let mut ordered: Vec<&Node> = query.nodes.iter().collect();
ordered.sort_by_key(|n| n.id);
let mut results: HashMap<u32, Relation> = HashMap::with_capacity(ordered.len());
for node in ordered {
let computed = match &node.action {
Action::Scan(atom) => {
let table = scan_as_table(storage, &atom.table)?;
let pattern = AtomPattern {
columns: atom.columns.iter().cloned().map(Term::from).collect(),
};
scan_atom(&table, &pattern)
}
Action::Join(join) => {
let left = require_dep(&results, &seen_ids, node.id, join.left)?;
let right = require_dep(&results, &seen_ids, node.id, join.right)?;
match join.op {
JoinOp::Left => semijoin(left, right),
JoinOp::Right => semijoin(right, left),
JoinOp::Natural => natural_join(left, right),
}
}
};
results.insert(node.id, computed);
}
results
.remove(&query.root)
.ok_or(RunError::MissingRoot(query.root))
}
fn require_dep<'a>(
results: &'a HashMap<u32, Relation>,
seen: &std::collections::HashSet<u32>,
node: u32,
depends_on: u32,
) -> Result<&'a Relation, RunError> {
if let Some(rel) = results.get(&depends_on) {
Ok(rel)
} else if seen.contains(&depends_on) {
Err(RunError::UnresolvedDependency { node, depends_on })
} else {
Err(RunError::MissingNode(depends_on))
}
}
/// Convenience: parse JSON, load it into a fresh in-memory storage, and
/// execute, returning the root binding relation.
///
/// # Errors
/// Returns a JSON parse error if the input is malformed, or a [`RunError`]
/// for any later step.
pub fn run_json(json: &str) -> Result<Relation, RunFromJsonError> {
let plan = parse_plan(json).map_err(RunFromJsonError::Parse)?;
let storage = load_into_memory(&plan).map_err(RunFromJsonError::Run)?;
let bindings = execute(&storage, &plan.query).map_err(RunFromJsonError::Run)?;
Ok(bindings)
}
/// Combined error from [`run_json`].
#[derive(Debug)]
pub enum RunFromJsonError {
Parse(serde_json::Error),
Run(RunError),
}
impl std::fmt::Display for RunFromJsonError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::Parse(err) => write!(f, "parse error: {err}"),
Self::Run(err) => write!(f, "run error: {err}"),
}
}
}
impl std::error::Error for RunFromJsonError {
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
match self {
Self::Parse(err) => Some(err),
Self::Run(err) => Some(err),
}
}
}

View File

@ -0,0 +1,59 @@
//! `glog-run` CLI: read a JSON plan from a file (or stdin if `-`), execute
//! it against a fresh in-memory store, and print the resulting binding
//! relation as JSON on stdout.
use std::io::{self, Read};
use std::process::ExitCode;
fn main() -> ExitCode {
let mut args = std::env::args().skip(1);
let Some(path) = args.next() else {
eprintln!("usage: glog-run <plan.json | ->");
return ExitCode::from(2);
};
let input = match read_input(&path) {
Ok(s) => s,
Err(err) => {
eprintln!("failed to read {path}: {err}");
return ExitCode::from(1);
}
};
let relation = match glog_runner::run_json(&input) {
Ok(r) => r,
Err(err) => {
eprintln!("{err}");
return ExitCode::from(1);
}
};
let payload = serde_json::json!({
"columns": relation.columns,
"rows": relation
.rows
.iter()
.map(|row| row.iter().map(value_to_json).collect::<Vec<_>>())
.collect::<Vec<_>>(),
});
println!("{payload}");
ExitCode::SUCCESS
}
fn read_input(path: &str) -> io::Result<String> {
if path == "-" {
let mut buf = String::new();
io::stdin().read_to_string(&mut buf)?;
Ok(buf)
} else {
std::fs::read_to_string(path)
}
}
fn value_to_json(value: &storage::value::Value) -> serde_json::Value {
match value {
storage::value::Value::Int(n) => serde_json::Value::Number((*n).into()),
storage::value::Value::Str(s) => serde_json::Value::String(s.clone()),
storage::value::Value::Id(id) => serde_json::Value::String(id.to_string()),
}
}

View File

@ -0,0 +1,73 @@
//! End-to-end check: run the JSON fixture and verify the resulting bindings
//! match the `DB.InMemoryTest` "matches evalConjunction on three-atom chain"
//! case from `external/geolog/geolog-lang/test/DB/InMemoryTest.hs`.
//!
//! For `node = {e1, e2, e3}` and `edge = {(e1,e2,ee1), (e2,e3,ee2)}` the
//! conjunction `node(a), edge(a, b, _), edge(b, c, _)` has exactly one
//! solution: `(a=e1, b=e2, c=e3)`.
use std::collections::BTreeMap;
use glog_runner::run_json;
use storage::value::Value;
fn fixture() -> &'static str {
include_str!("../fixtures/three_atom_chain.json")
}
fn ent(path: &str, id: u32) -> Value {
Value::Str(format!("{path}:{id}"))
}
fn project<'a>(
columns: &'a [String],
row: &'a [Value],
keep: &'a [&'a str],
) -> BTreeMap<&'a str, &'a Value> {
keep.iter()
.map(|name| {
let pos = columns
.iter()
.position(|c| c == name)
.expect("column missing");
(*name, &row[pos])
})
.collect()
}
#[test]
fn three_atom_chain_matches_haskell_oracle() {
let result = run_json(fixture()).expect("fixture should execute");
// The plan's root keeps every variable, including the per-atom wildcards
// `_r1` and `_r2`. The oracle only asserts the (a, b, c) projection.
let keep = ["a", "b", "c"];
let mut projected: Vec<BTreeMap<&str, &Value>> = result
.rows
.iter()
.map(|row| project(&result.columns, row, &keep))
.collect();
projected.sort_by_key(|m| format!("{m:?}"));
let e1 = ent("node", 1);
let e2 = ent("node", 2);
let e3 = ent("node", 3);
let expected = vec![BTreeMap::from([("a", &e1), ("b", &e2), ("c", &e3)])];
assert_eq!(projected, expected);
}
#[test]
fn root_columns_cover_a_b_c_plus_two_wildcards() {
// The exporter emits unique wildcard variable names for the entity-id
// column of each edge atom (e.g. `_w0_2`, `_w1_2`); their exact spelling
// is an implementation detail of the exporter, so this test only checks
// that the named variables are all present and that the total column
// count is the three named ones plus two anonymous wildcards.
let result = run_json(fixture()).expect("fixture should execute");
let cols: std::collections::HashSet<&str> = result.columns.iter().map(String::as_str).collect();
for expected in ["a", "b", "c"] {
assert!(cols.contains(expected), "missing column {expected}");
}
assert_eq!(result.columns.len(), 5, "expected 3 named + 2 wildcards");
}

2
external/geolog vendored

@ -1 +1 @@
Subproject commit 99d4006f4655d8a6815a9156fe4d9304515f356d Subproject commit 426d4c96d6031ccaf5e14c12c3dab496e3b4c365

View File

@ -1,5 +1,5 @@
{ {
description = "Storage engine playground: Rust workspace for FlowLog, DBSP, CRDT, and Geomerge experiments."; description = "Storage engine playground";
inputs = { inputs = {
nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
@ -29,12 +29,13 @@
packages = [ packages = [
rustToolchain rustToolchain
# Diagram regeneration in crates/geomerge-demo/docs/diagrams.
pkgs.graphviz pkgs.graphviz
# Cargo helpers.
pkgs.cargo-watch pkgs.cargo-watch
pkgs.cargo-nextest pkgs.cargo-nextest
# Pre-commit hooks (see .pre-commit-config.yaml, Makefile setup-hooks). pkgs.haskell.compiler.ghc912
pkgs.cabal-install
pkgs.pkg-config
pkgs.zlib
pkgs.pre-commit pkgs.pre-commit
pkgs.python3 pkgs.python3
]; ];
@ -44,10 +45,14 @@
}; };
shellHook = '' shellHook = ''
echo "storage-engine-playground dev shell" # Banner goes to stderr so `nix develop --command` invocations
echo " rustc: $(rustc --version)" # that pipe stdout (e.g. tools/exporter producing JSON) stay clean.
echo " cargo: $(cargo --version)" >&2 echo "storage-engine-playground dev shell"
echo " dot: $(dot -V 2>&1)" >&2 echo " rustc: $(rustc --version)"
>&2 echo " cargo: $(cargo --version)"
>&2 echo " ghc: $(ghc --version)"
>&2 echo " cabal: $(cabal --version | head -1)"
>&2 echo " dot: $(dot -V 2>&1)"
''; '';
}; };

View File

@ -0,0 +1,24 @@
-- cabal.project for the geolog -> Rust JSON exporter.
--
-- This file points at the geolog-lang library inside the external/geolog
-- submodule, plus the sibling packages it depends on (data-partition,
-- diagnostician, fnotation). It mirrors the submodule's own cabal.project
-- so the exporter sees the same source set the submodule's tests build
-- against.
packages:
glog-exporter.cabal
../../external/geolog/geolog-lang/geolog-lang.cabal
../../external/geolog/data-partition/data-partition.cabal
../../external/geolog/diagnostician/diagnostician.cabal
../../external/geolog/fnotation/fnotation.cabal
-- geolog-lang's DB.Plan.Render module uses a patched diagrams-graphviz.
-- Same pin as external/geolog/cabal.project.
source-repository-package
type: git
location: https://github.com/georgefst/diagrams-graphviz.git
tag: 993533c564861f9d0663d719eafd56efd95f59ba
jobs: $ncpus
semaphore: true

View File

@ -0,0 +1,36 @@
cabal-version: 3.4
name: glog-exporter
version: 0.1.0.0
license: MIT OR Apache-2.0
author: storage-engine-playground
synopsis: Export geolog-lang join plans as JSON for the Rust runner.
description:
Builds a FlatTheory + facts + a list of QAtoms for a named scenario,
runs Geolog.DB.Plan.planConjunction, and emits a JSON document that
crates/glog-runner consumes. This allows the playground use query-ops and
storage end-to-end with a real Yannakakis plan produced by the geolog
frontend, not a hand-written fixture.
build-type: Simple
executable glog-export
main-is: Main.hs
hs-source-dirs: src
default-language: GHC2024
default-extensions:
BlockArguments
LambdaCase
OverloadedRecordDot
OverloadedStrings
ghc-options: -Wall
build-depends:
, aeson >=2.2
, aeson-pretty >=0.8
, algebraic-graphs >=0.7
, base
, bytestring
, containers
, geolog-lang
, text

252
tools/exporter/src/Main.hs Normal file
View File

@ -0,0 +1,252 @@
-- | Exports a geolog-lang join plan as JSON for the Rust runner in
-- @crates/glog-runner@.
--
-- Invocation:
--
-- @
-- cabal run glog-export -- <scenario> > plan.json
-- @
--
-- Available scenarios: @three-atom-chain@.
--
-- The output shape is documented in @crates\/glog-runner\/src\/lib.rs@.
-- This program is the canonical producer: any change to the IR should
-- start here, with the Rust runner updated to match.
module Main (main) where
import Algebra.Graph qualified as AG
import Data.Aeson ((.=))
import Data.Aeson qualified as Aeson
import Data.Aeson.Encode.Pretty qualified as AesonPretty
import Data.Aeson.Key qualified as Key
import Data.ByteString.Lazy.Char8 qualified as LBS8
import Data.List (sortOn)
import Data.Map.Strict (Map)
import Data.Map.Strict qualified as Map
import Data.Set qualified as Set
import Data.Text (Text)
import Data.Text qualified as T
import Geolog.DB.InMemory
import Geolog.DB.Plan
import Geolog.IR qualified as IR
import System.Environment (getArgs)
import System.Exit (die)
import System.IO (hPutStrLn, stderr)
-- * Scenario plumbing
--
-- A scenario fixes a schema, a set of ground facts, and a conjunction of
-- query atoms. The exporter is intentionally code-driven (not @.glog@
-- driven): @.glog@ files declare theories, not queries, so the query
-- side has to live in Haskell either way.
data Scenario = Scenario
{ scName :: String
, scTheory :: IR.FlatTheory
, scFacts :: [(IR.Path, [Val])]
, scAtoms :: [QAtom]
}
-- * three-atom-chain
--
-- Mirrors @DB.InMemoryTest@ "matches evalConjunction on three-atom chain".
-- node = {e1, e2, e3}, edge = {(e1,e2,ee1), (e2,e3,ee2)}.
-- Conjunction: node(a), edge(a, b, _), edge(b, c, _).
nodePath, edgePath :: IR.Path
nodePath = ["node"]
edgePath = ["edge"]
threeAtomChain :: Scenario
threeAtomChain =
Scenario
{ scName = "three-atom-chain"
, scTheory =
IR.FlatTheory
{ tables =
Map.fromList
[ (nodePath, IR.Table {columns = [IR.EntityType nodePath], primaryKey = Nothing})
, (edgePath, IR.Table {columns = [IR.EntityType nodePath, IR.EntityType nodePath, IR.EntityType edgePath], primaryKey = Nothing})
]
, laws = Map.empty
}
, scFacts =
[ (nodePath, [ValEntity nodePath 1])
, (nodePath, [ValEntity nodePath 2])
, (nodePath, [ValEntity nodePath 3])
, (edgePath, [ValEntity nodePath 1, ValEntity nodePath 2, ValEntity edgePath 1])
, (edgePath, [ValEntity nodePath 2, ValEntity nodePath 3, ValEntity edgePath 2])
]
, scAtoms =
[ QAtom {qaTable = nodePath, qaRowId = Nothing, qaValues = Map.singleton 0 (QVar (Var "a"))}
, QAtom {qaTable = edgePath, qaRowId = Nothing, qaValues = Map.fromList [(0, QVar (Var "a")), (1, QVar (Var "b"))]}
, QAtom {qaTable = edgePath, qaRowId = Nothing, qaValues = Map.fromList [(0, QVar (Var "b")), (1, QVar (Var "c"))]}
]
}
scenarios :: [Scenario]
scenarios = [threeAtomChain]
-- * JSON encoding
--
-- The shape mirrors the IR in @crates/glog-runner/src/lib.rs@:
--
-- > {
-- > "schema": {<name>: <arity>, ...},
-- > "facts": {<name>: [[<value>, ...], ...], ...},
-- > "query": {"root": <id>, "nodes": [{"id": <id>, "action": <action>}, ...]}
-- > }
-- | Render a 'Geolog.IR.Path' (a list of 'FNotation.Names.Name') as a flat
-- string for use as a relation name on the Rust side. Each 'Name' is
-- already shown with @\/@ between its own init segments and last, so we
-- reuse 'show' and join Names with @\/@ too.
pathText :: IR.Path -> Text
pathText = T.intercalate "/" . map (T.pack . show)
pathKey :: IR.Path -> Aeson.Key
pathKey = Key.fromText . pathText
encodeValue :: Val -> Aeson.Value
encodeValue =
Aeson.object . pure . \case
ValInt n -> "int" .= n
ValText t -> "str" .= t
ValEntity p n -> "str" .= (pathText p <> ":" <> T.pack (show n))
encodeTerm :: QVal -> Aeson.Value
encodeTerm = \case
QVar (Var name) -> Aeson.object ["var" .= name]
QLit v -> Aeson.object ["lit" .= encodeValue v]
-- | Flatten an atom into one term per stored column, mirroring
-- @Geolog.DB.InMemory.toFlatArgs@: @qaValues@ keys map to positions
-- @0..n-2@, @qaRowId@ (if present) maps to position @n-1@, and any
-- missing positions become wildcard variables with locally-unique names.
flattenAtom :: Int -> Int -> QAtom -> [Aeson.Value]
flattenAtom atomIdx arity qa =
[ encodeTerm (Map.findWithDefault (wildcard atomIdx pos) pos merged)
| pos <- [0 .. arity - 1]
]
where
merged = case qa.qaRowId of
Nothing -> qa.qaValues
Just v -> Map.insert (arity - 1) v qa.qaValues
wildcard a p = QVar (Var (T.pack ("_w" <> show a <> "_" <> show p)))
encodeAtom :: Map IR.Path IR.Table -> Int -> QAtom -> Aeson.Value
encodeAtom tables atomIdx qa =
Aeson.object
[ "table" .= pathText qa.qaTable
, "columns" .= flattenAtom atomIdx arity qa
]
where
arity = case Map.lookup qa.qaTable tables of
Just t -> length t.columns
Nothing -> error ("encodeAtom: unknown table " <> show qa.qaTable)
-- | Stable atom indexing keyed by atom identity, so the wildcard names in
-- @flattenAtom@ are deterministic across runs even if the planner's node
-- ordering changes.
atomIndex :: [QAtom] -> Map QAtom Int
atomIndex atoms = Map.fromList (zip (Set.toList (Set.fromList atoms)) [0 ..])
encodeJoinOp :: JoinType -> Aeson.Value
encodeJoinOp = \case
LeftJoin -> "left"
RightJoin -> "right"
NaturalJoin -> "natural"
encodeNode :: Map IR.Path IR.Table -> Map QAtom Int -> PlanNode -> Aeson.Value
encodeNode tables idx n =
Aeson.object
[ "id" .= n.graphId.unPlanNodeId
, "action" .= case n.action of
PlanEvalAtom qa ->
let i = Map.findWithDefault 0 qa idx
in Aeson.object ["scan" .= encodeAtom tables i qa]
PlanJoin jt (PlanNodeId a) (PlanNodeId b) ->
Aeson.object
[ "join"
.= Aeson.object
[ "op" .= encodeJoinOp jt
, "left" .= a
, "right" .= b
]
]
]
-- | Render a 'PlanGraph' as the JSON the runner consumes. Empty graphs
-- produce @{"root": 0, "nodes": []}@, which the runner treats as a
-- well-formed but empty query.
encodeQuery :: Map IR.Path IR.Table -> Map QAtom Int -> PlanGraph -> Aeson.Value
encodeQuery tables idx (PlanGraph g)
| null nodes =
Aeson.object ["root" .= (0 :: Int), "nodes" .= ([] :: [Aeson.Value])]
| otherwise =
Aeson.object
[ "root" .= rootId
, "nodes" .= map (encodeNode tables idx) nodes
]
where
nodes = sortOn (.graphId.unPlanNodeId) (AG.vertexList g)
rootId = case graphRoot (PlanGraph g) of
Just (PlanNodeId i) -> i
-- Non-empty graph with no topological root means a cycle, which
-- planConjunction never produces. Fall back to the last id rather
-- than crashing so a bug here is still inspectable.
Nothing -> (.graphId.unPlanNodeId) (last nodes)
encodePlan :: Scenario -> Aeson.Value
encodePlan sc =
Aeson.object
[ "_scenario" .= sc.scName
, "schema" .= Aeson.object
[pathKey p .= length t.columns | (p, t) <- Map.toList sc.scTheory.tables]
, "facts" .= Aeson.object
[pathKey p .= map (map encodeValue) rows | (p, rows) <- groupedFacts sc.scFacts]
, "query" .= encodeQuery sc.scTheory.tables (atomIndex sc.scAtoms) (planConjunction sc.scAtoms)
]
-- | Group facts by table while preserving table-first-seen order and
-- per-table insertion order.
groupedFacts :: [(IR.Path, [Val])] -> [(IR.Path, [[Val]])]
groupedFacts = go []
where
go acc [] = reverse [(p, reverse rs) | (p, rs) <- acc]
go acc ((p, row) : rest) =
let acc' = case break (\(q, _) -> q == p) acc of
(before, (q, rs) : after) -> before ++ (q, row : rs) : after
(before, []) -> before ++ [(p, [row])]
in go acc' rest
-- * Self-check
--
-- Run the planner's @evalConjunctionPlanned@ against the scenario's DB
-- to confirm the plan we're about to emit is well-formed and produces
-- non-error output. Catches malformed scenarios before they hand a bad
-- plan to the Rust runner.
selfCheck :: Scenario -> IO ()
selfCheck sc = do
let db = foldl (\d (p, row) -> insertRow p row d) (fromTheory sc.scTheory) sc.scFacts
case evalConjunctionPlanned db sc.scAtoms of
Left err -> die ("self-check failed for " <> sc.scName <> ": " <> show err)
Right _ -> pure ()
-- * Entry point
main :: IO ()
main = do
args <- getArgs
case args of
[name] -> case lookup name [(s.scName, s) | s <- scenarios] of
Just sc -> do
selfCheck sc
LBS8.putStrLn (AesonPretty.encodePretty (encodePlan sc))
Nothing ->
die ("unknown scenario: " <> name <> "\navailable: " <> unwords (map (.scName) scenarios))
_ -> do
hPutStrLn stderr "usage: glog-export <scenario>"
hPutStrLn stderr ("scenarios: " <> unwords (map (.scName) scenarios))
die ""