From cb07245bd9b720af2ab04660154eb84f4cbd216c Mon Sep 17 00:00:00 2001 From: Hassan Abedi Date: Wed, 3 Jun 2026 11:48:33 +0200 Subject: [PATCH] Add the early version of `query-ops` implementation --- crates/query-ops/README.md | 135 +++++++++++ .../query-ops/docs/diagrams/make_figures.sh | 14 ++ crates/query-ops/docs/diagrams/types.dot | 60 +++++ crates/query-ops/docs/diagrams/types.svg | 85 +++++++ crates/query-ops/docs/diagrams/workflow.dot | 122 ++++++++++ crates/query-ops/docs/diagrams/workflow.svg | 159 +++++++++++++ crates/query-ops/src/atom.rs | 171 +++++++++++++- crates/query-ops/src/join.rs | 213 +++++++++++++++++- crates/query-ops/src/relation.rs | 54 +++++ crates/query-ops/src/table.rs | 15 ++ crates/query-ops/tests/hand_plan.rs | 91 ++++++++ 11 files changed, 1105 insertions(+), 14 deletions(-) create mode 100644 crates/query-ops/README.md create mode 100755 crates/query-ops/docs/diagrams/make_figures.sh create mode 100644 crates/query-ops/docs/diagrams/types.dot create mode 100644 crates/query-ops/docs/diagrams/types.svg create mode 100644 crates/query-ops/docs/diagrams/workflow.dot create mode 100644 crates/query-ops/docs/diagrams/workflow.svg create mode 100644 crates/query-ops/tests/hand_plan.rs diff --git a/crates/query-ops/README.md b/crates/query-ops/README.md new file mode 100644 index 0000000..edcd325 --- /dev/null +++ b/crates/query-ops/README.md @@ -0,0 +1,135 @@ +## Query Ops + +This crate provides a small set of query operators that can be used to implement a simple query-plan executor. +The operators are: **atom scan**, **semijoin**, and **natural join**. 
+ +### Public API + +| Item | Kind | Description | +|--------------------------------------------------|----------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `scan_atom(&Table, &AtomPattern) -> Relation` | function | Scans the table under the pattern and returns a binding relation with one column per distinct variable in first-occurrence order. Literal positions and repeated variables filter rows during the scan. | +| `semijoin(&Relation, &Relation) -> Relation` | function | Returns the rows of `left` whose values on the columns shared with `right` also appear in `right`. The output column list is the same as `left.columns`. | +| `natural_join(&Relation, &Relation) -> Relation` | function | Returns every pair of `left` and `right` rows that agree on shared columns. Each output row holds the columns of `left` followed by the non-shared columns of `right`. | +| `Table` | struct | Holds positional input rows of fixed arity and carries no column names. Construct it with `Table::new(arity)` or `Table::from_rows(arity, rows)`. | +| `AtomPattern` | struct | Specifies, for each table column, either a variable to bind or a literal value to match. The pattern is a `Vec<Term>` whose length must equal the table's arity. | +| `Term` | enum | Represents one position of an `AtomPattern`. A term is either `Var(String)` to bind the cell to a named variable, or `Lit(Value)` to require the cell to equal a given value. | +| `Relation` | struct | Holds rows over named columns and is the type produced by every operator. Construct it with `Relation::new(columns)` or `Relation::from_rows(columns, rows)`. Column names within a single relation must be unique. | +| `Value` | enum | Represents a single cell value stored in a `Table` or `Relation`. A value is either `Int(i64)` or `Str(String)`. 
| + +Data types and their relationships: + +
+ + Types + +
+ +### Example + +The rule below returns the authors of every bestseller along with the book's price. +It uses all three operators: + +- `scan_atom` for the three input tables, +- `semijoin` to keep only authors of bestsellers, +- and `natural_join` to attach each book's price. + +```prolog +Q(name, book, dollars) :- author(name, book), bestseller(book), price(book, dollars). +``` + +```rust +use query_ops::atom::{AtomPattern, Term, scan_atom}; +use query_ops::join::{natural_join, semijoin}; +use query_ops::table::Table; +use query_ops::value::Value; + +fn s(x: &str) -> Value { + Value::Str(x.to_string()) +} +fn i(x: i64) -> Value { + Value::Int(x) +} + +fn main() { + let author = Table::from_rows( + 2, + vec![ + vec![s("Alice"), s("Foo")], + vec![s("Bob"), s("Bar")], + vec![s("Alice"), s("Baz")], + vec![s("Carol"), s("Qux")], + ], + ); + let bestseller = Table::from_rows(1, vec![vec![s("Foo")], vec![s("Baz")]]); + let price = Table::from_rows( + 2, + vec![ + vec![s("Foo"), i(25)], + vec![s("Bar"), i(15)], + vec![s("Baz"), i(30)], + vec![s("Qux"), i(20)], + ], + ); + + let author_rel = scan_atom( + &author, + &AtomPattern { + columns: vec![Term::Var("name".to_string()), Term::Var("book".to_string())], + }, + ); + let bestseller_rel = scan_atom( + &bestseller, + &AtomPattern { + columns: vec![Term::Var("book".to_string())], + }, + ); + let price_rel = scan_atom( + &price, + &AtomPattern { + columns: vec![Term::Var("book".to_string()), Term::Var("dollars".to_string())], + }, + ); + + let authors_of_bestsellers = semijoin(&author_rel, &bestseller_rel); + let result = natural_join(&authors_of_bestsellers, &price_rel); + + assert_eq!( + result.columns, + vec!["name".to_string(), "book".to_string(), "dollars".to_string()], + ); + assert_eq!( + result.rows, + vec![ + vec![s("Alice"), s("Foo"), i(25)], + vec![s("Alice"), s("Baz"), i(30)], + ], + ); +} +``` + +How it works: + +
+ + Workflow + +
+ +### Run the Tests + +```sh +cargo test -p query-ops +``` + +### Notes + +- **Tables versus relations:** A `Table` is positional (fixed arity with no column names), while a `Relation` is keyed by variable names. The atom + scan is the bridge that turns one into the other (look at the example), and every join after that operates on relations. +- **Joining is by column name:** `semijoin` and `natural_join` find shared columns by matching the strings in `Relation.columns`. Whether two + relations join on a column therefore depends on the variable name you chose in each `AtomPattern`. Picking the same `Term::Var(name)` in two + patterns is what makes them join on that column. +- **No projection operator yet:** `natural_join` always carries forward every column from both inputs, and `scan_atom` keeps every distinct variable + that appears in the pattern. There is no way to drop columns from a relation today, so a result may include more columns than the Datalog rule head + implies. +- **Bulk, not streaming:** Each operator materializes its full output as a new `Relation` and returns it. Operators compose by passing the result of + one as input to the next: `natural_join(&semijoin(&a, &b), &scan_atom(&t, &p))`. diff --git a/crates/query-ops/docs/diagrams/make_figures.sh b/crates/query-ops/docs/diagrams/make_figures.sh new file mode 100755 index 0000000..6d30150 --- /dev/null +++ b/crates/query-ops/docs/diagrams/make_figures.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +# You need to have Graphviz installed to run this script +# On Debian-based OSes, you can install it using: sudo apt-get install graphviz + +# Directory containing .dot files. Defaults to the script's own directory so the +# script works regardless of the caller's working directory. 
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +ASSET_DIR=${1:-"${SCRIPT_DIR}"} + +# Make figures from .dot files +for f in "${ASSET_DIR}"/*.dot; do + dot -Tsvg "$f" -o "${f%.dot}.svg" +done diff --git a/crates/query-ops/docs/diagrams/types.dot b/crates/query-ops/docs/diagrams/types.dot new file mode 100644 index 0000000..d11a233 --- /dev/null +++ b/crates/query-ops/docs/diagrams/types.dot @@ -0,0 +1,60 @@ +digraph QueryOpsTypes { +fontname = "Helvetica,Arial,sans-serif" +layout = dot +rankdir = TB +ranksep = 0.7; +nodesep = 0.7; +splines = true; +bgcolor = "white" + +node [ +fontname = "Helvetica,Arial,sans-serif", +shape = box, +style = "filled,rounded", +color = "#555555", +fillcolor = "white", +penwidth = 1.5 +] +edge [ +fontname = "Helvetica,Arial,sans-serif", +color = "#333333", +fontsize = 9, +fontcolor = "#555555", +penwidth = 1.2 +] + +table_node [label = < + + + +
Table (struct)
arity: usize
rows: Vec<Vec<Value>>
>, fillcolor = "#E8F4FD", color = "#2196F3"] + +relation_node [label = < + + + +
Relation (struct)
columns: Vec<String>
rows: Vec<Vec<Value>>
>, fillcolor = "#ECEFF1", color = "#607D8B"] + +atom_pattern_node [label = < + + +
AtomPattern (struct)
columns: Vec<Term>
>, fillcolor = "#F3E5F5", color = "#9C27B0"] + +term_node [label = < + + + +
Term (enum)
Var(String)
Lit(Value)
>, fillcolor = "#F3E5F5", color = "#9C27B0"] + +value_node [label = < + + + +
Value (enum)
Int(i64)
Str(String)
>, fillcolor = "#FFF3E0", color = "#FF9800"] + +// composition edges: arrow X -> Y reads "X contains Y" +atom_pattern_node -> term_node [label = "Vec"] +term_node -> value_node [label = "Lit(Value)"] +table_node -> value_node [label = "Vec>"] +relation_node -> value_node [label = "Vec>"] +} diff --git a/crates/query-ops/docs/diagrams/types.svg b/crates/query-ops/docs/diagrams/types.svg new file mode 100644 index 0000000..f34a94d --- /dev/null +++ b/crates/query-ops/docs/diagrams/types.svg @@ -0,0 +1,85 @@ + + + + + + +QueryOpsTypes + + + +table_node + +Table +  (struct) +arity: usize +rows: Vec<Vec<Value>> + + + +value_node + +Value +  (enum) +Int(i64) +Str(String) + + + +table_node->value_node + + +Vec<Vec<Value>> + + + +relation_node + +Relation +  (struct) +columns: Vec<String> +rows: Vec<Vec<Value>> + + + +relation_node->value_node + + +Vec<Vec<Value>> + + + +atom_pattern_node + +AtomPattern +  (struct) +columns: Vec<Term> + + + +term_node + +Term +  (enum) +Var(String) +Lit(Value) + + + +atom_pattern_node->term_node + + +Vec<Term> + + + +term_node->value_node + + +Lit(Value) + + + diff --git a/crates/query-ops/docs/diagrams/workflow.dot b/crates/query-ops/docs/diagrams/workflow.dot new file mode 100644 index 0000000..a073ed4 --- /dev/null +++ b/crates/query-ops/docs/diagrams/workflow.dot @@ -0,0 +1,122 @@ +digraph QueryOpsHandPlan { +fontname = "Helvetica,Arial,sans-serif" +layout = dot +rankdir = LR +ranksep = 0.9; +nodesep = 0.7; +splines = true; +compound = true; +bgcolor = "white" + +node [ +fontname = "Helvetica,Arial,sans-serif", +shape = box, +style = "filled,rounded", +color = "#555555", +fillcolor = "white", +penwidth = 1.5 +] +edge [ +fontname = "Helvetica,Arial,sans-serif", +color = "#333333", +fontsize = 9, +fontcolor = "#555555", +labeldistance = 2.0, +penwidth = 1.2 +] + +subgraph cluster_inputs { +label = "Inputs (positional tables)" +style = "dashed" +color = "#888888" +fontcolor = "#555555" +margin = 18 +author_table [label = < + + + +
Table: author
• arity 2
• rows: (name, book)
>, fillcolor = "#E8F4FD", color = "#2196F3"] +bestseller_table [label = < + + + +
Table: bestseller
• arity 1
• rows: (book)
>, fillcolor = "#E8F4FD", color = "#2196F3"] +price_table [label = < + + + +
Table: price
• arity 2
• rows: (book, dollars)
>, fillcolor = "#E8F4FD", color = "#2196F3"] +} + +subgraph cluster_atoms { +label = "Atom Scans (scan_atom: Table × AtomPattern → Relation)" +style = "dashed" +color = "#9C27B0" +fontcolor = "#7B1FA2" +margin = 14 +author_rel [label = < + + + +
author_rel
pattern: [Var name, Var book]
cols: [name, book]
>, fillcolor = "#F3E5F5", color = "#9C27B0"] +bestseller_rel [label = < + + + +
bestseller_rel
pattern: [Var book]
cols: [book]
>, fillcolor = "#F3E5F5", color = "#9C27B0"] +price_rel [label = < + + + +
price_rel
pattern: [Var book, Var dollars]
cols: [book, dollars]
>, fillcolor = "#F3E5F5", color = "#9C27B0"] +} + +subgraph cluster_joins { +label = "Joins (shared cols = matching column names)" +style = "dashed" +color = "#4CAF50" +fontcolor = "#388E3C" +margin = 14 +semijoin_step [label = < + + + + +
semijoin
authors of bestsellers
shared: book
cols: [name, book]
>, fillcolor = "#E8F5E9", color = "#4CAF50"] +natural_join_step [label = < + + + + +
natural_join
attach each book's price
shared: book
cols: [name, book, dollars]
>, fillcolor = "#E8F5E9", color = "#4CAF50"] +} + +subgraph cluster_output { +label = "Output (binding relation)" +style = "dashed" +color = "#888888" +fontcolor = "#555555" +margin = 18 +result [label = < + + + +
Q result
authors of bestsellers with each book's price
cols: [name, book, dollars]
>, fillcolor = "#ECEFF1", color = "#607D8B"] +} + +// Atom scans consume tables +author_table -> author_rel [color = "#2196F3"] +bestseller_table -> bestseller_rel [color = "#2196F3"] +price_table -> price_rel [color = "#2196F3"] + +// semijoin narrows author_rel to bestseller authors +author_rel -> semijoin_step [label = "left", color = "#9C27B0"] +bestseller_rel -> semijoin_step [label = "right", color = "#9C27B0"] + +// natural_join attaches price +semijoin_step -> natural_join_step [label = "left", color = "#4CAF50"] +price_rel -> natural_join_step [label = "right", color = "#9C27B0"] + +// Final output +natural_join_step -> result [color = "#4CAF50"] +} diff --git a/crates/query-ops/docs/diagrams/workflow.svg b/crates/query-ops/docs/diagrams/workflow.svg new file mode 100644 index 0000000..f02b646 --- /dev/null +++ b/crates/query-ops/docs/diagrams/workflow.svg @@ -0,0 +1,159 @@ + + + + + + +QueryOpsHandPlan + + +cluster_inputs + +Inputs (positional tables) + + +cluster_atoms + +Atom Scans  (scan_atom: Table × AtomPattern → Relation) + + +cluster_joins + +Joins  (shared cols = matching column names) + + +cluster_output + +Output (binding relation) + + + +author_table + +Table: author +• arity 2 +• rows: (name, book) + + + +author_rel + +author_rel +pattern: [Var name, Var book] +cols: [name, book] + + + +author_table->author_rel + + + + + +bestseller_table + +Table: bestseller +• arity 1 +• rows: (book) + + + +bestseller_rel + +bestseller_rel +pattern: [Var book] +cols: [book] + + + +bestseller_table->bestseller_rel + + + + + +price_table + +Table: price +• arity 2 +• rows: (book, dollars) + + + +price_rel + +price_rel +pattern: [Var book, Var dollars] +cols: [book, dollars] + + + +price_table->price_rel + + + + + +semijoin_step + +semijoin +authors of bestsellers +shared: book +cols: [name, book] + + + +author_rel->semijoin_step + + +left + + + +bestseller_rel->semijoin_step + + +right + + + +natural_join_step + +natural_join +attach each book's price +shared: 
book +cols: [name, book, dollars] + + + +price_rel->natural_join_step + + +right + + + +semijoin_step->natural_join_step + + +left + + + +result + +Q result +authors of bestsellers with each book's price +cols: [name, book, dollars] + + + +natural_join_step->result + + + + + diff --git a/crates/query-ops/src/atom.rs b/crates/query-ops/src/atom.rs index d9b31d9..edfa9e9 100644 --- a/crates/query-ops/src/atom.rs +++ b/crates/query-ops/src/atom.rs @@ -7,6 +7,8 @@ //! self-loops). The output relation has one column per distinct variable, in //! first-occurrence order. +use std::collections::HashMap; + use crate::{relation::Relation, table::Table, value::Value}; #[derive(Debug, Clone, PartialEq, Eq)] @@ -20,10 +22,169 @@ pub struct AtomPattern { pub columns: Vec, } +/// # Panics +/// Panics if `pattern.columns.len() != table.arity`. #[must_use] -pub fn scan_atom(_table: &Table, _pattern: &AtomPattern) -> Relation { - todo!( - "scan rows, filter by repeated-variable equality and literal equality, \ - project to one column per distinct variable in first-occurrence order" - ) +pub fn scan_atom(table: &Table, pattern: &AtomPattern) -> Relation { + assert_eq!( + pattern.columns.len(), + table.arity, + "pattern arity mismatch: pattern has {}, table has {}", + pattern.columns.len(), + table.arity, + ); + + let mut output_vars: Vec = Vec::new(); + let mut output_positions: Vec = Vec::new(); + let mut equality_pairs: Vec<(usize, usize)> = Vec::new(); + let mut literal_checks: Vec<(usize, &Value)> = Vec::new(); + let mut first_position: HashMap<&str, usize> = HashMap::new(); + + for (i, term) in pattern.columns.iter().enumerate() { + match term { + Term::Var(name) => { + if let Some(&j) = first_position.get(name.as_str()) { + equality_pairs.push((j, i)); + } else { + first_position.insert(name.as_str(), i); + output_vars.push(name.clone()); + output_positions.push(i); + } + } + Term::Lit(value) => literal_checks.push((i, value)), + } + } + + let mut output = 
Relation::new(output_vars); + 'rows: for row in &table.rows { + for &(i, lit) in &literal_checks { + if &row[i] != lit { + continue 'rows; + } + } + for &(j, i) in &equality_pairs { + if row[i] != row[j] { + continue 'rows; + } + } + let projected: Vec = output_positions.iter().map(|&i| row[i].clone()).collect(); + output.push(projected); + } + output +} + +#[cfg(test)] +mod tests { + use super::*; + + fn var(name: &str) -> Term { + Term::Var(name.to_string()) + } + + fn lit(value: i64) -> Term { + Term::Lit(Value::Int(value)) + } + + fn int(value: i64) -> Value { + Value::Int(value) + } + + #[test] + fn repeated_variable_keeps_only_self_loops() { + let edge = Table::from_rows( + 2, + vec![ + vec![int(1), int(2)], + vec![int(2), int(2)], + vec![int(3), int(3)], + vec![int(1), int(1)], + ], + ); + let pattern = AtomPattern { + columns: vec![var("X"), var("X")], + }; + let result = scan_atom(&edge, &pattern); + assert_eq!(result.columns, vec!["X".to_string()]); + assert_eq!(result.rows, vec![vec![int(2)], vec![int(3)], vec![int(1)]]); + } + + #[test] + fn literal_filters_rows_to_match() { + let edge = Table::from_rows( + 2, + vec![ + vec![int(1), int(2)], + vec![int(2), int(3)], + vec![int(1), int(4)], + ], + ); + let pattern = AtomPattern { + columns: vec![lit(1), var("Y")], + }; + let result = scan_atom(&edge, &pattern); + assert_eq!(result.columns, vec!["Y".to_string()]); + assert_eq!(result.rows, vec![vec![int(2)], vec![int(4)]]); + } + + #[test] + fn distinct_variables_project_in_first_occurrence_order() { + let triples = Table::from_rows( + 3, + vec![vec![int(1), int(2), int(3)], vec![int(4), int(5), int(6)]], + ); + let pattern = AtomPattern { + columns: vec![var("A"), var("B"), var("C")], + }; + let result = scan_atom(&triples, &pattern); + assert_eq!( + result.columns, + vec!["A".to_string(), "B".to_string(), "C".to_string()], + ); + assert_eq!( + result.rows, + vec![vec![int(1), int(2), int(3)], vec![int(4), int(5), int(6)]], + ); + } + + #[test] + fn 
variable_repeated_three_times_requires_all_equal() { + let triples = Table::from_rows( + 3, + vec![ + vec![int(1), int(1), int(1)], + vec![int(1), int(1), int(2)], + vec![int(2), int(2), int(2)], + vec![int(1), int(2), int(1)], + ], + ); + let pattern = AtomPattern { + columns: vec![var("X"), var("X"), var("X")], + }; + let result = scan_atom(&triples, &pattern); + assert_eq!(result.columns, vec!["X".to_string()]); + assert_eq!(result.rows, vec![vec![int(1)], vec![int(2)]]); + } + + #[test] + fn literal_filter_repeated_var_and_projection_combine() { + // Pattern: [Lit(1), Var("X"), Lit(2), Var("X")]. + // Keep rows where col0 == 1, col2 == 2, and col1 == col3. + // Output is one column [X], bound to col1 (the first occurrence). + let table = Table::from_rows( + 4, + vec![ + vec![int(1), int(7), int(2), int(7)], + vec![int(1), int(7), int(2), int(8)], + vec![int(0), int(7), int(2), int(7)], + vec![int(1), int(7), int(3), int(7)], + vec![int(1), int(9), int(2), int(9)], + ], + ); + let pattern = AtomPattern { + columns: vec![lit(1), var("X"), lit(2), var("X")], + }; + let result = scan_atom(&table, &pattern); + assert_eq!(result.columns, vec!["X".to_string()]); + assert_eq!(result.rows, vec![vec![int(7)], vec![int(9)]]); + } } diff --git a/crates/query-ops/src/join.rs b/crates/query-ops/src/join.rs index e75ccd1..384baae 100644 --- a/crates/query-ops/src/join.rs +++ b/crates/query-ops/src/join.rs @@ -9,17 +9,212 @@ //! emitting one row with the union of columns. Output column order is //! `left.columns` followed by `right.columns` minus the shared ones. 
-use crate::relation::Relation; +use std::collections::{HashMap, HashSet}; -#[must_use] -pub fn semijoin(_left: &Relation, _right: &Relation) -> Relation { - todo!("hash `right` on shared columns, probe with `left`, keep matching left rows") +use crate::{relation::Relation, value::Value}; + +fn shared_columns(left: &Relation, right: &Relation) -> Vec<(usize, usize)> { + left.columns + .iter() + .enumerate() + .filter_map(|(li, name)| { + right + .columns + .iter() + .position(|rname| rname == name) + .map(|ri| (li, ri)) + }) + .collect() +} + +fn project<'a>(row: &'a [Value], indices: impl IntoIterator) -> Vec { + indices.into_iter().map(|&i| row[i].clone()).collect() } #[must_use] -pub fn natural_join(_left: &Relation, _right: &Relation) -> Relation { - todo!( - "hash one side on shared columns, probe with the other, emit \ - left ++ (right \\ shared) for every match" - ) +pub fn semijoin(left: &Relation, right: &Relation) -> Relation { + let shared = shared_columns(left, right); + let left_keys: Vec = shared.iter().map(|&(li, _)| li).collect(); + let right_keys: Vec = shared.iter().map(|&(_, ri)| ri).collect(); + + let mut right_set: HashSet> = HashSet::new(); + for row in &right.rows { + right_set.insert(project(row, &right_keys)); + } + + let mut output = Relation::new(left.columns.clone()); + for row in &left.rows { + if right_set.contains(&project(row, &left_keys)) { + output.push(row.clone()); + } + } + output +} + +#[must_use] +pub fn natural_join(left: &Relation, right: &Relation) -> Relation { + let shared = shared_columns(left, right); + let left_keys: Vec = shared.iter().map(|&(li, _)| li).collect(); + let right_keys: Vec = shared.iter().map(|&(_, ri)| ri).collect(); + + let shared_right: HashSet = right_keys.iter().copied().collect(); + let right_only: Vec = (0..right.columns.len()) + .filter(|i| !shared_right.contains(i)) + .collect(); + + let mut output_columns = left.columns.clone(); + for &i in &right_only { + 
output_columns.push(right.columns[i].clone()); + } + + let mut right_index: HashMap, Vec<&Vec>> = HashMap::new(); + for row in &right.rows { + right_index + .entry(project(row, &right_keys)) + .or_default() + .push(row); + } + + let mut output = Relation::new(output_columns); + for left_row in &left.rows { + let key = project(left_row, &left_keys); + let Some(matches) = right_index.get(&key) else { + continue; + }; + for right_row in matches { + let mut joined = left_row.clone(); + for &i in &right_only { + joined.push(right_row[i].clone()); + } + output.push(joined); + } + } + output +} + +#[cfg(test)] +mod tests { + use super::*; + + fn col(name: &str) -> String { + name.to_string() + } + + fn int(value: i64) -> Value { + Value::Int(value) + } + + #[test] + fn semijoin_keeps_left_rows_matched_on_shared_column() { + let left = Relation::from_rows( + vec![col("X"), col("Y")], + vec![ + vec![int(1), int(10)], + vec![int(2), int(20)], + vec![int(3), int(30)], + ], + ); + let right = Relation::from_rows(vec![col("X")], vec![vec![int(1)], vec![int(3)]]); + let result = semijoin(&left, &right); + assert_eq!(result.columns, vec![col("X"), col("Y")]); + assert_eq!( + result.rows, + vec![vec![int(1), int(10)], vec![int(3), int(30)]], + ); + } + + #[test] + fn semijoin_does_not_duplicate_left_rows_when_right_has_duplicates() { + let left = Relation::from_rows(vec![col("X")], vec![vec![int(1)], vec![int(2)]]); + let right = Relation::from_rows( + vec![col("X"), col("Y")], + vec![ + vec![int(1), int(100)], + vec![int(1), int(101)], + vec![int(2), int(200)], + ], + ); + let result = semijoin(&left, &right); + assert_eq!(result.columns, vec![col("X")]); + assert_eq!(result.rows, vec![vec![int(1)], vec![int(2)]]); + } + + #[test] + fn natural_join_emits_union_of_columns_on_match() { + let left = Relation::from_rows( + vec![col("X"), col("Y")], + vec![vec![int(1), int(10)], vec![int(2), int(20)]], + ); + let right = Relation::from_rows( + vec![col("Y"), col("Z")], + vec![ + 
vec![int(10), int(100)], + vec![int(20), int(200)], + vec![int(20), int(201)], + ], + ); + let result = natural_join(&left, &right); + assert_eq!(result.columns, vec![col("X"), col("Y"), col("Z")]); + assert_eq!( + result.rows, + vec![ + vec![int(1), int(10), int(100)], + vec![int(2), int(20), int(200)], + vec![int(2), int(20), int(201)], + ], + ); + } + + #[test] + fn natural_join_with_no_shared_columns_is_cartesian_product() { + let left = Relation::from_rows(vec![col("X")], vec![vec![int(1)], vec![int(2)]]); + let right = Relation::from_rows(vec![col("Y")], vec![vec![int(10)], vec![int(20)]]); + let result = natural_join(&left, &right); + assert_eq!(result.columns, vec![col("X"), col("Y")]); + assert_eq!( + result.rows, + vec![ + vec![int(1), int(10)], + vec![int(1), int(20)], + vec![int(2), int(10)], + vec![int(2), int(20)], + ], + ); + } + + #[test] + fn semijoin_returns_empty_when_either_side_is_empty() { + let nonempty = Relation::from_rows(vec![col("X")], vec![vec![int(1)]]); + let empty = Relation::from_rows(vec![col("X")], vec![]); + + let r1 = semijoin(&empty, &nonempty); + assert_eq!(r1.columns, vec![col("X")]); + assert!(r1.rows.is_empty()); + + let r2 = semijoin(&nonempty, &empty); + assert_eq!(r2.columns, vec![col("X")]); + assert!(r2.rows.is_empty()); + + let r3 = semijoin(&empty, &empty); + assert_eq!(r3.columns, vec![col("X")]); + assert!(r3.rows.is_empty()); + } + + #[test] + fn natural_join_returns_empty_when_either_side_is_empty() { + let nonempty = Relation::from_rows(vec![col("X")], vec![vec![int(1)]]); + let empty = Relation::from_rows(vec![col("X")], vec![]); + + let r1 = natural_join(&empty, &nonempty); + assert_eq!(r1.columns, vec![col("X")]); + assert!(r1.rows.is_empty()); + + let r2 = natural_join(&nonempty, &empty); + assert_eq!(r2.columns, vec![col("X")]); + assert!(r2.rows.is_empty()); + + let r3 = natural_join(&empty, &empty); + assert_eq!(r3.columns, vec![col("X")]); + assert!(r3.rows.is_empty()); + } } diff --git 
a/crates/query-ops/src/relation.rs b/crates/query-ops/src/relation.rs index 3e49152..e2b75ca 100644 --- a/crates/query-ops/src/relation.rs +++ b/crates/query-ops/src/relation.rs @@ -3,6 +3,12 @@ //! Every operator in this crate (after the initial atom scan) consumes and //! produces [`Relation`]s. Column names are variable names; a value at column //! `i` of a row is the value bound to variable `columns[i]` in that solution. +//! +//! Column names within a single relation must be unique. Constructors enforce +//! this invariant; downstream operators rely on it when matching shared columns +//! across two relations. + +use std::collections::HashSet; use crate::value::Value; @@ -12,15 +18,46 @@ pub struct Relation { pub rows: Vec>, } +fn assert_unique_columns(columns: &[String]) { + let mut seen: HashSet<&str> = HashSet::with_capacity(columns.len()); + for name in columns { + assert!( + seen.insert(name.as_str()), + "duplicate column name in relation: {name}", + ); + } +} + impl Relation { + /// # Panics + /// Panics if `columns` contains a duplicate name. #[must_use] pub fn new(columns: Vec) -> Self { + assert_unique_columns(&columns); Self { columns, rows: Vec::new(), } } + /// # Panics + /// Panics if `columns` contains a duplicate name, or if any row's length + /// differs from `columns.len()`. + #[must_use] + pub fn from_rows(columns: Vec, rows: Vec>) -> Self { + assert_unique_columns(&columns); + let arity = columns.len(); + for (i, row) in rows.iter().enumerate() { + assert_eq!( + row.len(), + arity, + "row {i} arity mismatch: expected {arity}, got {}", + row.len(), + ); + } + Self { columns, rows } + } + /// # Panics /// Panics if `row.len() != self.columns.len()`. 
pub fn push(&mut self, row: Vec) { @@ -34,3 +71,20 @@ impl Relation { self.rows.push(row); } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + #[should_panic(expected = "duplicate column name")] + fn from_rows_rejects_duplicate_column_names() { + let _ = Relation::from_rows(vec!["X".to_string(), "X".to_string()], vec![]); + } + + #[test] + #[should_panic(expected = "duplicate column name")] + fn new_rejects_duplicate_column_names() { + let _ = Relation::new(vec!["X".to_string(), "X".to_string()]); + } +} diff --git a/crates/query-ops/src/table.rs b/crates/query-ops/src/table.rs index de0f143..60af911 100644 --- a/crates/query-ops/src/table.rs +++ b/crates/query-ops/src/table.rs @@ -20,6 +20,21 @@ impl Table { } } + /// # Panics + /// Panics if any row's length differs from `arity`. + #[must_use] + pub fn from_rows(arity: usize, rows: Vec>) -> Self { + for (i, row) in rows.iter().enumerate() { + assert_eq!( + row.len(), + arity, + "row {i} arity mismatch: expected {arity}, got {}", + row.len(), + ); + } + Self { arity, rows } + } + /// # Panics /// Panics if `row.len() != self.arity`. pub fn push(&mut self, row: Vec) { diff --git a/crates/query-ops/tests/hand_plan.rs b/crates/query-ops/tests/hand_plan.rs new file mode 100644 index 0000000..7265610 --- /dev/null +++ b/crates/query-ops/tests/hand_plan.rs @@ -0,0 +1,91 @@ +//! Hand-written query plan composed from `scan_atom`, `semijoin`, and `natural_join`. +//! +//! Schema: +//! - `author(name, book)`: who wrote each book +//! - `bestseller(book)`: the set of bestseller titles +//! - `price(book, dollars)`: price of each book +//! +//! Rule: +//! - `Q(name, book, dollars) :- author(name, book), bestseller(book), price(book, dollars).` +//! ("Authors of bestsellers along with each book's price.") +//! +//! The plan first scans each input table, then narrows `author` to authors of +//! bestsellers via a semijoin against `bestseller`, then attaches each book's +//! price via a natural join against `price`. 
+ +use query_ops::atom::{AtomPattern, Term, scan_atom}; +use query_ops::join::{natural_join, semijoin}; +use query_ops::table::Table; +use query_ops::value::Value; + +fn s(x: &str) -> Value { + Value::Str(x.to_string()) +} + +fn i(x: i64) -> Value { + Value::Int(x) +} + +#[test] +fn authors_of_bestsellers_with_price() { + let author = Table::from_rows( + 2, + vec![ + vec![s("Alice"), s("Foo")], + vec![s("Bob"), s("Bar")], + vec![s("Alice"), s("Baz")], + vec![s("Carol"), s("Qux")], + ], + ); + let bestseller = Table::from_rows(1, vec![vec![s("Foo")], vec![s("Baz")]]); + let price = Table::from_rows( + 2, + vec![ + vec![s("Foo"), i(25)], + vec![s("Bar"), i(15)], + vec![s("Baz"), i(30)], + vec![s("Qux"), i(20)], + ], + ); + + let author_rel = scan_atom( + &author, + &AtomPattern { + columns: vec![Term::Var("name".to_string()), Term::Var("book".to_string())], + }, + ); + let bestseller_rel = scan_atom( + &bestseller, + &AtomPattern { + columns: vec![Term::Var("book".to_string())], + }, + ); + let price_rel = scan_atom( + &price, + &AtomPattern { + columns: vec![ + Term::Var("book".to_string()), + Term::Var("dollars".to_string()), + ], + }, + ); + + let authors_of_bestsellers = semijoin(&author_rel, &bestseller_rel); + let result = natural_join(&authors_of_bestsellers, &price_rel); + + assert_eq!( + result.columns, + vec![ + "name".to_string(), + "book".to_string(), + "dollars".to_string() + ], + ); + assert_eq!( + result.rows, + vec![ + vec![s("Alice"), s("Foo"), i(25)], + vec![s("Alice"), s("Baz"), i(30)], + ], + ); +}