223 lines
7.1 KiB
Rust
223 lines
7.1 KiB
Rust
//! Semijoin and natural join over binding relations.
//!
//! Both operators join on the shared column names of their inputs (the
//! "overlapping variables" in Datalog terms).
//!
//! - [`semijoin`] keeps rows of `left` whose shared-column values appear in
//!   `right`. Output columns are `left.columns` unchanged.
//! - [`natural_join`] keeps every pair `(l, r)` that agrees on shared columns,
//!   emitting one row with the union of columns. Output column order is
//!   `left.columns` followed by `right.columns` minus the shared ones.
|
|
|
|
use std::collections::{HashMap, HashSet};
|
|
|
|
use storage::value::Value;
|
|
|
|
use crate::relation::Relation;
|
|
|
|
/// Pairs up the positions of the columns that `left` and `right` have in common.
///
/// Each entry is `(index into left.columns, index into right.columns)`, and
/// entries are listed in `left` column order. When a name occurs more than
/// once in `right`, only its first occurrence is paired.
fn shared_columns(left: &Relation, right: &Relation) -> Vec<(usize, usize)> {
    let mut pairs = Vec::new();
    for (left_idx, left_name) in left.columns.iter().enumerate() {
        // `position` stops at the first right-hand column with a matching name.
        let found = right
            .columns
            .iter()
            .position(|right_name| right_name == left_name);
        if let Some(right_idx) = found {
            pairs.push((left_idx, right_idx));
        }
    }
    pairs
}
|
|
|
|
/// Copies the values of `row` found at `indices`, in the order the indices
/// are yielded.
///
/// # Panics
///
/// Panics if any index is out of bounds for `row`.
fn project<'a>(row: &'a [Value], indices: impl IntoIterator<Item = &'a usize>) -> Vec<Value> {
    let mut picked = Vec::new();
    for &index in indices {
        picked.push(row[index].clone());
    }
    picked
}
|
|
|
|
#[must_use]
|
|
pub fn semijoin(left: &Relation, right: &Relation) -> Relation {
|
|
let shared = shared_columns(left, right);
|
|
let left_keys: Vec<usize> = shared.iter().map(|&(li, _)| li).collect();
|
|
let right_keys: Vec<usize> = shared.iter().map(|&(_, ri)| ri).collect();
|
|
|
|
let mut right_set: HashSet<Vec<Value>> = HashSet::new();
|
|
for row in &right.rows {
|
|
right_set.insert(project(row, &right_keys));
|
|
}
|
|
|
|
let mut output = Relation::new(left.columns.clone());
|
|
for row in &left.rows {
|
|
if right_set.contains(&project(row, &left_keys)) {
|
|
output.push(row.clone());
|
|
}
|
|
}
|
|
output
|
|
}
|
|
|
|
#[must_use]
|
|
pub fn natural_join(left: &Relation, right: &Relation) -> Relation {
|
|
let shared = shared_columns(left, right);
|
|
let left_keys: Vec<usize> = shared.iter().map(|&(li, _)| li).collect();
|
|
let right_keys: Vec<usize> = shared.iter().map(|&(_, ri)| ri).collect();
|
|
|
|
let shared_right: HashSet<usize> = right_keys.iter().copied().collect();
|
|
let right_only: Vec<usize> = (0..right.columns.len())
|
|
.filter(|i| !shared_right.contains(i))
|
|
.collect();
|
|
|
|
let mut output_columns = left.columns.clone();
|
|
for &i in &right_only {
|
|
output_columns.push(right.columns[i].clone());
|
|
}
|
|
|
|
let mut right_index: HashMap<Vec<Value>, Vec<&Vec<Value>>> = HashMap::new();
|
|
for row in &right.rows {
|
|
right_index
|
|
.entry(project(row, &right_keys))
|
|
.or_default()
|
|
.push(row);
|
|
}
|
|
|
|
let mut output = Relation::new(output_columns);
|
|
for left_row in &left.rows {
|
|
let key = project(left_row, &left_keys);
|
|
let Some(matches) = right_index.get(&key) else {
|
|
continue;
|
|
};
|
|
for right_row in matches {
|
|
let mut joined = left_row.clone();
|
|
for &i in &right_only {
|
|
joined.push(right_row[i].clone());
|
|
}
|
|
output.push(joined);
|
|
}
|
|
}
|
|
output
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds owned column names from string literals.
    fn cols(names: &[&str]) -> Vec<String> {
        names.iter().map(|name| (*name).to_string()).collect()
    }

    /// Builds one row of integer values.
    fn row(values: &[i64]) -> Vec<Value> {
        values.iter().copied().map(Value::Int).collect()
    }

    /// Builds a relation from column names and integer rows.
    fn relation(names: &[&str], rows: &[&[i64]]) -> Relation {
        let data: Vec<Vec<Value>> = rows.iter().map(|values| row(values)).collect();
        Relation::from_rows(cols(names), data)
    }

    #[test]
    fn semijoin_keeps_left_rows_matched_on_shared_column() {
        let left = relation(&["X", "Y"], &[&[1, 10], &[2, 20], &[3, 30]]);
        let right = relation(&["X"], &[&[1], &[3]]);

        let result = semijoin(&left, &right);

        assert_eq!(result.columns, cols(&["X", "Y"]));
        assert_eq!(result.rows, vec![row(&[1, 10]), row(&[3, 30])]);
    }

    #[test]
    fn semijoin_does_not_duplicate_left_rows_when_right_has_duplicates() {
        let left = relation(&["X"], &[&[1], &[2]]);
        let right = relation(&["X", "Y"], &[&[1, 100], &[1, 101], &[2, 200]]);

        let result = semijoin(&left, &right);

        assert_eq!(result.columns, cols(&["X"]));
        assert_eq!(result.rows, vec![row(&[1]), row(&[2])]);
    }

    #[test]
    fn natural_join_emits_union_of_columns_on_match() {
        let left = relation(&["X", "Y"], &[&[1, 10], &[2, 20]]);
        let right = relation(&["Y", "Z"], &[&[10, 100], &[20, 200], &[20, 201]]);

        let result = natural_join(&left, &right);

        assert_eq!(result.columns, cols(&["X", "Y", "Z"]));
        assert_eq!(
            result.rows,
            vec![row(&[1, 10, 100]), row(&[2, 20, 200]), row(&[2, 20, 201])],
        );
    }

    #[test]
    fn natural_join_with_no_shared_columns_is_cartesian_product() {
        let left = relation(&["X"], &[&[1], &[2]]);
        let right = relation(&["Y"], &[&[10], &[20]]);

        let result = natural_join(&left, &right);

        assert_eq!(result.columns, cols(&["X", "Y"]));
        assert_eq!(
            result.rows,
            vec![row(&[1, 10]), row(&[1, 20]), row(&[2, 10]), row(&[2, 20])],
        );
    }

    #[test]
    fn semijoin_returns_empty_when_either_side_is_empty() {
        let nonempty = relation(&["X"], &[&[1]]);
        let empty = relation(&["X"], &[]);

        // Same three pairings as before: empty/nonempty, nonempty/empty, empty/empty.
        for (lhs, rhs) in [(&empty, &nonempty), (&nonempty, &empty), (&empty, &empty)] {
            let result = semijoin(lhs, rhs);
            assert_eq!(result.columns, cols(&["X"]));
            assert!(result.rows.is_empty());
        }
    }

    #[test]
    fn natural_join_returns_empty_when_either_side_is_empty() {
        let nonempty = relation(&["X"], &[&[1]]);
        let empty = relation(&["X"], &[]);

        // Same three pairings as before: empty/nonempty, nonempty/empty, empty/empty.
        for (lhs, rhs) in [(&empty, &nonempty), (&nonempty, &empty), (&empty, &empty)] {
            let result = natural_join(lhs, rhs);
            assert_eq!(result.columns, cols(&["X"]));
            assert!(result.rows.is_empty());
        }
    }
}
|