//! Semijoin and natural join over binding relations. //! //! Both operators join on the shared column names of their inputs (the //! "overlapping variables" in Datalog terms). //! //! - [`semijoin`] keeps rows of `left` whose shared-column values appear in //! `right`. Output columns are `left.columns` unchanged. //! - [`natural_join`] keeps every pair `(l, r)` that agrees on shared columns, //! emitting one row with the union of columns. Output column order is //! `left.columns` followed by `right.columns` minus the shared ones. use std::collections::{HashMap, HashSet}; use crate::{relation::Relation, value::Value}; fn shared_columns(left: &Relation, right: &Relation) -> Vec<(usize, usize)> { left.columns .iter() .enumerate() .filter_map(|(li, name)| { right .columns .iter() .position(|rname| rname == name) .map(|ri| (li, ri)) }) .collect() } fn project<'a>(row: &'a [Value], indices: impl IntoIterator) -> Vec { indices.into_iter().map(|&i| row[i].clone()).collect() } #[must_use] pub fn semijoin(left: &Relation, right: &Relation) -> Relation { let shared = shared_columns(left, right); let left_keys: Vec = shared.iter().map(|&(li, _)| li).collect(); let right_keys: Vec = shared.iter().map(|&(_, ri)| ri).collect(); let mut right_set: HashSet> = HashSet::new(); for row in &right.rows { right_set.insert(project(row, &right_keys)); } let mut output = Relation::new(left.columns.clone()); for row in &left.rows { if right_set.contains(&project(row, &left_keys)) { output.push(row.clone()); } } output } #[must_use] pub fn natural_join(left: &Relation, right: &Relation) -> Relation { let shared = shared_columns(left, right); let left_keys: Vec = shared.iter().map(|&(li, _)| li).collect(); let right_keys: Vec = shared.iter().map(|&(_, ri)| ri).collect(); let shared_right: HashSet = right_keys.iter().copied().collect(); let right_only: Vec = (0..right.columns.len()) .filter(|i| !shared_right.contains(i)) .collect(); let mut output_columns = left.columns.clone(); for &i in &right_only { output_columns.push(right.columns[i].clone()); } let mut right_index: HashMap, Vec<&Vec>> = HashMap::new(); for row in &right.rows { right_index .entry(project(row, &right_keys)) .or_default() .push(row); } let mut output = Relation::new(output_columns); for left_row in &left.rows { let key = project(left_row, &left_keys); let Some(matches) = right_index.get(&key) else { continue; }; for right_row in matches { let mut joined = left_row.clone(); for &i in &right_only { joined.push(right_row[i].clone()); } output.push(joined); } } output } #[cfg(test)] mod tests { use super::*; fn col(name: &str) -> String { name.to_string() } fn int(value: i64) -> Value { Value::Int(value) } #[test] fn semijoin_keeps_left_rows_matched_on_shared_column() { let left = Relation::from_rows( vec![col("X"), col("Y")], vec![ vec![int(1), int(10)], vec![int(2), int(20)], vec![int(3), int(30)], ], ); let right = Relation::from_rows(vec![col("X")], vec![vec![int(1)], vec![int(3)]]); let result = semijoin(&left, &right); assert_eq!(result.columns, vec![col("X"), col("Y")]); assert_eq!( result.rows, vec![vec![int(1), int(10)], vec![int(3), int(30)]], ); } #[test] fn semijoin_does_not_duplicate_left_rows_when_right_has_duplicates() { let left = Relation::from_rows(vec![col("X")], vec![vec![int(1)], vec![int(2)]]); let right = Relation::from_rows( vec![col("X"), col("Y")], vec![ vec![int(1), int(100)], vec![int(1), int(101)], vec![int(2), int(200)], ], ); let result = semijoin(&left, &right); assert_eq!(result.columns, vec![col("X")]); assert_eq!(result.rows, vec![vec![int(1)], vec![int(2)]]); } #[test] fn natural_join_emits_union_of_columns_on_match() { let left = Relation::from_rows( vec![col("X"), col("Y")], vec![vec![int(1), int(10)], vec![int(2), int(20)]], ); let right = Relation::from_rows( vec![col("Y"), col("Z")], vec![ vec![int(10), int(100)], vec![int(20), int(200)], vec![int(20), int(201)], ], ); let result = natural_join(&left, &right); assert_eq!(result.columns, vec![col("X"), col("Y"), col("Z")]); assert_eq!( result.rows, vec![ vec![int(1), int(10), int(100)], vec![int(2), int(20), int(200)], vec![int(2), int(20), int(201)], ], ); } #[test] fn natural_join_with_no_shared_columns_is_cartesian_product() { let left = Relation::from_rows(vec![col("X")], vec![vec![int(1)], vec![int(2)]]); let right = Relation::from_rows(vec![col("Y")], vec![vec![int(10)], vec![int(20)]]); let result = natural_join(&left, &right); assert_eq!(result.columns, vec![col("X"), col("Y")]); assert_eq!( result.rows, vec![ vec![int(1), int(10)], vec![int(1), int(20)], vec![int(2), int(10)], vec![int(2), int(20)], ], ); } #[test] fn semijoin_returns_empty_when_either_side_is_empty() { let nonempty = Relation::from_rows(vec![col("X")], vec![vec![int(1)]]); let empty = Relation::from_rows(vec![col("X")], vec![]); let r1 = semijoin(&empty, &nonempty); assert_eq!(r1.columns, vec![col("X")]); assert!(r1.rows.is_empty()); let r2 = semijoin(&nonempty, &empty); assert_eq!(r2.columns, vec![col("X")]); assert!(r2.rows.is_empty()); let r3 = semijoin(&empty, &empty); assert_eq!(r3.columns, vec![col("X")]); assert!(r3.rows.is_empty()); } #[test] fn natural_join_returns_empty_when_either_side_is_empty() { let nonempty = Relation::from_rows(vec![col("X")], vec![vec![int(1)]]); let empty = Relation::from_rows(vec![col("X")], vec![]); let r1 = natural_join(&empty, &nonempty); assert_eq!(r1.columns, vec![col("X")]); assert!(r1.rows.is_empty()); let r2 = natural_join(&nonempty, &empty); assert_eq!(r2.columns, vec![col("X")]); assert!(r2.rows.is_empty()); let r3 = natural_join(&empty, &empty); assert_eq!(r3.columns, vec![col("X")]); assert!(r3.rows.is_empty()); } }