storage-engine-playground/crates/storage/src/codec.rs

//! Wire format shared by every byte-oriented backend in this crate.
//!
//! The encoding is hand-rolled (no `serde`, no `bincode`) so that the
//! generated bytes are stable and inspectable. It is **not** versioned:
//! renumbering tags or changing the payload of an existing [`Value`] variant
//! invalidates previously-stored data, and bytes containing a newly added
//! variant are rejected by older readers as an unknown tag. That is fine
//! for a playground; production code would prepend a format byte.
//!
//! ## Row Format
//!
//! `[count: u32 LE] [val × count]`
//!
//! ## Value Format
//!
//! `[tag: u8] [payload]`
//!
//! | Tag    | Variant       | Payload                              |
//! |--------|---------------|--------------------------------------|
//! | `0x00` | `Value::Int`  | `i64 LE` (8 bytes)                   |
//! | `0x01` | `Value::Str`  | `[len: u32 LE] [UTF-8 bytes]`        |
//! | `0x02` | `Value::Id`   | `[len: u32 LE] [bytes]`              |
//!
//! ## Row Key Format
//!
//! Synthetic row IDs are `u64` encoded big-endian so lexicographic key order
//! matches insertion order. Backends with named sub-stores per relation can
//! use this directly as the key.
//!
//! ## Metadata Format
//!
//! Per-relation metadata is `[arity: u32 LE] [next_id: u64 LE]` = 12 bytes.

use crate::id::RowId;
use crate::value::Value;

/// Errors raised by [`decode_row`] and [`decode_meta`].
///
/// Every variant carries only plain integers, so the type is cheap to clone
/// and can be compared directly (`assert_eq!`, `==`) by callers and tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CodecError {
    /// The byte slice ended before the expected number of fields was read.
    ///
    /// Raised when a fixed-width field (tag byte, `u32`, `u64`, `i64`) needs
    /// more bytes than remain in the input.
    UnexpectedEof,
    /// A value tag byte was unrecognized.
    ///
    /// The payload is the offending tag byte.
    UnknownTag(u8),
    /// A length field declared more bytes than the slice contains.
    ///
    /// `declared` is the length read from the input; `available` is how many
    /// bytes were actually left after the length field.
    LengthOverrun { declared: usize, available: usize },
    /// A UTF-8 string payload could not be decoded.
    InvalidUtf8,
}

impl std::fmt::Display for CodecError {
    /// Render a short, lowercase, human-readable description of the error.
    ///
    /// Variants without data emit a fixed message; `UnknownTag` prints the
    /// tag as two lowercase hex digits and `LengthOverrun` prints both the
    /// declared and the available byte counts.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            // Constant messages need no formatting machinery.
            Self::UnexpectedEof => f.write_str("unexpected end of bytes"),
            Self::InvalidUtf8 => f.write_str("invalid UTF-8 in string payload"),
            Self::UnknownTag(t) => write!(f, "unknown value tag: 0x{t:02x}"),
            Self::LengthOverrun {
                declared,
                available,
            } => write!(
                f,
                "declared length {declared} exceeds available {available} bytes"
            ),
        }
    }
}

// No overrides needed: `CodecError` wraps no underlying error, so the default
// `source()` (which returns `None`) is the right answer.
impl std::error::Error for CodecError {}

/// Serialize a row of [`Value`]s into the row wire format.
///
/// The output is `[count: u32 LE]` followed by one `[tag: u8] [payload]`
/// record per value, in row order:
///
/// * `Value::Int` — tag `0x00`, then the integer as 8 little-endian bytes.
/// * `Value::Str` — tag `0x01`, then `[len: u32 LE]` and the string's bytes.
/// * `Value::Id` — tag `0x02`, then `[len: u32 LE]` and the id's bytes.
///
/// Counts and lengths that do not fit in a `u32` are written as `u32::MAX`
/// rather than causing a panic; the data itself is still written in full.
#[must_use]
pub fn encode_row(row: &[Value]) -> Vec<u8> {
    // Appends `[len: u32 LE] [payload]`, saturating the prefix at `u32::MAX`.
    fn push_len_prefixed(buf: &mut Vec<u8>, payload: &[u8]) {
        let len = u32::try_from(payload.len()).unwrap_or(u32::MAX);
        buf.extend_from_slice(&len.to_le_bytes());
        buf.extend_from_slice(payload);
    }

    let field_count = u32::try_from(row.len()).unwrap_or(u32::MAX);

    // 4 bytes of header plus 9 bytes (tag + i64) per value is exact for
    // all-integer rows and a starting guess otherwise.
    let mut buf = Vec::with_capacity(4 + row.len() * 9);
    buf.extend_from_slice(&field_count.to_le_bytes());

    for field in row {
        match field {
            Value::Int(n) => {
                buf.push(0x00);
                buf.extend_from_slice(&n.to_le_bytes());
            }
            Value::Str(text) => {
                buf.push(0x01);
                push_len_prefixed(&mut buf, text.as_bytes());
            }
            Value::Id(id) => {
                buf.push(0x02);
                push_len_prefixed(&mut buf, id.as_bytes());
            }
        }
    }
    buf
}

/// Decode a row of [`Value`]s from bytes.
///
/// Expects the row format `[count: u32 LE] [val × count]`. Bytes left over
/// after the last declared value are ignored.
///
/// The declared `count` comes straight from the input, so it is not trusted
/// for sizing the result: the up-front reservation is capped at the number of
/// bytes remaining, since every encoded value occupies at least its one tag
/// byte. A corrupt header such as `FF FF FF FF` therefore produces an error
/// instead of attempting an enormous allocation.
///
/// # Errors
/// Returns [`CodecError`] if the byte slice is malformed.
pub fn decode_row(mut bytes: &[u8]) -> Result<Vec<Value>, CodecError> {
    let count = read_u32(&mut bytes)? as usize;
    // Never reserve more slots than the remaining input could possibly fill.
    let mut row = Vec::with_capacity(count.min(bytes.len()));
    for _ in 0..count {
        row.push(read_value(&mut bytes)?);
    }
    Ok(row)
}

/// Decode one `[tag: u8] [payload]` record from the front of `bytes`,
/// advancing the slice past whatever was consumed.
///
/// * tag `0x00` — 8-byte little-endian integer, yields `Value::Int`.
/// * tag `0x01` — length-prefixed bytes that must be valid UTF-8, yields
///   `Value::Str`.
/// * tag `0x02` — length-prefixed bytes handed to `RowId::new`, yields
///   `Value::Id`.
///
/// Fails with `UnexpectedEof` when a fixed-width field is cut short, with
/// `LengthOverrun` when a length prefix exceeds the remaining input, with
/// `InvalidUtf8` for a malformed string payload, and with `UnknownTag` for
/// any other tag byte.
fn read_value(bytes: &mut &[u8]) -> Result<Value, CodecError> {
    // Splits a `[len: u32 LE] [payload]` field off the front of `input` and
    // returns the payload, leaving `input` positioned just after it.
    fn take_len_prefixed<'a>(input: &mut &'a [u8]) -> Result<&'a [u8], CodecError> {
        let declared = read_u32(input)? as usize;
        let available = input.len();
        if declared > available {
            return Err(CodecError::LengthOverrun {
                declared,
                available,
            });
        }
        let (payload, rest) = input.split_at(declared);
        *input = rest;
        Ok(payload)
    }

    match read_u8(bytes)? {
        0x00 => read_i64(bytes).map(Value::Int),
        0x01 => {
            let payload = take_len_prefixed(bytes)?;
            let text = std::str::from_utf8(payload).map_err(|_| CodecError::InvalidUtf8)?;
            Ok(Value::Str(text.to_string()))
        }
        0x02 => take_len_prefixed(bytes).map(|payload| Value::Id(RowId::new(payload))),
        unknown => Err(CodecError::UnknownTag(unknown)),
    }
}

/// Pop a single byte off the front of `bytes`.
///
/// On success the slice is advanced by one byte. On an empty slice this
/// returns `CodecError::UnexpectedEof` and leaves `bytes` untouched.
fn read_u8(bytes: &mut &[u8]) -> Result<u8, CodecError> {
    // Copy the inner slice reference out so the pattern bindings keep the
    // input's full lifetime and can be stored back into `*bytes`.
    let input: &[u8] = *bytes;
    match input {
        [first, rest @ ..] => {
            *bytes = rest;
            Ok(*first)
        }
        [] => Err(CodecError::UnexpectedEof),
    }
}

fn read_u32(bytes: &mut &[u8]) -> Result<u32, CodecError> {
    if bytes.len() < 4 {
        return Err(CodecError::UnexpectedEof);
    }
    let (head, tail) = bytes.split_at(4);
    *bytes = tail;
    let mut buf = [0u8; 4];
    buf.copy_from_slice(head);
    Ok(u32::from_le_bytes(buf))
}

fn read_u64(bytes: &mut &[u8]) -> Result<u64, CodecError> {
    if bytes.len() < 8 {
        return Err(CodecError::UnexpectedEof);
    }
    let (head, tail) = bytes.split_at(8);
    *bytes = tail;
    let mut buf = [0u8; 8];
    buf.copy_from_slice(head);
    Ok(u64::from_le_bytes(buf))
}

fn read_i64(bytes: &mut &[u8]) -> Result<i64, CodecError> {
    if bytes.len() < 8 {
        return Err(CodecError::UnexpectedEof);
    }
    let (head, tail) = bytes.split_at(8);
    *bytes = tail;
    let mut buf = [0u8; 8];
    buf.copy_from_slice(head);
    Ok(i64::from_le_bytes(buf))
}

/// Build the 8-byte storage key for a synthetic row ID.
///
/// The ID is written most-significant byte first, so comparing two keys
/// byte-by-byte gives the same ordering as comparing the IDs numerically —
/// and hence insertion order when IDs are handed out in increasing order.
#[must_use]
pub fn row_key(id: u64) -> [u8; 8] {
    u64::to_be_bytes(id)
}

/// Pack per-relation metadata into its fixed 12-byte form.
///
/// Layout: bytes `0..4` hold `arity` and bytes `4..12` hold `next_id`, both
/// little-endian.
#[must_use]
pub fn encode_meta(arity: u32, next_id: u64) -> [u8; 12] {
    let mut buf = [0u8; 12];
    {
        // Carve the buffer into its two fields so each is filled exactly once.
        let (arity_slot, next_id_slot) = buf.split_at_mut(4);
        arity_slot.copy_from_slice(&arity.to_le_bytes());
        next_id_slot.copy_from_slice(&next_id.to_le_bytes());
    }
    buf
}

/// Unpack per-relation metadata into `(arity, next_id)`.
///
/// Reads a little-endian `u32` arity followed by a little-endian `u64` next
/// row ID from the front of `bytes`. Anything after those 12 bytes is ignored.
///
/// # Errors
/// Returns [`CodecError::UnexpectedEof`] if the slice is shorter than 12 bytes.
pub fn decode_meta(mut bytes: &[u8]) -> Result<(u32, u64), CodecError> {
    // Tuple fields are evaluated left to right, so the arity is consumed
    // before the next-ID read begins.
    Ok((read_u32(&mut bytes)?, read_u64(&mut bytes)?))
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for a `Value::Int`.
    fn i(x: i64) -> Value {
        Value::Int(x)
    }

    /// Shorthand for a `Value::Str`.
    fn s(x: &str) -> Value {
        Value::Str(x.to_string())
    }

    /// Integer-only rows, including negative and extreme values, round-trip.
    #[test]
    fn encode_decode_int_only_row() -> Result<(), CodecError> {
        let row = vec![i(1), i(-2), i(i64::MAX)];
        let bytes = encode_row(&row);
        let decoded = decode_row(&bytes)?;
        assert_eq!(decoded, row);
        Ok(())
    }

    /// Rows mixing strings and integers round-trip in order.
    #[test]
    fn encode_decode_mixed_row() -> Result<(), CodecError> {
        let row = vec![s("Alice"), i(42), s("a longer string with spaces")];
        let bytes = encode_row(&row);
        let decoded = decode_row(&bytes)?;
        assert_eq!(decoded, row);
        Ok(())
    }

    /// A zero-field row encodes to just the count header and decodes to empty.
    #[test]
    fn encode_decode_empty_row() -> Result<(), CodecError> {
        let bytes = encode_row(&[]);
        let decoded = decode_row(&bytes)?;
        assert!(decoded.is_empty());
        Ok(())
    }

    /// A tag outside the known set is reported together with the tag byte.
    #[test]
    fn decode_unknown_tag_fails() {
        let bytes = vec![1, 0, 0, 0, 0xFF];
        assert!(matches!(
            decode_row(&bytes),
            Err(CodecError::UnknownTag(0xFF))
        ));
    }

    /// An integer payload cut short is reported as end-of-input.
    #[test]
    fn decode_truncated_fails() {
        let bytes = vec![1, 0, 0, 0, 0x00, 0x01];
        assert!(matches!(decode_row(&bytes), Err(CodecError::UnexpectedEof)));
    }

    /// Input too short to hold even the count header is end-of-input.
    #[test]
    fn decode_empty_input_fails() {
        assert!(matches!(decode_row(&[]), Err(CodecError::UnexpectedEof)));
        assert!(matches!(
            decode_row(&[1, 0, 0]),
            Err(CodecError::UnexpectedEof)
        ));
    }

    /// A string whose length prefix exceeds the remaining bytes reports both
    /// the declared and the available sizes.
    #[test]
    fn decode_length_overrun_fails() {
        // count = 1, tag = Str, len = 5, but only 2 payload bytes follow.
        let bytes = vec![1, 0, 0, 0, 0x01, 5, 0, 0, 0, b'a', b'b'];
        assert!(matches!(
            decode_row(&bytes),
            Err(CodecError::LengthOverrun {
                declared: 5,
                available: 2
            })
        ));
    }

    /// A string payload that is not valid UTF-8 is rejected.
    #[test]
    fn decode_invalid_utf8_fails() {
        // count = 1, tag = Str, len = 1, payload = lone 0xFF byte.
        let bytes = vec![1, 0, 0, 0, 0x01, 1, 0, 0, 0, 0xFF];
        assert!(matches!(decode_row(&bytes), Err(CodecError::InvalidUtf8)));
    }

    /// Big-endian keys compare in the same order as the IDs they encode.
    #[test]
    fn row_key_preserves_order() {
        assert!(row_key(1) < row_key(2));
        assert!(row_key(255) < row_key(256));
        assert!(row_key(u64::MAX - 1) < row_key(u64::MAX));
    }

    /// Metadata survives an encode/decode round trip.
    #[test]
    fn meta_roundtrip() -> Result<(), CodecError> {
        let encoded = encode_meta(3, 12345);
        let (arity, next_id) = decode_meta(&encoded)?;
        assert_eq!(arity, 3);
        assert_eq!(next_id, 12345);
        Ok(())
    }

    /// Metadata shorter than the fixed 12 bytes is end-of-input.
    #[test]
    fn decode_meta_short_input_fails() {
        assert!(matches!(
            decode_meta(&[0u8; 11]),
            Err(CodecError::UnexpectedEof)
        ));
        assert!(matches!(decode_meta(&[]), Err(CodecError::UnexpectedEof)));
    }
}