Skip to main content

laminar_core/state/
dict_encoder.rs

1//! Dictionary key encoder for low-cardinality GROUP BY keys.
2//!
3//! Maps variable-length byte keys (e.g., ticker symbols like `"AAPL"`, region
4//! names like `"us-east-1"`) to compact fixed-size `u32` codes. This reduces
5//! hash map key sizes from variable-length to 4 bytes, improving:
6//!
7//! - **Hash throughput**: hashing 4 bytes is ~5x faster than hashing 20+ byte strings
8//! - **Cache utilization**: smaller keys mean more entries per cache line
9//! - **Comparison cost**: `u32 == u32` is a single instruction vs `memcmp`
10//!
11//! # Architecture
12//!
13//! - **Ring 2 (setup)**: Call [`DictionaryKeyEncoder::encode_or_insert`] during
14//!   query registration to populate the dictionary with known keys.
15//! - **Ring 0 (hot path)**: Call [`DictionaryKeyEncoder::encode`] for O(1) lookup
16//!   of pre-populated keys. Unknown keys fall through to the raw-byte path.
17//!
18//! # Example
19//!
20//! ```rust
21//! use laminar_core::state::DictionaryKeyEncoder;
22//!
23//! let mut encoder = DictionaryKeyEncoder::new();
24//!
25//! // Ring 2: populate during query setup
26//! let code_aapl = encoder.encode_or_insert(b"AAPL").unwrap();
27//! let code_goog = encoder.encode_or_insert(b"GOOG").unwrap();
28//! assert_ne!(code_aapl, code_goog);
29//!
30//! // Ring 0: fast lookup on hot path
31//! assert_eq!(encoder.encode(b"AAPL"), Some(code_aapl));
32//! assert_eq!(encoder.encode(b"UNKNOWN"), None);
33//!
34//! // Decode back to original key
35//! assert_eq!(encoder.decode(code_aapl), Some(b"AAPL".as_slice()));
36//! ```
37
38use rustc_hash::{FxBuildHasher, FxHashMap};
39
40/// Compact dictionary encoder that maps variable-length byte keys to `u32` codes.
41///
42/// Pre-populate during query setup (Ring 2), then use the fast `encode()` path
43/// during event processing (Ring 0). The encoder is `Send` for transfer to
44/// reactor threads but not `Sync` — it's designed for single-threaded access.
45pub struct DictionaryKeyEncoder {
46    /// Forward mapping: raw key bytes → compact code.
47    key_to_code: FxHashMap<Vec<u8>, u32>,
48    /// Reverse mapping: compact code → raw key bytes.
49    code_to_key: Vec<Vec<u8>>,
50}
51
52impl DictionaryKeyEncoder {
53    /// Creates a new empty encoder.
54    #[must_use]
55    pub fn new() -> Self {
56        Self {
57            key_to_code: FxHashMap::default(),
58            code_to_key: Vec::new(),
59        }
60    }
61
62    /// Creates a new encoder with pre-allocated capacity.
63    ///
64    /// Use this when the approximate number of distinct keys is known
65    /// (e.g., number of ticker symbols, region names).
66    #[must_use]
67    pub fn with_capacity(capacity: usize) -> Self {
68        Self {
69            key_to_code: FxHashMap::with_capacity_and_hasher(capacity, FxBuildHasher),
70            code_to_key: Vec::with_capacity(capacity),
71        }
72    }
73
74    /// Encodes a key to its compact `u32` code (Ring 0 hot path).
75    ///
76    /// Returns `None` if the key has not been inserted. This is a pure
77    /// hash map lookup with no allocation.
78    #[inline]
79    #[must_use]
80    pub fn encode(&self, key: &[u8]) -> Option<u32> {
81        self.key_to_code.get(key).copied()
82    }
83
84    /// Encodes a key, inserting it if not already present (Ring 2 setup).
85    ///
86    /// Allocates on first insertion of a new key. Subsequent calls for the
87    /// same key return the existing code without allocation.
88    ///
89    /// # Errors
90    ///
91    /// Returns `None` if the dictionary already contains `u32::MAX` keys.
92    pub fn encode_or_insert(&mut self, key: &[u8]) -> Option<u32> {
93        if let Some(&code) = self.key_to_code.get(key) {
94            return Some(code);
95        }
96        let code = u32::try_from(self.code_to_key.len()).ok()?;
97        self.key_to_code.insert(key.to_vec(), code);
98        self.code_to_key.push(key.to_vec());
99        Some(code)
100    }
101
102    /// Decodes a compact code back to the original key bytes.
103    ///
104    /// Returns `None` if the code is out of range.
105    #[inline]
106    #[must_use]
107    pub fn decode(&self, code: u32) -> Option<&[u8]> {
108        self.code_to_key.get(code as usize).map(Vec::as_slice)
109    }
110
111    /// Returns the encoded key as a 4-byte little-endian slice.
112    ///
113    /// Useful for passing encoded keys to state stores that expect `&[u8]`.
114    #[inline]
115    #[must_use]
116    pub fn encode_to_bytes(&self, key: &[u8]) -> Option<[u8; 4]> {
117        self.encode(key).map(u32::to_le_bytes)
118    }
119
120    /// Returns the number of distinct keys in the dictionary.
121    #[inline]
122    #[must_use]
123    pub fn len(&self) -> usize {
124        self.code_to_key.len()
125    }
126
127    /// Returns true if the dictionary is empty.
128    #[inline]
129    #[must_use]
130    pub fn is_empty(&self) -> bool {
131        self.code_to_key.is_empty()
132    }
133
134    /// Clears the dictionary, removing all mappings.
135    pub fn clear(&mut self) {
136        self.key_to_code.clear();
137        self.code_to_key.clear();
138    }
139}
140
141impl Default for DictionaryKeyEncoder {
142    fn default() -> Self {
143        Self::new()
144    }
145}
146
147#[cfg(test)]
148mod tests {
149    use super::*;
150
151    #[test]
152    fn test_encode_decode_roundtrip() {
153        let mut enc = DictionaryKeyEncoder::new();
154        let code = enc.encode_or_insert(b"AAPL").unwrap();
155        assert_eq!(enc.decode(code), Some(b"AAPL".as_slice()));
156    }
157
158    #[test]
159    fn test_idempotent_insert() {
160        let mut enc = DictionaryKeyEncoder::new();
161        let c1 = enc.encode_or_insert(b"GOOG").unwrap();
162        let c2 = enc.encode_or_insert(b"GOOG").unwrap();
163        assert_eq!(c1, c2);
164        assert_eq!(enc.len(), 1);
165    }
166
167    #[test]
168    fn test_distinct_codes() {
169        let mut enc = DictionaryKeyEncoder::new();
170        let c1 = enc.encode_or_insert(b"AAPL").unwrap();
171        let c2 = enc.encode_or_insert(b"GOOG").unwrap();
172        let c3 = enc.encode_or_insert(b"MSFT").unwrap();
173        assert_ne!(c1, c2);
174        assert_ne!(c2, c3);
175        assert_eq!(enc.len(), 3);
176    }
177
178    #[test]
179    fn test_encode_unknown_returns_none() {
180        let enc = DictionaryKeyEncoder::new();
181        assert_eq!(enc.encode(b"UNKNOWN"), None);
182    }
183
184    #[test]
185    fn test_encode_to_bytes() {
186        let mut enc = DictionaryKeyEncoder::new();
187        let code = enc.encode_or_insert(b"TEST").unwrap();
188        let bytes = enc.encode_to_bytes(b"TEST").unwrap();
189        assert_eq!(bytes, code.to_le_bytes());
190        assert_eq!(enc.encode_to_bytes(b"MISSING"), None);
191    }
192
193    #[test]
194    fn test_decode_out_of_range() {
195        let enc = DictionaryKeyEncoder::new();
196        assert_eq!(enc.decode(0), None);
197        assert_eq!(enc.decode(999), None);
198    }
199
200    #[test]
201    fn test_with_capacity() {
202        let enc = DictionaryKeyEncoder::with_capacity(1000);
203        assert!(enc.is_empty());
204        assert_eq!(enc.len(), 0);
205    }
206
207    #[test]
208    fn test_clear() {
209        let mut enc = DictionaryKeyEncoder::new();
210        enc.encode_or_insert(b"A").unwrap();
211        enc.encode_or_insert(b"B").unwrap();
212        assert_eq!(enc.len(), 2);
213
214        enc.clear();
215        assert!(enc.is_empty());
216        assert_eq!(enc.encode(b"A"), None);
217    }
218
219    #[test]
220    fn test_sequential_codes() {
221        let mut enc = DictionaryKeyEncoder::new();
222        assert_eq!(enc.encode_or_insert(b"first"), Some(0));
223        assert_eq!(enc.encode_or_insert(b"second"), Some(1));
224        assert_eq!(enc.encode_or_insert(b"third"), Some(2));
225    }
226
227    #[test]
228    fn test_binary_keys() {
229        let mut enc = DictionaryKeyEncoder::new();
230        let key = [0x00, 0xFF, 0xDE, 0xAD];
231        let code = enc.encode_or_insert(&key).unwrap();
232        assert_eq!(enc.decode(code), Some(key.as_slice()));
233    }
234}