hstr/
lib.rs

1#![cfg_attr(feature = "atom_size_128", feature(integer_atomics))]
2//! See [Atom] for more information.
3
4use core::str;
5use std::{
6    borrow::Borrow,
7    fmt::{Debug, Display},
8    hash::Hash,
9    mem::{self, forget, transmute, ManuallyDrop},
10    num::NonZeroU8,
11    ops::Deref,
12    str::from_utf8_unchecked,
13};
14
15use debug_unreachable::debug_unreachable;
16use once_cell::sync::Lazy;
17
18pub use crate::dynamic::{global_atom_store_gc, AtomStore};
19use crate::{
20    macros::{get_hash, impl_from_alias, partial_eq},
21    tagged_value::TaggedValue,
22};
23
24mod dynamic;
25mod global_store;
26mod macros;
27mod tagged_value;
28#[cfg(test)]
29mod tests;
30pub mod wtf8;
31mod wtf8_atom;
32
33pub use wtf8_atom::Wtf8Atom;
34
35/// An immutable string which is cheap to clone, compare, hash, and has small
36/// size.
37///
38/// # Usecase
39///
40/// This type is designed for the compilers or build tools like a bundler
41/// written in Rust. String interning is much costly than simply allocating a
42/// string, but in compilers, hashing and comparison of strings are very
43/// frequent operations for some of strings, and the other strings are mostly
44/// just passed around. According to DoD, we should optimize types
45/// for operations that occur frequently, and this type is the result.
46///
47/// # Features
48///
49/// ## No mutex on creation and destruction.
50///
51/// This type also considers concurrent processing of AST nodes. Parsers
52/// generates lots of [Atom]s, so the creation of [Atom] should never involve a
53/// global mutex, because parsers are embarrassingly parallel. Also, the AST
54/// nodes are typically dropped in parallel. So [Drop] implementation of [Atom]
55/// should not involve a global mutex, too.
56///
57///
58/// ## Small size (One `u64`)
59///
60/// The most of strings are simply passed around, so the size of [Atom] should
61/// be small as possible.
62///
63/// ```rust
64/// # use std::mem::size_of;
65/// # if !cfg!(feature = "atom_size_128") {
66/// use hstr::Atom;
67/// assert!(size_of::<Atom>() == size_of::<u64>());
68/// assert!(size_of::<Option<Atom>>() == size_of::<u64>());
69/// # }
70/// ````
71///
72///
73/// ## Fast equality check (in most cases)
74///
75/// Equality comparison is O(1) in most cases. If two atoms are from the same
76/// [AtomStore], or they are from different stores but they are
77/// [`AtomStore::merge`]d, they are compared by numeric equality.
78///
79/// If two strings are created from different [AtomStore]s, they are compared
80/// using `strcmp` by default. But `hstr` allows you to make `==` faster -
81/// [`AtomStore::merge`].
82///
83///
84/// ## Fast [Hash] implementation
85///
86/// [Atom] precompute the hash value of long strings when they are created, so
87/// it is `O(1)` to compute hash.
88///
89///
90/// ## Small strings as inline data
91///
92/// Small strings are stored in the [Atom] itself without any allocation.
93///
94///
95/// ## Creating atoms
96///
97/// If you are working on a module which creates lots of [Atom]s, you are
98/// recommended to use [AtomStore] API because it's faster. But if you are not,
99/// you can use global APIs for convenience.
100///
101/// ## Dealloc
102///
103/// - Atoms stored in the local `AtomStore` have the same lifetime as the store
104///   itself, and will be deallocated when the store is dropped.
105/// - Atoms created via the `atom!` macro or `String::into` are stored in the
106///   global atom store. By default, these atoms are never deallocated. To clean
107///   up unused atoms, call [global_atom_store_gc].
108#[repr(transparent)]
109pub struct Atom {
110    // If this Atom is a dynamic one, this is *const Entry
111    unsafe_data: TaggedValue,
112}
113
114#[doc(hidden)]
115pub type CachedAtom = Lazy<Atom>;
116
117#[doc(hidden)]
118pub const fn inline_atom(s: &str) -> Option<Atom> {
119    dynamic::inline_atom(s)
120}
121
122/// Create an atom from a string literal. This atom is never dropped.
123#[macro_export]
124macro_rules! atom {
125    ($s:expr) => {{
126        const INLINE: ::core::option::Option<$crate::Atom> = $crate::inline_atom($s);
127        // This condition can be evaluated at compile time to enable inlining as a
128        // simple constant
129        if INLINE.is_some() {
130            INLINE.unwrap()
131        } else {
132            // Otherwise we use a
133            #[inline(never)]
134            fn get_atom() -> $crate::Atom {
135                static CACHE: $crate::CachedAtom =
136                    $crate::CachedAtom::new(|| $crate::Atom::from($s));
137
138                (*CACHE).clone()
139            }
140
141            get_atom()
142        }
143    }};
144}
145
146impl Default for Atom {
147    #[inline(never)]
148    fn default() -> Self {
149        atom!("")
150    }
151}
152
153/// Immutable, so it's safe to be shared between threads
154unsafe impl Send for Atom {}
155
156/// Immutable, so it's safe to be shared between threads
157unsafe impl Sync for Atom {}
158
159impl Display for Atom {
160    #[inline]
161    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
162        Display::fmt(self.as_str(), f)
163    }
164}
165
166impl Debug for Atom {
167    #[inline]
168    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
169        Debug::fmt(self.as_str(), f)
170    }
171}
172
173#[cfg(feature = "serde")]
174impl serde::ser::Serialize for Atom {
175    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
176    where
177        S: serde::ser::Serializer,
178    {
179        serializer.serialize_str(self)
180    }
181}
182
183#[cfg(feature = "serde")]
184impl<'de> serde::de::Deserialize<'de> for Atom {
185    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
186    where
187        D: serde::Deserializer<'de>,
188    {
189        String::deserialize(deserializer).map(Self::new)
190    }
191}
192const DYNAMIC_TAG: u8 = 0b_00;
193const INLINE_TAG: u8 = 0b_01; // len in upper nybble
194const INLINE_TAG_INIT: NonZeroU8 = unsafe { NonZeroU8::new_unchecked(INLINE_TAG) };
195const TAG_MASK: u8 = 0b_11;
196const LEN_OFFSET: usize = 4;
197const LEN_MASK: u8 = 0xf0;
198
199// const STATIC_SHIFT_BITS: usize = 32;
200
201impl Atom {
202    #[inline(always)]
203    pub fn new<S>(s: S) -> Self
204    where
205        Self: From<S>,
206    {
207        Self::from(s)
208    }
209
210    #[inline(always)]
211    fn tag(&self) -> u8 {
212        self.unsafe_data.tag() & TAG_MASK
213    }
214
215    /// Return true if this is a dynamic Atom.
216    #[inline(always)]
217    fn is_dynamic(&self) -> bool {
218        self.tag() == DYNAMIC_TAG
219    }
220}
221
222impl Atom {
223    fn from_mutated_str<F: FnOnce(&mut str)>(s: &str, f: F) -> Self {
224        let mut buffer = mem::MaybeUninit::<[u8; 64]>::uninit();
225        let buffer = unsafe { &mut *buffer.as_mut_ptr() };
226
227        if let Some(buffer_prefix) = buffer.get_mut(..s.len()) {
228            buffer_prefix.copy_from_slice(s.as_bytes());
229            let as_str = unsafe { ::std::str::from_utf8_unchecked_mut(buffer_prefix) };
230            f(as_str);
231            Atom::from(&*as_str)
232        } else {
233            let mut string = s.to_owned();
234            f(&mut string);
235            Atom::from(string)
236        }
237    }
238
239    /// Like [`to_ascii_uppercase`].
240    ///
241    /// [`to_ascii_uppercase`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.to_ascii_uppercase
242    pub fn to_ascii_uppercase(&self) -> Self {
243        for (i, b) in self.bytes().enumerate() {
244            if let b'a'..=b'z' = b {
245                return Atom::from_mutated_str(self, |s| s[i..].make_ascii_uppercase());
246            }
247        }
248        self.clone()
249    }
250
251    /// Like [`to_ascii_lowercase`].
252    ///
253    /// [`to_ascii_lowercase`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.to_ascii_lowercase
254    pub fn to_ascii_lowercase(&self) -> Self {
255        for (i, b) in self.bytes().enumerate() {
256            if let b'A'..=b'Z' = b {
257                return Atom::from_mutated_str(self, |s| s[i..].make_ascii_lowercase());
258            }
259        }
260        self.clone()
261    }
262}
263
264impl Atom {
265    fn get_hash(&self) -> u64 {
266        get_hash!(self)
267    }
268
269    fn as_str(&self) -> &str {
270        match self.tag() {
271            DYNAMIC_TAG => unsafe {
272                let item = crate::dynamic::deref_from(self.unsafe_data);
273                from_utf8_unchecked(transmute::<&[u8], &'static [u8]>(&item.slice))
274            },
275            INLINE_TAG => {
276                let len = (self.unsafe_data.tag() & LEN_MASK) >> LEN_OFFSET;
277                let src = self.unsafe_data.data();
278                unsafe { std::str::from_utf8_unchecked(&src[..(len as usize)]) }
279            }
280            _ => unsafe { debug_unreachable!() },
281        }
282    }
283}
284
285#[cfg(test)]
286impl Atom {
287    pub(crate) fn ref_count(&self) -> usize {
288        match self.tag() {
289            DYNAMIC_TAG => {
290                let ptr = unsafe { crate::dynamic::deref_from(self.unsafe_data) };
291
292                triomphe::ThinArc::strong_count(&ptr.0)
293            }
294            _ => 1,
295        }
296    }
297}
298
299impl PartialEq for Atom {
300    #[inline(never)]
301    fn eq(&self, other: &Self) -> bool {
302        partial_eq!(self, other);
303
304        // If the store is different, the string may be the same, even though the
305        // `unsafe_data` is different
306        self.as_str() == other.as_str()
307    }
308}
309
310impl Eq for Atom {}
311
312impl Hash for Atom {
313    #[inline(always)]
314    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
315        state.write_u64(self.get_hash());
316    }
317}
318
319impl Drop for Atom {
320    #[inline(always)]
321    fn drop(&mut self) {
322        if self.is_dynamic() {
323            unsafe { drop(crate::dynamic::restore_arc(self.unsafe_data)) }
324        }
325    }
326}
327
328impl Clone for Atom {
329    #[inline(always)]
330    fn clone(&self) -> Self {
331        Self::from_alias(self.unsafe_data)
332    }
333}
334
335impl_from_alias!(Atom);
336
337impl Deref for Atom {
338    type Target = str;
339
340    #[inline(always)]
341    fn deref(&self) -> &Self::Target {
342        self.as_str()
343    }
344}
345
346impl AsRef<str> for Atom {
347    #[inline(always)]
348    fn as_ref(&self) -> &str {
349        self.as_str()
350    }
351}
352
353impl PartialEq<str> for Atom {
354    #[inline]
355    fn eq(&self, other: &str) -> bool {
356        self.as_str() == other
357    }
358}
359
360impl PartialEq<&'_ str> for Atom {
361    #[inline]
362    fn eq(&self, other: &&str) -> bool {
363        self.as_str() == *other
364    }
365}
366
367impl PartialEq<Atom> for str {
368    #[inline]
369    fn eq(&self, other: &Atom) -> bool {
370        self == other.as_str()
371    }
372}
373
374impl Borrow<Wtf8Atom> for Atom {
375    #[inline(always)]
376    fn borrow(&self) -> &Wtf8Atom {
377        // SAFETY:
378        // 1. Wtf8Atom is #[repr(transparent)] over TaggedValue
379        // 2. Atom is #[repr(transparent)] over TaggedValue
380        // 3. hstr::Atom and hstr::Wtf8Atom share the same TaggedValue
381        const _: () = assert!(std::mem::size_of::<Atom>() == std::mem::size_of::<Wtf8Atom>());
382        const _: () = assert!(std::mem::align_of::<Atom>() == std::mem::align_of::<Wtf8Atom>());
383        unsafe { transmute::<&Atom, &Wtf8Atom>(self) }
384    }
385}
386
387/// NOT A PUBLIC API
388#[cfg(feature = "rkyv")]
389impl rkyv::Archive for Atom {
390    type Archived = rkyv::string::ArchivedString;
391    type Resolver = rkyv::string::StringResolver;
392
393    #[allow(clippy::unit_arg)]
394    unsafe fn resolve(&self, pos: usize, resolver: Self::Resolver, out: *mut Self::Archived) {
395        rkyv::string::ArchivedString::resolve_from_str(self, pos, resolver, out)
396    }
397}
398
399/// NOT A PUBLIC API
400#[cfg(feature = "rkyv")]
401impl<S: rkyv::ser::Serializer + ?Sized> rkyv::Serialize<S> for Atom {
402    fn serialize(&self, serializer: &mut S) -> Result<Self::Resolver, S::Error> {
403        String::serialize(&self.to_string(), serializer)
404    }
405}
406
407/// NOT A PUBLIC API
408#[cfg(feature = "rkyv")]
409impl<D> rkyv::Deserialize<Atom, D> for rkyv::string::ArchivedString
410where
411    D: ?Sized + rkyv::Fallible,
412{
413    fn deserialize(&self, deserializer: &mut D) -> Result<Atom, <D as rkyv::Fallible>::Error> {
414        let s: String = self.deserialize(deserializer)?;
415
416        Ok(Atom::new(s))
417    }
418}
419
420impl Atom {
421    /// Converts a WTF-8 encoded [Wtf8Atom] to a regular UTF-8 [Atom] without
422    /// validation.
423    ///
424    /// # Safety
425    ///
426    /// The caller must ensure that the WTF-8 atom contains only valid UTF-8
427    /// data (no unpaired surrogates). This function performs no validation
428    /// and will create an invalid `Atom` if the input contains unpaired
429    /// surrogates.
430    ///
431    /// This is a zero-cost conversion that preserves all internal optimizations
432    /// (inline storage, precomputed hashes, etc.) since both types have
433    /// identical internal representation.
434    pub unsafe fn from_wtf8_unchecked(s: Wtf8Atom) -> Self {
435        let s = ManuallyDrop::new(s);
436        Atom {
437            unsafe_data: s.unsafe_data,
438        }
439    }
440}
441
442#[cfg(test)]
443mod macro_tests {
444
445    use super::*;
446    #[test]
447    fn test_atom() {
448        // Test enough to exceed the small string optimization
449        assert_eq!(atom!(""), Atom::default());
450        assert_eq!(atom!(""), Atom::from(""));
451        assert_eq!(atom!("a"), Atom::from("a"));
452        assert_eq!(atom!("ab"), Atom::from("ab"));
453        assert_eq!(atom!("abc"), Atom::from("abc"));
454        assert_eq!(atom!("abcd"), Atom::from("abcd"));
455        assert_eq!(atom!("abcde"), Atom::from("abcde"));
456        assert_eq!(atom!("abcdef"), Atom::from("abcdef"));
457        assert_eq!(atom!("abcdefg"), Atom::from("abcdefg"));
458        assert_eq!(atom!("abcdefgh"), Atom::from("abcdefgh"));
459        assert_eq!(atom!("abcdefghi"), Atom::from("abcdefghi"));
460    }
461
462    #[test]
463    fn test_inline_atom() {
464        // This is a silly test, just asserts that rustc can evaluate the pattern from
465        // our macro in a constant context.
466        const STR: Atom = {
467            let inline = inline_atom("hello");
468            if inline.is_some() {
469                inline.unwrap()
470            } else {
471                unreachable!();
472            }
473        };
474        assert_eq!(STR, Atom::from("hello"));
475    }
476}