diff --git a/build.rs b/build.rs index 80286bd..a1db064 100644 --- a/build.rs +++ b/build.rs @@ -1,9 +1,13 @@ +use self::shared::ModifierSet; use std::fmt::Write; use std::iter::Peekable; use std::path::Path; type StrResult = Result; +#[path = "src/shared.rs"] +mod shared; + /// A module of definitions. struct Module<'a>(Vec<(&'a str, Binding<'a>)>); @@ -29,7 +33,7 @@ enum Def<'a> { /// A symbol, either a leaf or with modifiers. enum Symbol<'a> { Single(char), - Multi(Vec<(&'a str, char)>), + Multi(Vec<(ModifierSet<&'a str>, char)>), } /// A single line during parsing. @@ -40,7 +44,7 @@ enum Line<'a> { ModuleStart(&'a str), ModuleEnd, Symbol(&'a str, Option), - Variant(&'a str, char), + Variant(ModifierSet<&'a str>, char), } fn main() { @@ -110,7 +114,7 @@ fn tokenize(line: &str) -> StrResult { validate_ident(part)?; } let c = decode_char(tail.ok_or("missing char")?)?; - Line::Variant(rest, c) + Line::Variant(ModifierSet::new_unchecked(rest), c) } else { validate_ident(head)?; let c = tail.map(decode_char).transpose()?; @@ -167,7 +171,7 @@ fn parse<'a>( let symbol = if variants.len() > 0 { if let Some(c) = c { - variants.insert(0, ("", c)); + variants.insert(0, (ModifierSet::default(), c)); } Symbol::Multi(variants) } else { diff --git a/src/lib.rs b/src/lib.rs index ae64ee1..4bd101d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,16 @@ -/*! -Human-friendly notation for Unicode symbols. -*/ +//! Human-friendly notation for Unicode symbols. +//! +//! ## Model +//! A [`Symbol`] is a collection of one or more _variants_. +//! Each variant is identified by a set of [_modifiers_](ModifierSet) +//! and has a single character as its value. +//! The modifiers themselves can in principle be any non-empty strings +//! that don't contain the character `.`, but codex only defines +//! ones that are entirely made of ASCII alphabetical characters. + +pub use self::shared::ModifierSet; + +mod shared; /// A module of definitions. 
#[derive(Debug, Copy, Clone)] @@ -52,7 +62,41 @@ pub enum Symbol { /// A symbol without modifiers. Single(char), /// A symbol with named modifiers. The symbol defaults to its first variant. - Multi(&'static [(&'static str, char)]), + Multi(&'static [(ModifierSet<&'static str>, char)]), +} + +impl Symbol { + /// Get the symbol's character for a given set of modifiers, or `None` if no variant matches. + pub fn get(&self, modifs: ModifierSet<&str>) -> Option { + match self { + Self::Single(c) => modifs.is_empty().then_some(*c), + Self::Multi(list) => modifs.best_match_in(list.iter().copied()), + } + } + + /// The variants of this symbol, each as a modifier set paired with its character. + pub fn variants(&self) -> impl Iterator, char)> { + enum Variants { + Single(std::iter::Once), + Multi(std::slice::Iter<'static, (ModifierSet<&'static str>, char)>), + } + let mut iter = match self { + Self::Single(c) => Variants::Single(std::iter::once(*c)), + Self::Multi(sl) => Variants::Multi(sl.iter()), + }; + std::iter::from_fn(move || match &mut iter { + Variants::Single(iter) => Some((ModifierSet::default(), iter.next()?)), + Variants::Multi(iter) => iter.next().copied(), + }) + } + + /// Possible modifiers for this symbol. + pub fn modifiers(&self) -> impl Iterator + '_ { + self.variants() + .flat_map(|(m, _)| m.into_iter()) + .collect::>() + .into_iter() + } } /// A module that contains the other top-level modules. diff --git a/src/shared.rs b/src/shared.rs new file mode 100644 index 0000000..f46fb09 --- /dev/null +++ b/src/shared.rs @@ -0,0 +1,214 @@ +use std::ops::Deref; + +/// A set of modifiers. +/// +/// Beware: The [`Eq`] and [`Hash`] implementations are dependent on the ordering +/// of the modifiers, contrary to what one would expect of a set. +/// To test for set-wise equality, use [`iter`](Self::iter) and collect into a +/// true set type like [`HashSet`](std::collections::HashSet).
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
// note: the visibility needs to be `pub(crate)`,
// since build.rs outputs `ModifierSet(...)`
pub struct ModifierSet<S>(pub(crate) S);

impl<S: Default> Default for ModifierSet<S> {
    /// Construct the default modifier set.
    ///
    /// This is typically the empty set,
    /// though the remark from [`Self::new_unchecked`] applies
    /// since `S::default()` could technically be anything.
    fn default() -> Self {
        Self(S::default())
    }
}

impl<S: Deref<Target = str>> ModifierSet<S> {
    /// Convert the underlying string to a slice.
    pub fn as_deref(&self) -> ModifierSet<&str> {
        ModifierSet(&self.0)
    }

    /// Get the string of modifiers separated by `.`.
    pub fn as_str(&self) -> &str {
        &self.0
    }

    /// Construct a modifier set from a string,
    /// where modifiers are separated by the character `.`.
    ///
    /// It is not unsafe to use this function wrongly, but it can produce
    /// unexpected results down the line. Correct usage should ensure that
    /// `s` does not contain any empty modifiers (i.e. the sequence `..`)
    /// and that no modifier occurs twice.
    pub fn new_unchecked(s: S) -> Self {
        Self(s)
    }

    /// Whether `self` is empty.
    pub fn is_empty(&self) -> bool {
        self.0.is_empty()
    }

    /// Add a modifier to the set, without checking that it is a valid modifier.
    ///
    /// It is not unsafe to use this method wrongly, but that can produce
    /// unexpected results down the line. Correct usage should ensure that
    /// `m` is not empty and doesn't contain the character `.`.
    pub fn add_unchecked(&mut self, m: &str)
    where
        S: for<'a> std::ops::AddAssign<&'a str>,
    {
        // A separator is only needed between modifiers, never in front of
        // the first one.
        if !self.0.is_empty() {
            self.0 += ".";
        }
        self.0 += m;
    }

    /// Iterate over the list of modifiers in an arbitrary order.
    pub fn iter(&self) -> impl Iterator<Item = &str> {
        self.into_iter()
    }

    /// Whether the set contains the modifier `m`.
    pub fn contains(&self, m: &str) -> bool {
        self.iter().any(|lhs| lhs == m)
    }

    /// Whether all modifiers in `self` are also present in `other`.
    pub fn is_subset(&self, other: ModifierSet<&str>) -> bool {
        self.iter().all(|m| other.contains(m))
    }

    /// Find the best match from the list.
    ///
    /// To be considered a match, the modifier set must be a superset of
    /// (or equal to) `self`. Among different matches, the best one is selected
    /// by the following two criteria (in order):
    /// 1. Number of modifiers in common with `self` (more is better).
    /// 2. Total number of modifiers (fewer is better).
    ///
    /// If there are multiple best matches, the first of them is returned.
    /// Returns `None` if no entry of `variants` is a match.
    pub fn best_match_in<'a, T>(
        &self,
        variants: impl Iterator<Item = (ModifierSet<&'a str>, T)>,
    ) -> Option<T> {
        // The best candidate seen so far, stored together with its score so
        // that the two can never get out of sync.
        let mut best: Option<((usize, std::cmp::Reverse<usize>), T)> = None;

        for (set, value) in variants {
            // Only supersets of `self` qualify as matches.
            if !self.is_subset(set) {
                continue;
            }

            let mut matching = 0;
            let mut total = 0;
            for modifier in set.iter() {
                if self.contains(modifier) {
                    matching += 1;
                }
                total += 1;
            }

            // Tuples compare lexicographically: more matching modifiers win
            // first, then `Reverse` makes a smaller total compare as greater.
            let score = (matching, std::cmp::Reverse(total));

            // Strict comparison keeps the earliest candidate on ties.
            let improves = match &best {
                Some((top, _)) => score > *top,
                None => true,
            };
            if improves {
                best = Some((score, value));
            }
        }

        best.map(|(_, value)| value)
    }
}

impl<'a, S: Deref<Target = str>> IntoIterator for &'a ModifierSet<S> {
    type Item = &'a str;
    type IntoIter = std::str::Split<'a, char>;

    /// Iterate over the list of modifiers in an arbitrary order.
    fn into_iter(self) -> Self::IntoIter {
        // Borrow as `ModifierSet<&str>` and reuse the by-value implementation
        // so the empty-set handling lives in exactly one place.
        self.as_deref().into_iter()
    }
}

impl<'a> IntoIterator for ModifierSet<&'a str> {
    type Item = &'a str;
    type IntoIter = std::str::Split<'a, char>;

    /// Iterate over the list of modifiers in an arbitrary order.
    fn into_iter(self) -> Self::IntoIter {
        let mut iter = self.0.split('.');
        if self.0.is_empty() {
            // Splitting an empty string yields one empty item; consume it so
            // that the empty set iterates over zero modifiers.
            let _ = iter.next();
        }
        iter
    }
}

#[cfg(test)]
mod tests {
    type ModifierSet = super::ModifierSet<&'static str>;

    #[test]
    fn default_is_empty() {
        assert!(ModifierSet::default().is_empty());
    }

    #[test]
    fn iter_count() {
        assert_eq!(ModifierSet::default().iter().count(), 0);
        assert_eq!(ModifierSet::new_unchecked("a").iter().count(), 1);
        assert_eq!(ModifierSet::new_unchecked("a.b").iter().count(), 2);
        assert_eq!(ModifierSet::new_unchecked("a.b.c").iter().count(), 3);
    }

    #[test]
    fn subset() {
        assert!(
            ModifierSet::new_unchecked("a").is_subset(ModifierSet::new_unchecked("a.b"))
        );
        assert!(
            ModifierSet::new_unchecked("a").is_subset(ModifierSet::new_unchecked("b.a"))
        );
        assert!(ModifierSet::new_unchecked("a.b")
            .is_subset(ModifierSet::new_unchecked("b.c.a")));
    }

    #[test]
    fn best_match() {
        // a candidate that is not a superset of self ("a.c") is skipped
        assert_eq!(
            ModifierSet::new_unchecked("a.b").best_match_in(
                [
                    (ModifierSet::new_unchecked("a.c"), 1),
                    (ModifierSet::new_unchecked("a.b"), 2),
                ]
                .into_iter()
            ),
            Some(2)
        );
        // among matches, fewer modifiers in general is better
        assert_eq!(
            ModifierSet::new_unchecked("a").best_match_in(
                [
                    (ModifierSet::new_unchecked("a"), 1),
                    (ModifierSet::new_unchecked("a.b"), 2),
                ]
                .into_iter()
            ),
            Some(1)
        );
        // a shorter candidate ("a") never wins if it is not a superset of self
        assert_eq!(
            ModifierSet::new_unchecked("a.b").best_match_in(
                [
                    (ModifierSet::new_unchecked("a"), 1),
                    (ModifierSet::new_unchecked("a.b"), 2),
                ]
                .into_iter()
            ),
            Some(2)
        );
        // among multiple best matches, the first one is returned
        assert_eq!(
            ModifierSet::default().best_match_in(
                [
                    (ModifierSet::new_unchecked("a"), 1),
                    (ModifierSet::new_unchecked("b"), 2),
                ]
                .into_iter()
            ),
            Some(1)
        );
    }
}