diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fe3508d..9a7d4d3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -7,5 +7,5 @@ jobs: steps: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@stable - - run: cargo build + - run: cargo build --features unicode_names2 - run: cargo test diff --git a/Cargo.lock b/Cargo.lock index b0bfd13..33d1268 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,226 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + [[package]] name = "codex" version = "0.1.0" +dependencies = [ + "unicode_names2", +] + +[[package]] +name = "getopts" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5" +dependencies = [ + "unicode-width", +] + +[[package]] +name = "getrandom" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "libc" +version = "0.2.164" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "433bfe06b8c75da9b2e3fbea6e5329ff87748f0b144ef75306e674c3f6f7c13f" + +[[package]] +name = "log" +version = "0.4.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" + +[[package]] +name = "phf" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc" +dependencies = [ + "phf_shared", +] + +[[package]] +name = "phf_codegen" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a" +dependencies = [ + "phf_generator", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0" +dependencies = [ + "phf_shared", + "rand", +] + +[[package]] +name = "phf_shared" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b" +dependencies = [ + "siphasher", +] + +[[package]] +name = "ppv-lite86" +version = "0.2.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "proc-macro2" +version = "1.0.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "siphasher" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" + +[[package]] +name = "syn" +version = "2.0.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" + +[[package]] +name = "unicode-width" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" + +[[package]] +name = "unicode_names2" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1673eca9782c84de5f81b82e4109dcfb3611c8ba0d52930ec4a9478f547b2dd" +dependencies = [ + "phf", + "unicode_names2_generator", +] + +[[package]] +name = "unicode_names2_generator" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b91e5b84611016120197efd7dc93ef76774f4e084cd73c9fb3ea4a86c570c56e" +dependencies = [ + "getopts", + "log", + "phf_codegen", + "rand", +] + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "zerocopy" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" +dependencies = [ + "byteorder", + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/Cargo.toml b/Cargo.toml index f21d3db..89f5b50 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,3 +9,6 @@ readme = "README.md" license = "Apache-2.0" categories = ["encoding", "text-processing"] keywords = ["unicode", "symbols"] + +[build-dependencies] +unicode_names2 = { version = "1.3.0", optional = true } diff --git a/build.rs b/build.rs index 2e2fd45..8d199ed 100644 --- a/build.rs +++ b/build.rs @@ -86,8 +86,8 @@ fn tokenize(line: &str) -> StrResult { return Ok(Line::Blank); } - let (head, tail) = match line.split_once(' ') { - Some((a, b)) => (a, Some(b)), + let (head, tail) = match line.split_once(char::is_whitespace) { + Some((a, b)) => (a, Some(b.trim_start())), None => (line, None), }; @@ -121,10 +121,29 @@ fn validate_ident(string: &str) -> StrResult<()> { /// Extracts either a single char or parses a U+XXXX escape. fn decode_char(text: &str) -> StrResult { if let Some(hex) = text.strip_prefix("U+") { - u32::from_str_radix(hex, 16) + let (hex, name) = match hex.split_once(char::is_whitespace) { + Some((hex, name)) => (hex, Some(name.trim_start())), + None => (hex, None), + }; + + let ch = u32::from_str_radix(hex, 16) .ok() - .and_then(|n| char::try_from(n).ok()) - .ok_or_else(|| format!("invalid unicode escape {text:?}")) + .and_then(|n| char::from_u32(n)) + .ok_or_else(|| format!("invalid unicode escape {hex:?}"))?; + + #[cfg_attr(not(feature = "unicode_names2"), expect(unused_variables))] + if let Some(name) = name { + #[cfg(feature = "unicode_names2")] + if unicode_names2::character(name) != Some(ch) { + return Err(format!( + "Incorrect name supplied for character U+{hex}: '{name}'{}", + unicode_names2::name(ch) + .map_or("".to_string(), |name| format!(" (expected {name})")) + )); + } + } + + Ok(ch) } else { let mut chars = text.chars(); match (chars.next(), chars.next()) { diff --git a/src/modules/sym.txt b/src/modules/sym.txt index 481122b..7cea030 100644 --- a/src/modules/sym.txt +++ b/src/modules/sym.txt @@ -1,25 +1,25 @@ -// Control. -wj U+2060 -zwj U+200D -zwnj U+200C -zws U+200B -lrm U+200E -rlm U+200F +// Layout control. +wj U+2060 Word joiner +zwj U+200D Zero width joiner +zwnj U+200C Zero width non-joiner +zws U+200B Zero width space +lrm U+200E Left-to-right mark +rlm U+200F Right-to-left mark // Spaces. -space U+20 - .nobreak U+A0 - .nobreak.narrow U+202F - .en U+2002 - .quad U+2003 - .third U+2004 - .quarter U+2005 - .sixth U+2006 - .med U+205F - .fig U+2007 - .punct U+2008 - .thin U+2009 - .hair U+200A +space U+20 Space + .nobreak U+A0 No-break space + .nobreak.narrow U+202F Narrow no-break space + .en U+2002 En space + .quad U+2003 Em space + .third U+2004 Three-per-em space + .quarter U+2005 Four-per-em space + .sixth U+2006 Six-per-em space + .fig U+2007 Figure space + .punct U+2008 Punctuation space + .thin U+2009 Thin space + .hair U+200A Hair space + .med U+205F Medium mathematical space // Delimiters. paren @@ -30,9 +30,9 @@ paren .t ⏜ .b ⏝ brace - .l U+7B + .l U+7B Left curly bracket .l.double ⦃ - .r U+7D + .r U+7D Right curly bracket .r.double ⦄ .t ⏞ .b ⏟ @@ -130,14 +130,14 @@ dash .wave.double 〰 dot .op ⋅ - .basic U+2E + .basic U+2E Full stop .c · .circle ⊙ .circle.big ⨀ .square ⊡ .double ¨ - .triple U+20DB - .quad U+20DC + .triple U+20DB Combining three dots above + .quad U+20DC Combining four dots above excl ! .double ‼ .inv ¡ @@ -149,10 +149,10 @@ quest ? interrobang ‽ hash # hyph ‐ - .minus U+2D - .nobreak U+2011 + .minus U+2D Hyphen-minus + .nobreak U+2011 Non-breaking hyphen + .soft U+AD Soft hyphen .point ‧ - .soft U+AD numero № percent % permille ‰