Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# ens-normalize-rs

![tests](https://github.com/sevenzing/ens-normalize-rs/actions/workflows/tests.yml/badge.svg)
![Crates.io Version](https://img.shields.io/crates/v/ens-normalize-rs)


A Rust implementation of ENS (Ethereum Name Service) name normalization.

Expand Down
2 changes: 1 addition & 1 deletion examples/tokens.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ fn main() {
let normalizer = EnsNameNormalizer::default();

let name = "Nàme‍🧙‍♂.eth";
let result = normalizer.tokenize(name).unwrap();
let result = normalizer.tokenize(name);

for token in result.tokens {
if token.is_disallowed() {
Expand Down
3 changes: 2 additions & 1 deletion src/code_points/specs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ impl CodePointsSpecs {
.collect();
let valid = compute_valid(&groups, &decomp);
let whole_map = compute_whole_map(spec.whole_map);

let emoji_str_list = emoji
.iter()
.map(|cps| utils::cps2str(cps))
Expand Down Expand Up @@ -95,7 +96,7 @@ impl CodePointsSpecs {
.unwrap_or(false)
}

pub fn finditer_emoji<'a>(&'a self, s: &'a str) -> impl Iterator<Item = regex::Match<'_>> {
pub fn finditer_emoji<'a>(&'a self, s: &'a str) -> impl Iterator<Item = regex::Match<'a>> {
self.emoji_regex.find_iter(s)
}

Expand Down
2 changes: 2 additions & 0 deletions src/code_points/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ impl ParsedGroup {
pub type ParsedWholeMap = HashMap<CodePoint, ParsedWholeValue>;

pub enum ParsedWholeValue {
#[allow(dead_code)]
Number(u32),
WholeObject(ParsedWholeObject),
}
Expand All @@ -59,6 +60,7 @@ impl TryFrom<spec_json::WholeValue> for ParsedWholeValue {
}

pub struct ParsedWholeObject {
#[allow(dead_code)]
pub v: HashSet<CodePoint>,
pub m: HashMap<CodePoint, HashSet<String>>,
}
Expand Down
13 changes: 9 additions & 4 deletions src/error.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
use crate::CodePoint;

/// Errors that can occur during processing of an ENS name.
#[derive(Debug, Clone, thiserror::Error, PartialEq, Eq)]
pub enum ProcessError {
#[error("contains visually confusing characters from multiple scripts: {0}")]
Confused(String),
#[error("contains visually confusing characters from {group1} and {group2} scripts")]
ConfusedGroups { group1: String, group2: String },
#[error("invalid character ('{sequence}') at position {index}: {inner}")]
CurrableError {
inner: CurrableError,
Expand All @@ -17,6 +14,7 @@ pub enum ProcessError {
DisallowedSequence(#[from] DisallowedSequence),
}

/// Errors that can be cured by the normalizer.
#[derive(Debug, Clone, thiserror::Error, PartialEq, Eq)]
pub enum CurrableError {
#[error("underscore in middle")]
Expand All @@ -33,8 +31,13 @@ pub enum CurrableError {
FencedTrailing,
#[error("consecutive sequence of fenced characters")]
FencedConsecutive,
#[error("contains visually confusing characters from multiple scripts: character with code '{cp}' not in group '{group_name}'")]
Confused { group_name: String, cp: CodePoint },
#[error("contains a disallowed character")]
Disallowed,
}

/// Errors regarding disallowed sequences.
#[derive(Debug, Clone, thiserror::Error, PartialEq, Eq)]
pub enum DisallowedSequence {
#[error("disallowed character: {0}")]
Expand All @@ -47,4 +50,6 @@ pub enum DisallowedSequence {
NsmTooMany,
#[error("nsm repeated")]
NsmRepeated,
#[error("contains visually confusing characters from {group1} and {group2} scripts")]
ConfusedGroups { group1: String, group2: String },
}
2 changes: 1 addition & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ mod tokens;
mod utils;
mod validate;

pub use code_points::*;
pub(crate) use code_points::*;
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Changing visibility to pub(crate) may impact external users

Modifying pub use code_points::*; to pub(crate) use code_points::*; restricts the visibility of code_points exports to within the crate. If any external code depends on these exports, this change can introduce breaking changes. Please verify that no public APIs are affected or consider deprecating before removal.

pub use error::{CurrableError, DisallowedSequence, ProcessError};
pub use normalizer::{beautify, normalize, process, tokenize, EnsNameNormalizer, ProcessedName};
pub use tokens::*;
Expand Down
20 changes: 17 additions & 3 deletions src/normalizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,16 @@ use crate::{
ProcessError, TokenizedName, ValidatedLabel,
};

/// Main struct to handle ENS name normalization including
/// tokenization, validation, beautification and normalization
#[derive(Default)]
pub struct EnsNameNormalizer {
specs: CodePointsSpecs,
}

/// Result of processing an ENS name.
/// Contains the tokenized name as an intermediate processing result, and the validated labels.
/// Validated labels can be normalized and beautified.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ProcessedName {
pub labels: Vec<ValidatedLabel>,
Expand All @@ -19,21 +24,26 @@ impl EnsNameNormalizer {
Self { specs }
}

pub fn tokenize(&self, input: impl AsRef<str>) -> Result<TokenizedName, ProcessError> {
/// Tokenize the input string, return a `TokenizedName` object with `Vec<EnsNameToken>` inside
pub fn tokenize(&self, input: impl AsRef<str>) -> TokenizedName {
TokenizedName::from_input(input.as_ref(), &self.specs, true)
}

/// Process the input string, return a `ProcessedName` object with `Vec<ValidatedLabel>` inside
/// This function will tokenize and validate the name. Processed name can be normalized and beautified.
pub fn process(&self, input: impl AsRef<str>) -> Result<ProcessedName, ProcessError> {
let input = input.as_ref();
let tokenized = self.tokenize(input)?;
let tokenized = self.tokenize(input);
let labels = validate_name(&tokenized, &self.specs)?;
Ok(ProcessedName { tokenized, labels })
}

/// Normalize the input string, return a normalized version of ENS name
pub fn normalize(&self, input: impl AsRef<str>) -> Result<String, ProcessError> {
self.process(input).map(|processed| processed.normalize())
}

    /// Beautify the input string, return a beautified version of ENS name
pub fn beautify(&self, input: impl AsRef<str>) -> Result<String, ProcessError> {
self.process(input).map(|processed| processed.beautify())
}
Expand All @@ -49,18 +59,22 @@ impl ProcessedName {
}
}

pub fn tokenize(input: impl AsRef<str>) -> Result<TokenizedName, ProcessError> {
/// `no-cache` version of [`EnsNameNormalizer::tokenize`]
pub fn tokenize(input: impl AsRef<str>) -> TokenizedName {
EnsNameNormalizer::default().tokenize(input)
}

/// `no-cache` version of [`EnsNameNormalizer::process`]
pub fn process(input: impl AsRef<str>) -> Result<ProcessedName, ProcessError> {
EnsNameNormalizer::default().process(input)
}

/// `no-cache` version of [`EnsNameNormalizer::normalize`]
pub fn normalize(input: impl AsRef<str>) -> Result<String, ProcessError> {
EnsNameNormalizer::default().normalize(input)
}

/// `no-cache` version of [`EnsNameNormalizer::beautify`]
pub fn beautify(input: impl AsRef<str>) -> Result<String, ProcessError> {
EnsNameNormalizer::default().beautify(input)
}
38 changes: 11 additions & 27 deletions src/tokens/tokenize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,25 +3,17 @@ use crate::{
CollapsedEnsNameToken, EnsNameToken, TokenDisallowed, TokenEmoji, TokenIgnored,
TokenMapped, TokenNfc, TokenStop, TokenValid,
},
utils, CodePoint, CodePointsSpecs, ProcessError,
utils, CodePoint, CodePointsSpecs,
};

/// Represents a full ENS name, including the original input and the sequence of tokens
/// vitalik.eth
/// ^^^^^^^^^^^
/// name
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TokenizedName {
pub input: String,
pub tokens: Vec<EnsNameToken>,
}

/// Represents a tokenized ENS label (part of a name separated by periods), including sequence of tokens
/// vitalik.eth
/// ^^^^^^^
/// label 1
/// ^^^
/// label 2
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TokenizedLabel<'a> {
pub tokens: &'a [EnsNameToken],
Expand All @@ -36,11 +28,7 @@ impl TokenizedName {
}

/// Tokenizes an input string, applying NFC normalization if requested.
pub fn from_input(
input: impl AsRef<str>,
specs: &CodePointsSpecs,
apply_nfc: bool,
) -> Result<Self, ProcessError> {
pub fn from_input(input: impl AsRef<str>, specs: &CodePointsSpecs, apply_nfc: bool) -> Self {
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Reconsider removing error handling from from_input method

Changing from_input to return Self instead of Result<Self, ProcessError> might suppress errors that can occur during tokenization. To prevent unexpected behaviors, consider maintaining error propagation to handle invalid inputs gracefully.

tokenize_name(input, specs, apply_nfc)
}

Expand Down Expand Up @@ -143,27 +131,23 @@ where
}
}

fn tokenize_name(
name: impl AsRef<str>,
specs: &CodePointsSpecs,
apply_nfc: bool,
) -> Result<TokenizedName, ProcessError> {
fn tokenize_name(name: impl AsRef<str>, specs: &CodePointsSpecs, apply_nfc: bool) -> TokenizedName {
let name = name.as_ref();
if name.is_empty() {
return Ok(TokenizedName::empty());
return TokenizedName::empty();
}
let tokens = tokenize_input(name, specs, apply_nfc)?;
Ok(TokenizedName {
let tokens = tokenize_input(name, specs, apply_nfc);
TokenizedName {
input: name.to_string(),
tokens,
})
}
}

fn tokenize_input(
input: impl AsRef<str>,
specs: &CodePointsSpecs,
apply_nfc: bool,
) -> Result<Vec<EnsNameToken>, ProcessError> {
) -> Vec<EnsNameToken> {
let input = input.as_ref();
let emojis = specs.finditer_emoji(input).collect::<Vec<_>>();

Expand Down Expand Up @@ -192,7 +176,7 @@ fn tokenize_input(
perform_nfc_transform(&mut tokens, specs);
}
collapse_valid_tokens(&mut tokens);
Ok(tokens)
tokens
}

fn perform_nfc_transform(tokens: &mut Vec<EnsNameToken>, specs: &CodePointsSpecs) {
Expand Down Expand Up @@ -470,7 +454,7 @@ mod tests {
#[case] expected: Vec<EnsNameToken>,
specs: &CodePointsSpecs,
) {
let tokens = tokenize_input(input, specs, apply_nfc).expect("tokenize");
let tokens = tokenize_input(input, specs, apply_nfc);
assert_eq!(tokens, expected);
}

Expand All @@ -494,7 +478,7 @@ mod tests {
#[case] expected: Vec<CollapsedEnsNameToken>,
specs: &CodePointsSpecs,
) {
let tokens = tokenize_input(input, specs, true).expect("tokenize");
let tokens = tokenize_input(input, specs, true);
let label = TokenizedLabel::from(&tokens);
let result = label.collapse_into_text_or_emoji();
assert_eq!(result, expected);
Expand Down
16 changes: 15 additions & 1 deletion src/tokens/types.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use crate::{constants, utils, CodePoint};

/// Represents a token in an ENS name.
/// see https://docs.ens.domains/ensip/15#tokenize for more details.
/// see <https://docs.ens.domains/ensip/15#tokenize> for more details.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum EnsNameToken {
Valid(TokenValid),
Expand Down Expand Up @@ -72,35 +72,48 @@ impl EnsNameToken {
}
}

/// A valid vector of code points
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TokenValid {
pub cps: Vec<CodePoint>,
}

/// A code point that should be mapped to a vector of code points
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TokenMapped {
pub cps: Vec<CodePoint>,
pub cp: CodePoint,
}

/// A code point that should be ignored
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TokenIgnored {
pub cp: CodePoint,
}

/// A code point that is disallowed
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TokenDisallowed {
pub cp: CodePoint,
}

/// Represents a stop token (.)
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TokenStop {
pub cp: CodePoint,
}

/// Represents a vector of code points that should be normalized using NFC
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TokenNfc {
pub cps: Vec<CodePoint>,
pub input: Vec<CodePoint>,
}

/// Represents a vector of code points of emoji
/// `cps_input` contains the vector of code points from the input string
/// `emoji` contains vector of beautified emoji code points
/// `cps_no_fe0f` contains vector of code points of emoji without `FE0F`
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TokenEmoji {
pub input: String,
Expand All @@ -109,6 +122,7 @@ pub struct TokenEmoji {
pub cps_no_fe0f: Vec<CodePoint>,
}

/// Represents a collapsed token in an ENS name: either text or emoji
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CollapsedEnsNameToken {
Text(TokenValid),
Expand Down
Loading
Loading