taler-rust

GNU Taler code in Rust. Largely core banking integrations.
Log | Files | Refs | Submodules | README | LICENSE

commit 2fb89d3901e0177c1e1130c6306b4a9a22a6f2ec
parent bb173f9b41774547e2ea7b37b9b0c9f3dbb1aa34
Author: Antoine A <>
Date:   Wed, 25 Dec 2024 02:52:18 +0100

taler-common: improve and optimize subject parsing

Diffstat:
MCargo.lock | 1-
Mtaler-common/Cargo.toml | 1-
Mtaler-common/benches/subject.rs | 56++++++++++++++++++++++++++++++--------------------------
Mtaler-common/src/subject.rs | 296+++++++++++++++++++++++++++++++++++++++++++++++++------------------------------
Mtaler-common/src/types.rs | 2+-
5 files changed, 216 insertions(+), 140 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock @@ -2414,7 +2414,6 @@ dependencies = [ "fastrand", "jiff", "rand", - "regex", "serde", "serde_json", "serde_urlencoded", diff --git a/taler-common/Cargo.toml b/taler-common/Cargo.toml @@ -9,7 +9,6 @@ rand = "0.8" fastrand = "2.2.0" serde_urlencoded = "0.7" jiff = { version = "0.1", default-features = false, features = ["std"] } -regex = { version = "1.11", default-features = false, features = ["perf"] } ed25519-dalek = { version = "2.1.1", default-features = false } serde = { workspace = true, features = ["derive"] } serde_json = { workspace = true, features = ["raw_value"] } diff --git a/taler-common/benches/subject.rs b/taler-common/benches/subject.rs @@ -15,15 +15,26 @@ */ use criterion::{black_box, criterion_group, criterion_main, Criterion}; -use taler_common::subject::SubjectParse; +use taler_common::subject::parse_incoming_unstructured; fn parser(c: &mut Criterion) { const CHARS: &[u8] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789::: \t\t\t\t\n\n\n\n----++++"; let mut rng = fastrand::Rng::with_seed(42); - let real = [ + let real_simple = [ + "Taler TEGY6d9mh9pgwvwpgs0z0095z854xegfy7jj202yd0esp8p0za60", + "00Q979QSMJ29S7BJT3DDAVC5A0DR5Z05B7N0QT1RCBQ8FXJPZ6RG", + "Taler NDDCAM9XN4HJZFTBD8V6FNE2FJE8GY734PJ5AGQMY06C8D4HB3Z0", + "Taler KYC:TEGY6d9mh9pgwvwpgs0z0095z854xegfy7jj202yd0esp8p0za60", + "KYC:00Q979QSMJ29S7BJT3DDAVC5A0DR5Z05B7N0QT1RCBQ8FXJPZ6RG", + "Taler KYC:NDDCAM9XN4HJZFTBD8V6FNE2FJE8GY734PJ5AGQMY06C8D4HB3Z0", + ]; + let real_splitted = [ "Taler TEGY6d9mh9pgwvwpgs0z0095z854xegfy7j j202yd0esp8p0za60", "00Q979QSMJ29S7BJT3DDAVC5A0DR5Z05B7N 0QT1RCBQ8FXJPZ6RG", "Taler NDDCAM9XN4HJZFTBD8V6FNE2FJE8G Y734PJ5AGQMY06C8D4HB3Z0", + "Taler KYC:TEGY6d9mh9pgwvwpgs0z0095z854xegfy7j j202yd0esp8p0za60", + "KYC: 00Q979QSMJ29S7BJT3DDAVC5A0DR5Z05B7N 0QT1RCBQ8FXJPZ6RG", + "Taler KIC:NDDCAM9XN4HJZFTBD8V6FNE2FJE8G Y734PJ5AGQMY06C8D4HB3Z0", ]; let randoms: Vec<String> = (0..30) .map(|_| { @@ -49,31 +60,24 @@ fn parser(c: &mut Criterion) { }) .collect(); - let parser = SubjectParse::new(); - - c.bench_function("real", |b| { - b.iter(|| { - for case in real { - parser.parse_incoming_unstructured(black_box(case)); - } - }) - }); + fn bench<A: AsRef<str>>( + c: &mut Criterion, + name: &str, + cases: impl IntoIterator<Item = A> + Copy, + ) { + c.bench_function(name, |b| { + b.iter(|| { + for case in cases { + parse_incoming_unstructured(black_box(case.as_ref())); + } + }) + }); + } - c.bench_function("rng", |b| { - b.iter(|| { - for case in &randoms { - parser.parse_incoming_unstructured(black_box(case)); - } - }) - }); - - c.bench_function("chunks", |b| { - b.iter(|| { - for case in &chunks { - parser.parse_incoming_unstructured(black_box(case)); - } - }) - }); + bench(c, "real_simple", real_simple); + bench(c, "real_splitted", real_splitted); + bench(c, "rng", &randoms); + bench(c, "chunks", &chunks); } criterion_group!(benches, parser); diff --git a/taler-common/src/subject.rs b/taler-common/src/subject.rs @@ -16,11 +16,9 @@ use std::{fmt::Debug, str::FromStr}; -use regex::Regex; +use crate::{api_common::EddsaPublicKey, types::base32::CROCKFORD_ALPHABET}; -use crate::api_common::EddsaPublicKey; - -#[derive(Debug, PartialEq, Eq)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum IncomingType { Reserve, Kyc, @@ -30,123 +28,178 @@ pub enum IncomingType { #[derive(Debug, PartialEq, Eq)] pub struct IncomingSubject(IncomingType, EddsaPublicKey); -#[derive(Debug, PartialEq, Eq)] -pub enum IncomingSubjectResult { - Success(IncomingSubject), - Ambiguous, +/** Base32 quality by proximity to spec and error probability */ +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +enum Base32Quality { + /// Both mixed casing and mixed characters, thats weird + Mixed, + /// Standard but use lowercase, maybe the client shown lowercase in the UI + Standard, + /// Uppercase but mixed characters, its common when making typos + Upper, + /// Both uppercase and use the standard alphabet as it should + UpperStandard, } -pub struct SubjectParse { - part_regex: Regex, - key_regex: Regex, -} - -impl SubjectParse { - pub fn new() -> Self { - Self { - part_regex: Regex::new("[:a-z0-9A-Z]*").expect("part_pattern regex"), - key_regex: Regex::new("(KYC:)?([a-z0-9A-Z]{52})").expect("key_regex regex"), +impl Base32Quality { + pub fn measure(s: &str) -> Self { + let mut uppercase = true; + let mut standard = true; + for b in s.bytes() { + uppercase &= b.is_ascii_uppercase(); + standard &= CROCKFORD_ALPHABET.contains(&b) + } + match (uppercase, standard) { + (true, true) => Base32Quality::UpperStandard, + (true, false) => Base32Quality::Upper, + (false, true) => Base32Quality::Standard, + (false, false) => Base32Quality::Mixed, } } +} - /** - * Extract the public key from an unstructured incoming transfer subject - * - * We first try to find a whole key. If none are found we try to reconstruct - * one from parts. - **/ - pub fn parse_incoming_unstructured(&self, subject: &str) -> Option<IncomingSubjectResult> { - /** Parse an incoming subject */ - fn parse_single(str: &str) -> Option<IncomingSubject> { - // Check key type - let (kind, key) = if let Some(key) = str.strip_prefix("KYC:") { - (IncomingType::Kyc, key) - } else { - (IncomingType::Reserve, str) - }; +#[derive(Debug)] +pub struct Candidate { + subject: IncomingSubject, + quality: Base32Quality, +} - // Check key validity - let key = EddsaPublicKey::from_str(key).ok()?; - if ed25519_dalek::VerifyingKey::from_bytes(&key).is_err() { - return None; - } +#[derive(Debug, PartialEq, Eq)] +pub enum IncomingSubjectResult { + Success(IncomingSubject), + Ambiguous, +} + +/** + * Extract the public key from an unstructured incoming transfer subject. + * + * When a user enters the transfer object in an unstructured way, for ex in + * their banking UI, they may mistakenly enter separators such as ' \n-+' and + * make typos. + * To parse them while ignoring user errors, we reconstruct valid keys from key + * parts, resolving ambiguities where possible. + **/ +pub fn parse_incoming_unstructured(subject: &str) -> Option<IncomingSubjectResult> { + // We expect subject to be less than 65KB + assert!(subject.len() <= u16::MAX as usize); + /** Parse an incoming subject */ + fn parse_single(str: &str) -> Option<Candidate> { + // Check key type + let (kind, raw) = if let Some(key) = str.strip_prefix("KYC:") { + (IncomingType::Kyc, key) + } else { + (IncomingType::Reserve, str) + }; - Some(IncomingSubject(kind, key)) + // Check key validity + let key = EddsaPublicKey::from_str(raw).ok()?; + if ed25519_dalek::VerifyingKey::from_bytes(&key).is_err() { + return None; } - /** Parse a unique incoming subject */ - fn parse_unique( - mut candidates: impl Iterator<Item = IncomingSubject>, - ) -> Option<IncomingSubjectResult> { - // Check none - let first = candidates.next()?; + let quality = Base32Quality::measure(raw); + Some(Candidate { + subject: IncomingSubject(kind, key), + quality, + }) + } - // Check many - if candidates.next().is_some() { - return Some(IncomingSubjectResult::Ambiguous); + // Find and concatenate valid parts of a keys + let (parts, concatenated) = { + let mut parts = Vec::new(); + let mut concatenated = String::new(); + let mut part = None; + for (i, b) in subject.as_bytes().iter().enumerate() { + if matches!(b, b'0'..=b':' | b'A'..=b'Z' | b'a'..=b'z') { + if part.is_none() { + part = Some(i); + } + } else if let Some(from) = part { + let start = concatenated.len() as u16; + concatenated.push_str(&subject[from..i]); + let end = concatenated.len() as u16; + parts.push(start..end); + part = None; } - Some(IncomingSubjectResult::Success(first)) } - - // Check whole key - if let Some(res) = parse_unique( - self.key_regex - .find_iter(subject) - .filter_map(|it| parse_single(it.as_str())), - ) { - return Some(res); + if let Some(from) = part { + let start = concatenated.len() as u16; + concatenated.push_str(&subject[from..]); + let end = concatenated.len() as u16; + parts.push(start..end); } + (parts, concatenated) + }; - // Check from parts - let parts: Vec<_> = self - .part_regex - .find_iter(subject) - .map(|it| it.as_str()) - .collect(); - // We start by assembling big chunks first as we need to detect keys with prefix before other ones - let mut concatenated = String::with_capacity(56); - for window_size in (2..=parts.len()).rev() { - if let Some(res) = parse_unique(parts.windows(window_size).filter_map(|parts| { - // Fast skipping path - let total_len: usize = parts.iter().map(|it| it.len()).sum(); - if total_len != 52 && total_len != 56 { - // 52 for key and 56 for 'KIC:' prefix + key - return None; - } - - // Reuse a single allocation - concatenated.clear(); - for part in parts { - concatenated.push_str(part); + // Find best candidates + let mut best: Option<Candidate> = None; + // For each part as a starting point + for (i, start) in parts.iter().enumerate() { + // Use progressively longer concatenation + for end in parts[i..].iter() { + let range = start.start..end.end; + // Until they are to long to be a key + if range.len() > 56 { + break; + } + // If the slice is the right size for a key (56B with prefix else 54B) + if range.len() == 52 || range.len() == 56 { + // Parse the concatenated parts + let slice = + unsafe { &concatenated.get_unchecked(start.start as usize..end.end as usize) }; + if let Some(other) = parse_single(slice) { + // On success update best candidate + match &mut best { + Some(best) => { + if other.quality > best.quality // We prefer high quality keys + || matches!( // We prefer prefixed keys over reserve keys + (&best.subject.0, &other.subject.0), + (IncomingType::Reserve, IncomingType::Kyc | IncomingType::Wad) + ) + { + *best = other + } else if best.subject.1 != other.subject.1 // If keys are different + && best.quality == other.quality // Of same quality + && !matches!( // And prefixing is diferent + (&best.subject.0, &other.subject.0), + (IncomingType::Kyc | IncomingType::Wad, IncomingType::Reserve) + ) + { + return Some(IncomingSubjectResult::Ambiguous); + } + } + None => best = Some(other), + } } - parse_single(&concatenated) - })) { - return Some(res); } } - - None } + + best.map(|it| IncomingSubjectResult::Success(it.subject)) } #[test] /** Test parsing logic */ fn parse() { - let parser = SubjectParse::new(); + let key = "4MZT6RS3RVB3B0E2RDMYW0YRA3Y0VPHYV0CYDE6XBB0YMPFXCEG0"; + let other = "00Q979QSMJ29S7BJT3DDAVC5A0DR5Z05B7N0QT1RCBQ8FXJPZ6RG"; + + // Common checkes for (ty, prefix) in [(IncomingType::Reserve, ""), (IncomingType::Kyc, "KYC:")] { - let key = IncomingSubject( - ty, - EddsaPublicKey::from_str("4MZT6RS3RVB3B0E2RDMYW0YRA3Y0VPHYV0CYDE6XBB0YMPFXCEG0") - .unwrap(), - ); - let result = Some(IncomingSubjectResult::Success(key)); - let upper = &format!("{prefix}4MZT6RS3RVB3B0E2RDMYW0YRA3Y0VPHYV0CYDE6XBB0YMPFXCEG0"); - let (upper_l, upper_r) = upper.split_at(upper.len() / 2); + let standard = &format!("{prefix}{key}"); + let (upper_l, upper_r) = standard.split_at(standard.len() / 2); let mixed = &format!("{prefix}4mzt6RS3rvb3b0e2rdmyw0yra3y0vphyv0cyde6xbb0ympfxceg0"); let (mixed_l, mixed_r) = mixed.split_at(mixed.len() / 2); + let other_standard = &format!("{prefix}{other}"); + let other_mixed = &format!("{prefix}TEGY6d9mh9pgwvwpgs0z0095z854xegfy7jj202yd0esp8p0za60"); + + let result = Some(IncomingSubjectResult::Success(IncomingSubject( + ty, + EddsaPublicKey::from_str(key).unwrap(), + ))); // Check succeed if upper or mixed - for case in [upper, mixed] { + for case in [standard, mixed] { for test in [ format!("noise {case} noise"), format!("{case} noise to the right"), @@ -155,7 +208,7 @@ fn parse() { format!("noise\n{case}\nnoise"), format!("Test+{case}"), ] { - assert_eq!(parser.parse_incoming_unstructured(&test), result); + assert_eq!(parse_incoming_unstructured(&test), result); } } @@ -173,42 +226,65 @@ fn parse() { format!("left {l} \n {r} right"), format!("left {l} - + \n {r} right"), ] { - assert_eq!(parser.parse_incoming_unstructured(&case), result); + assert_eq!(parse_incoming_unstructured(&case), result); } } // Check concat parts - for chunk_size in 1..upper.len() { - let chunked: String = upper + for chunk_size in 1..standard.len() { + let chunked: String = standard .as_bytes() .chunks(chunk_size) .flat_map(|c| [std::str::from_utf8(c).unwrap(), " "]) .collect(); - for case in &[chunked.clone(), format!("left {chunked} right")] { - assert_eq!(parser.parse_incoming_unstructured(&case), result); + for case in [chunked.clone(), format!("left {chunked} right")] { + assert_eq!(parse_incoming_unstructured(&case), result); } } - // Check failure when multiple keys match + // Check failed when multiple key for case in [ - format!("{upper} {upper}"), - format!("{mixed} {mixed}"), - format!("{mixed} {upper}"), - format!("{mixed_l}-{mixed_r} {upper_l}-{upper_r}"), + format!("{standard} {other_standard}"), + format!("{mixed} {other_mixed}"), ] { assert_eq!( - parser.parse_incoming_unstructured(&case), + parse_incoming_unstructured(&case), Some(IncomingSubjectResult::Ambiguous) ); } + // Check accept redundant key + for case in [ + format!("{standard} {standard} {mixed} {mixed}"), // Accept redundant key + format!("{standard} {other_mixed}"), // Prefer high quality + ] { + assert_eq!(parse_incoming_unstructured(&case), result); + } + + // Check prefer prefixed over simple + for case in [format!("{mixed_l}-{mixed_r} {upper_l}-{upper_r}")] { + let res = parse_incoming_unstructured(&case); + if ty == IncomingType::Reserve { + assert_eq!(res, Some(IncomingSubjectResult::Ambiguous)); + } else { + assert_eq!(res, result); + } + } + // Check failure if malformed or missing for case in [ - "does not contain any reserve", // Check fail if none - &upper[0..upper.len() - 1], // Check fail if missing char + "does not contain any reserve", // Check fail if none + &standard[0..standard.len() - 1], // Check fail if missing char "2MZT6RS3RVB3B0E2RDMYW0YRA3Y0VPHYV0CYDE6XBB0YMPFXCEG0", // Check fail if not a valid key ] { - assert_eq!(parser.parse_incoming_unstructured(&case), None); + assert_eq!(parse_incoming_unstructured(&case), None); + } + + if ty == IncomingType::Kyc { + // Prefer prefixed over unprefixed + for case in [format!("{other} {standard}"), format!("{other} {mixed}")] { + assert_eq!(parse_incoming_unstructured(&case), result); + } } } } @@ -216,8 +292,6 @@ fn parse() { #[test] /** Test parsing logic using real cases */ fn real() { - let parser = SubjectParse::new(); - // Good case for (subject, key) in [ ( @@ -238,7 +312,7 @@ fn real() { IncomingType::Reserve, EddsaPublicKey::from_str(key).unwrap(), ))), - parser.parse_incoming_unstructured(subject) + parse_incoming_unstructured(subject) ) } } diff --git a/taler-common/src/types.rs b/taler-common/src/types.rs @@ -15,9 +15,9 @@ */ pub mod amount; +pub mod base32; pub mod payto; pub mod timestamp; -pub mod base32; use url::Url;