commit 2fb89d3901e0177c1e1130c6306b4a9a22a6f2ec
parent bb173f9b41774547e2ea7b37b9b0c9f3dbb1aa34
Author: Antoine A <>
Date: Wed, 25 Dec 2024 02:52:18 +0100
taler-common: improve and optimize subject parsing
Diffstat:
5 files changed, 216 insertions(+), 140 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
@@ -2414,7 +2414,6 @@ dependencies = [
"fastrand",
"jiff",
"rand",
- "regex",
"serde",
"serde_json",
"serde_urlencoded",
diff --git a/taler-common/Cargo.toml b/taler-common/Cargo.toml
@@ -9,7 +9,6 @@ rand = "0.8"
fastrand = "2.2.0"
serde_urlencoded = "0.7"
jiff = { version = "0.1", default-features = false, features = ["std"] }
-regex = { version = "1.11", default-features = false, features = ["perf"] }
ed25519-dalek = { version = "2.1.1", default-features = false }
serde = { workspace = true, features = ["derive"] }
serde_json = { workspace = true, features = ["raw_value"] }
diff --git a/taler-common/benches/subject.rs b/taler-common/benches/subject.rs
@@ -15,15 +15,26 @@
*/
use criterion::{black_box, criterion_group, criterion_main, Criterion};
-use taler_common::subject::SubjectParse;
+use taler_common::subject::parse_incoming_unstructured;
fn parser(c: &mut Criterion) {
const CHARS: &[u8] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789::: \t\t\t\t\n\n\n\n----++++";
let mut rng = fastrand::Rng::with_seed(42);
- let real = [
+ let real_simple = [
+ "Taler TEGY6d9mh9pgwvwpgs0z0095z854xegfy7jj202yd0esp8p0za60",
+ "00Q979QSMJ29S7BJT3DDAVC5A0DR5Z05B7N0QT1RCBQ8FXJPZ6RG",
+ "Taler NDDCAM9XN4HJZFTBD8V6FNE2FJE8GY734PJ5AGQMY06C8D4HB3Z0",
+ "Taler KYC:TEGY6d9mh9pgwvwpgs0z0095z854xegfy7jj202yd0esp8p0za60",
+ "KYC:00Q979QSMJ29S7BJT3DDAVC5A0DR5Z05B7N0QT1RCBQ8FXJPZ6RG",
+ "Taler KYC:NDDCAM9XN4HJZFTBD8V6FNE2FJE8GY734PJ5AGQMY06C8D4HB3Z0",
+ ];
+ let real_splitted = [
"Taler TEGY6d9mh9pgwvwpgs0z0095z854xegfy7j j202yd0esp8p0za60",
"00Q979QSMJ29S7BJT3DDAVC5A0DR5Z05B7N 0QT1RCBQ8FXJPZ6RG",
"Taler NDDCAM9XN4HJZFTBD8V6FNE2FJE8G Y734PJ5AGQMY06C8D4HB3Z0",
+ "Taler KYC:TEGY6d9mh9pgwvwpgs0z0095z854xegfy7j j202yd0esp8p0za60",
+ "KYC: 00Q979QSMJ29S7BJT3DDAVC5A0DR5Z05B7N 0QT1RCBQ8FXJPZ6RG",
+ "Taler KIC:NDDCAM9XN4HJZFTBD8V6FNE2FJE8G Y734PJ5AGQMY06C8D4HB3Z0",
];
let randoms: Vec<String> = (0..30)
.map(|_| {
@@ -49,31 +60,24 @@ fn parser(c: &mut Criterion) {
})
.collect();
- let parser = SubjectParse::new();
-
- c.bench_function("real", |b| {
- b.iter(|| {
- for case in real {
- parser.parse_incoming_unstructured(black_box(case));
- }
- })
- });
+ fn bench<A: AsRef<str>>(
+ c: &mut Criterion,
+ name: &str,
+ cases: impl IntoIterator<Item = A> + Copy,
+ ) {
+ c.bench_function(name, |b| {
+ b.iter(|| {
+ for case in cases {
+ parse_incoming_unstructured(black_box(case.as_ref()));
+ }
+ })
+ });
+ }
- c.bench_function("rng", |b| {
- b.iter(|| {
- for case in &randoms {
- parser.parse_incoming_unstructured(black_box(case));
- }
- })
- });
-
- c.bench_function("chunks", |b| {
- b.iter(|| {
- for case in &chunks {
- parser.parse_incoming_unstructured(black_box(case));
- }
- })
- });
+ bench(c, "real_simple", real_simple);
+ bench(c, "real_splitted", real_splitted);
+ bench(c, "rng", &randoms);
+ bench(c, "chunks", &chunks);
}
criterion_group!(benches, parser);
diff --git a/taler-common/src/subject.rs b/taler-common/src/subject.rs
@@ -16,11 +16,9 @@
use std::{fmt::Debug, str::FromStr};
-use regex::Regex;
+use crate::{api_common::EddsaPublicKey, types::base32::CROCKFORD_ALPHABET};
-use crate::api_common::EddsaPublicKey;
-
-#[derive(Debug, PartialEq, Eq)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum IncomingType {
Reserve,
Kyc,
@@ -30,123 +28,178 @@ pub enum IncomingType {
#[derive(Debug, PartialEq, Eq)]
pub struct IncomingSubject(IncomingType, EddsaPublicKey);
-#[derive(Debug, PartialEq, Eq)]
-pub enum IncomingSubjectResult {
- Success(IncomingSubject),
- Ambiguous,
+/** Base32 quality, ranked by proximity to the spec and by likelihood of user error */
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
+enum Base32Quality {
+ /// Both mixed casing and mixed characters, that's weird
+ Mixed,
+ /// Standard alphabet but lowercase, maybe the client showed it in lowercase in the UI
+ Standard,
+ /// Uppercase but with characters outside the alphabet, as is common with typos
+ Upper,
+ /// Both uppercase and the standard alphabet, as it should be
+ UpperStandard,
}
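+// The derived `Ord` follows declaration order, so Mixed < Standard < Upper <
+// UpperStandard; candidate selection relies on this to prefer the most
+// spec-compliant key when several are found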
-pub struct SubjectParse {
- part_regex: Regex,
- key_regex: Regex,
-}
-
-impl SubjectParse {
- pub fn new() -> Self {
- Self {
- part_regex: Regex::new("[:a-z0-9A-Z]*").expect("part_pattern regex"),
- key_regex: Regex::new("(KYC:)?([a-z0-9A-Z]{52})").expect("key_regex regex"),
+impl Base32Quality {
+ pub fn measure(s: &str) -> Self {
+ let mut uppercase = true;
+ let mut standard = true;
+ for b in s.bytes() {
+ uppercase &= b.is_ascii_uppercase();
+ standard &= CROCKFORD_ALPHABET.contains(&b)
+ }
+ match (uppercase, standard) {
+ (true, true) => Base32Quality::UpperStandard,
+ (true, false) => Base32Quality::Upper,
+ (false, true) => Base32Quality::Standard,
+ (false, false) => Base32Quality::Mixed,
}
}
+}
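+// Note: any lowercase byte clears `uppercase`, so lowercase input rates at best
+// Standard; assuming CROCKFORD_ALPHABET holds the uppercase Crockford characters,
+// e.g. measure("ABC123") yields UpperStandard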
- /**
- * Extract the public key from an unstructured incoming transfer subject
- *
- * We first try to find a whole key. If none are found we try to reconstruct
- * one from parts.
- **/
- pub fn parse_incoming_unstructured(&self, subject: &str) -> Option<IncomingSubjectResult> {
- /** Parse an incoming subject */
- fn parse_single(str: &str) -> Option<IncomingSubject> {
- // Check key type
- let (kind, key) = if let Some(key) = str.strip_prefix("KYC:") {
- (IncomingType::Kyc, key)
- } else {
- (IncomingType::Reserve, str)
- };
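+// A parsed subject paired with the base32 quality of its textual form, used to
+// rank competing matches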
+#[derive(Debug)]
+pub struct Candidate {
+ subject: IncomingSubject,
+ quality: Base32Quality,
+}
- // Check key validity
- let key = EddsaPublicKey::from_str(key).ok()?;
- if ed25519_dalek::VerifyingKey::from_bytes(&key).is_err() {
- return None;
- }
+#[derive(Debug, PartialEq, Eq)]
+pub enum IncomingSubjectResult {
+ Success(IncomingSubject),
+ Ambiguous,
+}
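+// Success carries the single best candidate; Ambiguous means two equally
+// plausible keys were found; a subject with no valid key yields None instead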
+
+/**
+ * Extract the public key from an unstructured incoming transfer subject.
+ *
+ * When a user enters the transfer subject by hand, for example in their
+ * banking UI, they may mistakenly insert separators such as ' \n-+' and
+ * make typos.
+ * To parse such subjects while tolerating user errors, we reconstruct valid
+ * keys from key parts, resolving ambiguities where possible.
+ **/
+pub fn parse_incoming_unstructured(subject: &str) -> Option<IncomingSubjectResult> {
+ // We expect the subject to be at most 65535 bytes so that part offsets fit in u16
+ assert!(subject.len() <= u16::MAX as usize);
+ /** Parse an incoming subject */
+ fn parse_single(str: &str) -> Option<Candidate> {
+ // Check key type
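+ // (e.g. "KYC:<key>" yields an IncomingType::Kyc candidate, a bare "<key>" an IncomingType::Reserve one)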
+ let (kind, raw) = if let Some(key) = str.strip_prefix("KYC:") {
+ (IncomingType::Kyc, key)
+ } else {
+ (IncomingType::Reserve, str)
+ };
- Some(IncomingSubject(kind, key))
+ // Check key validity
+ let key = EddsaPublicKey::from_str(raw).ok()?;
+ if ed25519_dalek::VerifyingKey::from_bytes(&key).is_err() {
+ return None;
}
- /** Parse a unique incoming subject */
- fn parse_unique(
- mut candidates: impl Iterator<Item = IncomingSubject>,
- ) -> Option<IncomingSubjectResult> {
- // Check none
- let first = candidates.next()?;
+ let quality = Base32Quality::measure(raw);
+ Some(Candidate {
+ subject: IncomingSubject(kind, key),
+ quality,
+ })
+ }
- // Check many
- if candidates.next().is_some() {
- return Some(IncomingSubjectResult::Ambiguous);
+ // Find runs of key characters (parts) and concatenate them
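+ // (e.g. "Taler ABC-DEF" gives concatenated = "TalerABCDEF" and parts = [0..5, 5..8, 8..11])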
+ let (parts, concatenated) = {
+ let mut parts = Vec::new();
+ let mut concatenated = String::new();
+ let mut part = None;
+ for (i, b) in subject.as_bytes().iter().enumerate() {
+ if matches!(b, b'0'..=b':' | b'A'..=b'Z' | b'a'..=b'z') {
+ if part.is_none() {
+ part = Some(i);
+ }
+ } else if let Some(from) = part {
+ let start = concatenated.len() as u16;
+ concatenated.push_str(&subject[from..i]);
+ let end = concatenated.len() as u16;
+ parts.push(start..end);
+ part = None;
}
- Some(IncomingSubjectResult::Success(first))
}
-
- // Check whole key
- if let Some(res) = parse_unique(
- self.key_regex
- .find_iter(subject)
- .filter_map(|it| parse_single(it.as_str())),
- ) {
- return Some(res);
+ if let Some(from) = part {
+ let start = concatenated.len() as u16;
+ concatenated.push_str(&subject[from..]);
+ let end = concatenated.len() as u16;
+ parts.push(start..end);
}
+ (parts, concatenated)
+ };
- // Check from parts
- let parts: Vec<_> = self
- .part_regex
- .find_iter(subject)
- .map(|it| it.as_str())
- .collect();
- // We start by assembling big chunks first as we need to detect keys with prefix before other ones
- let mut concatenated = String::with_capacity(56);
- for window_size in (2..=parts.len()).rev() {
- if let Some(res) = parse_unique(parts.windows(window_size).filter_map(|parts| {
- // Fast skipping path
- let total_len: usize = parts.iter().map(|it| it.len()).sum();
- if total_len != 52 && total_len != 56 {
- // 52 for key and 56 for 'KIC:' prefix + key
- return None;
- }
-
- // Reuse a single allocation
- concatenated.clear();
- for part in parts {
- concatenated.push_str(part);
+ // Find best candidates
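+ // Selection rules: a higher-quality key replaces the current best, a prefixed
+ // (KYC/Wad) key replaces a bare reserve key, and two distinct keys of equal
+ // quality and precedence make the subject ambiguous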
+ let mut best: Option<Candidate> = None;
+ // For each part as a starting point
+ for (i, start) in parts.iter().enumerate() {
+ // Use progressively longer concatenation
+ for end in parts[i..].iter() {
+ let range = start.start..end.end;
+ // Until they are too long to be a key
+ if range.len() > 56 {
+ break;
+ }
+ // If the slice is the right size for a key (52B bare, 56B with the 'KYC:' prefix)
+ if range.len() == 52 || range.len() == 56 {
+ // Parse the concatenated parts
+ let slice =
+ unsafe { &concatenated.get_unchecked(start.start as usize..end.end as usize) };
+ if let Some(other) = parse_single(slice) {
+ // On success update best candidate
+ match &mut best {
+ Some(best) => {
+ if other.quality > best.quality // We prefer high quality keys
+ || matches!( // We prefer prefixed keys over reserve keys
+ (&best.subject.0, &other.subject.0),
+ (IncomingType::Reserve, IncomingType::Kyc | IncomingType::Wad)
+ )
+ {
+ *best = other
+ } else if best.subject.1 != other.subject.1 // If keys are different
+ && best.quality == other.quality // Of same quality
+ && !matches!( // And prefixing is different
+ (&best.subject.0, &other.subject.0),
+ (IncomingType::Kyc | IncomingType::Wad, IncomingType::Reserve)
+ )
+ {
+ return Some(IncomingSubjectResult::Ambiguous);
+ }
+ }
+ None => best = Some(other),
+ }
}
- parse_single(&concatenated)
- })) {
- return Some(res);
}
}
-
- None
}
+
+ best.map(|it| IncomingSubjectResult::Success(it.subject))
}
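+/* Illustrative usage sketch (names arbitrary), mirroring cases asserted in the
+ * tests below: a key split by separators is reassembled from its parts, while
+ * two distinct keys of equal quality are reported as ambiguous. */
+#[test]
+fn parse_usage_sketch() {
+ let key = "4MZT6RS3RVB3B0E2RDMYW0YRA3Y0VPHYV0CYDE6XBB0YMPFXCEG0";
+ let other = "00Q979QSMJ29S7BJT3DDAVC5A0DR5Z05B7N0QT1RCBQ8FXJPZ6RG";
+ // The two halves of a split key are concatenated back into one reserve key
+ assert!(matches!(
+ parse_incoming_unstructured(&format!("Taler {} {}", &key[..26], &key[26..])),
+ Some(IncomingSubjectResult::Success(_))
+ ));
+ // Two different keys of the same quality cannot be disambiguated
+ assert_eq!(
+ parse_incoming_unstructured(&format!("{key} {other}")),
+ Some(IncomingSubjectResult::Ambiguous)
+ );
+}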
#[test]
/** Test parsing logic */
fn parse() {
- let parser = SubjectParse::new();
+ let key = "4MZT6RS3RVB3B0E2RDMYW0YRA3Y0VPHYV0CYDE6XBB0YMPFXCEG0";
+ let other = "00Q979QSMJ29S7BJT3DDAVC5A0DR5Z05B7N0QT1RCBQ8FXJPZ6RG";
+
+ // Common checks
for (ty, prefix) in [(IncomingType::Reserve, ""), (IncomingType::Kyc, "KYC:")] {
- let key = IncomingSubject(
- ty,
- EddsaPublicKey::from_str("4MZT6RS3RVB3B0E2RDMYW0YRA3Y0VPHYV0CYDE6XBB0YMPFXCEG0")
- .unwrap(),
- );
- let result = Some(IncomingSubjectResult::Success(key));
- let upper = &format!("{prefix}4MZT6RS3RVB3B0E2RDMYW0YRA3Y0VPHYV0CYDE6XBB0YMPFXCEG0");
- let (upper_l, upper_r) = upper.split_at(upper.len() / 2);
+ let standard = &format!("{prefix}{key}");
+ let (upper_l, upper_r) = standard.split_at(standard.len() / 2);
let mixed = &format!("{prefix}4mzt6RS3rvb3b0e2rdmyw0yra3y0vphyv0cyde6xbb0ympfxceg0");
let (mixed_l, mixed_r) = mixed.split_at(mixed.len() / 2);
+ let other_standard = &format!("{prefix}{other}");
+ let other_mixed = &format!("{prefix}TEGY6d9mh9pgwvwpgs0z0095z854xegfy7jj202yd0esp8p0za60");
+
+ let result = Some(IncomingSubjectResult::Success(IncomingSubject(
+ ty,
+ EddsaPublicKey::from_str(key).unwrap(),
+ )));
// Check succeed if upper or mixed
- for case in [upper, mixed] {
+ for case in [standard, mixed] {
for test in [
format!("noise {case} noise"),
format!("{case} noise to the right"),
@@ -155,7 +208,7 @@ fn parse() {
format!("noise\n{case}\nnoise"),
format!("Test+{case}"),
] {
- assert_eq!(parser.parse_incoming_unstructured(&test), result);
+ assert_eq!(parse_incoming_unstructured(&test), result);
}
}
@@ -173,42 +226,65 @@ fn parse() {
format!("left {l} \n {r} right"),
format!("left {l} - + \n {r} right"),
] {
- assert_eq!(parser.parse_incoming_unstructured(&case), result);
+ assert_eq!(parse_incoming_unstructured(&case), result);
}
}
// Check concat parts
- for chunk_size in 1..upper.len() {
- let chunked: String = upper
+ for chunk_size in 1..standard.len() {
+ let chunked: String = standard
.as_bytes()
.chunks(chunk_size)
.flat_map(|c| [std::str::from_utf8(c).unwrap(), " "])
.collect();
- for case in &[chunked.clone(), format!("left {chunked} right")] {
- assert_eq!(parser.parse_incoming_unstructured(&case), result);
+ for case in [chunked.clone(), format!("left {chunked} right")] {
+ assert_eq!(parse_incoming_unstructured(&case), result);
}
}
- // Check failure when multiple keys match
+ // Check failure when multiple distinct keys are found
for case in [
- format!("{upper} {upper}"),
- format!("{mixed} {mixed}"),
- format!("{mixed} {upper}"),
- format!("{mixed_l}-{mixed_r} {upper_l}-{upper_r}"),
+ format!("{standard} {other_standard}"),
+ format!("{mixed} {other_mixed}"),
] {
assert_eq!(
- parser.parse_incoming_unstructured(&case),
+ parse_incoming_unstructured(&case),
Some(IncomingSubjectResult::Ambiguous)
);
}
+ // Check that redundant keys are accepted and higher quality is preferred
+ for case in [
+ format!("{standard} {standard} {mixed} {mixed}"), // Accept redundant key
+ format!("{standard} {other_mixed}"), // Prefer high quality
+ ] {
+ assert_eq!(parse_incoming_unstructured(&case), result);
+ }
+
+ // Check that prefixed keys are preferred over simple ones
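+ // (the same split subject is ambiguous as a bare reserve key but resolves once the KYC: prefix is present)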
+ for case in [format!("{mixed_l}-{mixed_r} {upper_l}-{upper_r}")] {
+ let res = parse_incoming_unstructured(&case);
+ if ty == IncomingType::Reserve {
+ assert_eq!(res, Some(IncomingSubjectResult::Ambiguous));
+ } else {
+ assert_eq!(res, result);
+ }
+ }
+
// Check failure if malformed or missing
for case in [
- "does not contain any reserve", // Check fail if none
- &upper[0..upper.len() - 1], // Check fail if missing char
+ "does not contain any reserve", // Check fail if none
+ &standard[0..standard.len() - 1], // Check fail if missing char
"2MZT6RS3RVB3B0E2RDMYW0YRA3Y0VPHYV0CYDE6XBB0YMPFXCEG0", // Check fail if not a valid key
] {
- assert_eq!(parser.parse_incoming_unstructured(&case), None);
+ assert_eq!(parse_incoming_unstructured(&case), None);
+ }
+
+ if ty == IncomingType::Kyc {
+ // Prefer prefixed over unprefixed
+ for case in [format!("{other} {standard}"), format!("{other} {mixed}")] {
+ assert_eq!(parse_incoming_unstructured(&case), result);
+ }
}
}
}
@@ -216,8 +292,6 @@ fn parse() {
#[test]
/** Test parsing logic using real cases */
fn real() {
- let parser = SubjectParse::new();
-
// Good case
for (subject, key) in [
(
@@ -238,7 +312,7 @@ fn real() {
IncomingType::Reserve,
EddsaPublicKey::from_str(key).unwrap(),
))),
- parser.parse_incoming_unstructured(subject)
+ parse_incoming_unstructured(subject)
)
}
}
diff --git a/taler-common/src/types.rs b/taler-common/src/types.rs
@@ -15,9 +15,9 @@
*/
pub mod amount;
+pub mod base32;
pub mod payto;
pub mod timestamp;
-pub mod base32;
use url::Url;