commit b94ab22e7b48693d804a00c11166ab19a6860dd9
parent 746650e94729c23fe3765a3f561525cfb35d59c4
Author: Antoine A <>
Date: Tue, 24 Dec 2024 12:44:23 +0100
taler-common: optimize base32 and subject parsing
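Replace the external base32 crate with a hand-rolled Crockford codec
(table-driven decoding with a 16-bit bit buffer, encoding with a 32-bit
bit buffer) and simplify subject parsing: drop the redundant whole_regex
in favour of parse_single and reuse one String allocation when candidate
keys are reassembled from subject parts.

A minimal round-trip sketch of the new codec (not part of this change;
the module path, Base32's Deref target of [u8; 32] and a Debug impl on
Base32Error are assumptions read off the diff below):

    use taler_common::api_common::{base32, Base32};

    #[test]
    fn crockford_round_trip() {
        let key = [0x2Au8; 32];
        let encoded = base32(&key); // 52 Crockford characters for 32 bytes
        let decoded: Base32<32> = encoded.parse().expect("round trip");
        assert_eq!(*decoded, key); // Deref exposes the raw bytes
        // Decoding is case-insensitive and accepts the aliases o -> 0 and i/l -> 1.
    }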
Diffstat:
5 files changed, 91 insertions(+), 30 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
@@ -220,12 +220,6 @@ dependencies = [
]
[[package]]
-name = "base32"
-version = "0.5.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "022dfe9eb35f19ebbcb51e0b40a5ab759f46ad60cadf7297e0bd085afb50e076"
-
-[[package]]
name = "base64"
version = "0.22.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -2415,7 +2409,6 @@ dependencies = [
name = "taler-common"
version = "0.1.0"
dependencies = [
- "base32",
"criterion",
"ed25519-dalek",
"fastrand",
diff --git a/taler-common/Cargo.toml b/taler-common/Cargo.toml
@@ -4,7 +4,6 @@ version = "0.1.0"
edition = "2021"
[dependencies]
-base32 = "0.5.1"
serde_with = "3.11.0"
rand = "0.8"
fastrand = "2.2.0"
diff --git a/taler-common/src/api_common.rs b/taler-common/src/api_common.rs
@@ -136,20 +136,88 @@ impl<const L: usize> Deref for Base32<L> {
}
}
+#[rustfmt::skip]
+/*
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, :, ;, <, =, >, ?, @, A, B, C,
+ D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V, W,
+ X, Y, Z, [, \, ], ^, _, `, a, b, c, d, e, f, g, h, i, j, k,
+ l, m, n, o, p, q, r, s, t, u, v, w, x, y, z,
+*/
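+// Inverse Crockford lookup indexed by (ASCII byte - b'0'); -1 marks an invalid character.
+// Per Crockford's alphabet, O/o decode to 0 and I/i/L/l decode to 1, while U/u is rejected.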
+const CROCKFORD_INV: [i8; 75] = [
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, 10, 11, 12,
+ 13, 14, 15, 16, 17, 1, 18, 19, 1, 20, 21, 0, 22, 23, 24, 25, 26, -1, 27, 28,
+ 29, 30, 31, -1, -1, -1, -1, -1, -1, 10, 11, 12, 13, 14, 15, 16, 17, 1, 18, 19,
+ 1, 20, 21, 0, 22, 23, 24, 25, 26, -1, 27, 28, 29, 30, 31,
+];
+
impl<const L: usize> FromStr for Base32<L> {
type Err = Base32Error<L>;
fn from_str(s: &str) -> Result<Self, Self::Err> {
- let bytes = base32::decode(base32::Alphabet::Crockford, s).ok_or(Base32Error::Format)?;
- let exact: [u8; L] = bytes
- .try_into()
- .map_err(|vec: Vec<u8>| Base32Error::Length(vec.len()))?;
- Ok(Self(exact))
+ let encoded = s.as_bytes();
+
+ // Check decode length
+ let output_length = encoded.len() * 5 / 8;
+ if output_length != L {
+ return Err(Base32Error::Length(output_length));
+ }
+
+ let mut decoded = [0u8; L];
+ let mut bits = 0;
+ let mut buf = 0u16;
+ let mut cursor = 0usize;
+
+ for b in encoded {
+ // Read input
+ match CROCKFORD_INV.get(b.wrapping_sub(b'0') as usize).copied() {
+ Some(-1) | None => return Err(Base32Error::Format),
+ Some(lookup) => {
+ // Add 5 bits
+ buf = (buf << 5) | (lookup as u16);
+ bits += 5;
+
+ // Write byte if full
+ if bits >= 8 {
+ bits -= 8;
+ unsafe {
+                            // SAFETY: the length check above guarantees this loop writes exactly L bytes, so cursor < L here
+ *decoded.get_unchecked_mut(cursor) = (buf >> bits) as u8;
+ }
+ cursor += 1;
+ }
+ }
+ }
+ }
+ Ok(Self(decoded))
}
}
pub fn base32(bytes: &[u8]) -> String {
- base32::encode(base32::Alphabet::Crockford, bytes)
+ const CROCKFORD_ALPHABET: &[u8] = b"0123456789ABCDEFGHJKMNPQRSTVWXYZ";
+    let mut encoded = Vec::with_capacity((bytes.len() * 8 + 4) / 5); // ceil(len * 8 / 5) output characters
+ let mut bits = 0;
+ let mut buf = 0u32;
+
+ for &byte in bytes {
+ // Add a byte
+ buf = (buf << 8) | byte as u32;
+ bits += 8;
+
+ // Write 5 bits
+ while bits >= 5 {
+ bits -= 5;
+ let index = (buf >> bits) & 0b11111; // Extract top 5 bits
+ encoded.push(CROCKFORD_ALPHABET[index as usize]);
+ }
+ }
+
+ if bits > 0 {
+ // Handle remaining bits (padding)
+ let index = (buf << (5 - bits)) & 0b11111; // Shift to fill 5 bits
+ encoded.push(CROCKFORD_ALPHABET[index as usize]);
+ }
+
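+    // SAFETY: every pushed byte comes from CROCKFORD_ALPHABET, which is pure ASCII, hence valid UTF-8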
+ unsafe { String::from_utf8_unchecked(encoded) }
}
impl<const L: usize> Display for Base32<L> {
diff --git a/taler-common/src/lib.rs b/taler-common/src/lib.rs
@@ -18,8 +18,8 @@ pub mod api_common;
pub mod api_params;
pub mod api_wire;
pub mod error_code;
-pub mod types;
pub mod subject;
+pub mod types;
pub mod config {
// TODO
}
diff --git a/taler-common/src/subject.rs b/taler-common/src/subject.rs
@@ -39,7 +39,6 @@ pub enum IncomingSubjectResult {
pub struct SubjectParse {
part_regex: Regex,
key_regex: Regex,
- whole_regex: Regex,
}
impl SubjectParse {
@@ -47,7 +46,6 @@ impl SubjectParse {
Self {
part_regex: Regex::new("[:a-z0-9A-Z]*").expect("part_pattern regex"),
key_regex: Regex::new("(KYC:)?([a-z0-9A-Z]{52})").expect("key_regex regex"),
- whole_regex: Regex::new("^(KYC:)?([a-z0-9A-Z]{52})$").expect("whole_pattern regex"),
}
}
@@ -58,7 +56,7 @@ impl SubjectParse {
* one from parts.
**/
pub fn parse_incoming_unstructured(&self, subject: &str) -> Option<IncomingSubjectResult> {
- /** Parse an incoming subject */
+ /** Parse an incoming subject */
fn parse_single(str: &str) -> Option<IncomingSubject> {
// Check key type
let (kind, key) = if let Some(key) = str.strip_prefix("KYC:") {
@@ -77,22 +75,25 @@ impl SubjectParse {
}
/** Parse a unique incoming subject */
- fn parse_unique<A: AsRef<str> + Debug>(
- candidates: impl Iterator<Item = A>,
+ fn parse_unique(
+ mut candidates: impl Iterator<Item = IncomingSubject>,
) -> Option<IncomingSubjectResult> {
- let mut keys = candidates.filter_map(|s| parse_single(s.as_ref()));
// Check none
- let first = keys.next()?;
+ let first = candidates.next()?;
// Check many
- if keys.next().is_some() {
+ if candidates.next().is_some() {
return Some(IncomingSubjectResult::Ambiguous);
}
Some(IncomingSubjectResult::Success(first))
}
// Check whole key
- if let Some(res) = parse_unique(self.key_regex.find_iter(subject).map(|it| it.as_str())) {
+ if let Some(res) = parse_unique(
+ self.key_regex
+ .find_iter(subject)
+ .filter_map(|it| parse_single(it.as_str())),
+ ) {
return Some(res);
}
@@ -103,6 +104,7 @@ impl SubjectParse {
.map(|it| it.as_str())
.collect();
// We assemble the biggest chunks first, as prefixed keys must be detected before bare ones
+            let mut concatenated = String::with_capacity(56); // optional "KYC:" prefix + 52-character key
for window_size in (2..=parts.len()).rev() {
if let Some(res) = parse_unique(parts.windows(window_size).filter_map(|parts| {
// Fast skipping path
@@ -112,13 +114,12 @@ impl SubjectParse {
return None;
}
- // TODO lending iterator to reduce allocations
- let concatenated: String = parts.iter().copied().collect();
- if self.whole_regex.is_match(&concatenated) {
- Some(concatenated)
- } else {
- None
+ // Reuse a single allocation
+ concatenated.clear();
+ for part in parts {
+ concatenated.push_str(part);
}
+ parse_single(&concatenated)
})) {
return Some(res);
}