depolymerization

wire gateway for Bitcoin/Ethereum
Log | Files | Refs | Submodules | README | LICENSE

lib.rs (9239B)


      1 /*
      2   This file is part of TALER
      3   Copyright (C) 2022-2025 Taler Systems SA
      4 
      5   TALER is free software; you can redistribute it and/or modify it under the
      6   terms of the GNU Affero General Public License as published by the Free Software
      7   Foundation; either version 3, or (at your option) any later version.
      8 
      9   TALER is distributed in the hope that it will be useful, but WITHOUT ANY
     10   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
     11   A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more details.
     12 
     13   You should have received a copy of the GNU Affero General Public License along with
     14   TALER; see the file COPYING.  If not, see <http://www.gnu.org/licenses/>
     15 */
     16 //!
     17 //! uri-pack is an efficient binary format for URI
     18 //!
     19 //! ## Format
     20 //!
//! The most commonly used characters (a-z . / - %) are encoded using 5 bits,
//! the remaining ascii characters are encoded using 11 bits. If more than half
//! of the characters in a URI are encoded with 5 bits, the encoded size is
//! smaller than a simple ascii format.
     25 //!
//! On the majestic_million database, 98.77% of the domain names were smaller,
//! going from an average of 14b to an average of 10b.
     28 //!
     29 //! ## Usage
     30 //!
     31 //! ``` rust
     32 //! use uri_pack::{pack_uri, unpack_uri};
     33 //!
     34 //! let domain = "http://example.com/static_file/image.png";
     35 //! let encoded = pack_uri(domain).unwrap();
     36 //! let decoded = unpack_uri(&encoded).unwrap();
     37 //! assert_eq!(domain, decoded);
     38 //! ```
     39 //!
     40 
     41 /// Pack an URI ascii char
     42 /// Panic if char not supported
     43 fn pack_ascii(c: u8) -> u8 {
     44     [
     45         67, 68, 69, 70, 29, 71, 72, 73, 74, 75, 76, 77, 28, 26, 27, 57, 58, 59, 60, 61, 62, 63, 64,
     46         65, 66, 78, 79, 80, 81, 82, 83, 84, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
     47         45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 85, 86, 87, 88, 30, 89, 0, 1, 2, 3, 4, 5,
     48         6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 90, 91, 92, 93,
     49     ][(c - b'!') as usize]
     50 }
     51 
     52 /// Unpack an URI ascii char
     53 /// Panic if char not supported
     54 fn unpack_ascii(c: u8) -> u8 {
     55     [
     56         b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o',
     57         b'p', b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'.', b'/', b'-', b'%',
     58         b'_', b'A', b'B', b'C', b'D', b'E', b'F', b'G', b'H', b'I', b'J', b'K', b'L', b'M', b'N',
     59         b'O', b'P', b'Q', b'R', b'S', b'T', b'U', b'V', b'W', b'X', b'Y', b'Z', b'0', b'1', b'2',
     60         b'3', b'4', b'5', b'6', b'7', b'8', b'9', b'!', b'"', b'#', b'$', b'&', b'\'', b'(', b')',
     61         b'*', b'+', b',', b':', b';', b'<', b'=', b'>', b'?', b'@', b'[', b'\\', b']', b'^', b'`',
     62         b'{', b'|', b'}', b'~',
     63     ][c as usize]
     64 }
     65 
     66 /// Check if an ascii char is supported by the encoding
     67 fn supported_ascii(c: &u8) -> bool {
     68     (b'!'..=b'~').contains(c)
     69 }
     70 
/// Extended packing limit: packed codes below this value are written directly
/// on 5 bits, the other ones are written as this value on 5 bits followed by
/// `code - EXTENDED` on 6 bits
const EXTENDED: u8 = 30;
/// EOF u5 encoding: 5 bits value written after the last char to close the
/// encoded stream
const TERMINATOR: u8 = 31;
     75 
/// Error returned by [`pack_uri`]
#[derive(Debug, Clone, Copy, thiserror::Error)]
pub enum EncodeErr {
    /// The input holds this byte, which the encoding does not support
    #[error("{0} is not a valid uri char")]
    UnsupportedChar(u8),
}
     81 
/// Error returned by [`unpack_uri`]
// NOTE(review): `unpack_uri` as visible in this file only ever builds
// `UnexpectedEOF`; the other variants are presumably kept for API
// compatibility — confirm before removing them.
#[derive(Debug, Clone, Copy, thiserror::Error)]
pub enum DecodeErr {
    /// An extended code was found where a simple one was expected
    #[error("An extended encoded char have been passed as an simple one")]
    ExpectedExtended,
    /// The carried value is not a simple (5 bits) code
    #[error("{0} is not an simple encoded char")]
    UnexpectedSimpleChar(u8),
    /// The carried value is not an extended code
    #[error("{0} is not an extended encoded char")]
    UnexpectedExtendedChar(u8),
    /// The input ran out of bytes before the terminator was read
    #[error("Missing bits")]
    UnexpectedEOF,
}
     93 
     94 /// Pack an uri string into an optimized binary format
     95 pub fn pack_uri(uri: &str) -> Result<Vec<u8>, EncodeErr> {
     96     let len = uri.len();
     97     let mut vec = Vec::with_capacity(len);
     98 
     99     if let Some(char) = uri.as_bytes().iter().find(|c| !supported_ascii(c)) {
    100         return Err(EncodeErr::UnsupportedChar(*char));
    101     }
    102 
    103     // Holds [buff_bits] pending bits beginning from the most significant bits
    104     let (mut buff, mut buff_bits) = (0u8, 0u8);
    105 
    106     // Write [nb_bits] less significant bits from [nb] to [buff]
    107     let mut write_bits = |nb: u8, mut nb_bits: u8| {
    108         while nb_bits > 0 {
    109             // Amount of bits we can write in buffer
    110             let writable = (8 - buff_bits).min(nb_bits);
    111             // Remove non writable bits
    112             let rmv_right = nb >> (nb_bits - writable);
    113             let rmv_left = rmv_right << (8 - writable);
    114             // Align remaining bits with buff blank bits
    115             let align = rmv_left >> buff_bits;
    116 
    117             // Write bits in buffer
    118             buff |= align;
    119             buff_bits += writable;
    120             nb_bits -= writable;
    121 
    122             // Store buffer if full
    123             if buff_bits == 8 {
    124                 vec.push(buff);
    125                 buff = 0;
    126                 buff_bits = 0;
    127             }
    128         }
    129     };
    130 
    131     for c in uri.bytes() {
    132         let nb = pack_ascii(c);
    133         if nb < EXTENDED {
    134             write_bits(nb, 5)
    135         } else {
    136             write_bits(EXTENDED, 5);
    137             write_bits(nb - EXTENDED, 6);
    138         }
    139     }
    140     write_bits(TERMINATOR, 5);
    141 
    142     // Push pending buffer if not empty
    143     if buff_bits > 0 {
    144         vec.push(buff);
    145     }
    146 
    147     Ok(vec)
    148 }
    149 
    150 /// Unpack an uri string from its optimized binary format
    151 pub fn unpack_uri(bytes: &[u8]) -> Result<String, DecodeErr> {
    152     let mut buf = String::with_capacity(bytes.len());
    153     let mut iter = bytes.iter();
    154 
    155     // Holds [buff_bits] pending bits beginning from the most significant bits
    156     let (mut buff, mut buff_bits) = (0u8, 0u8);
    157 
    158     // Write [nb_bits] less significant bits from [buff] to [nb]
    159     let mut read_nb = |mut nb_bits: u8| -> Result<u8, DecodeErr> {
    160         let mut nb = 0;
    161         while nb_bits > 0 {
    162             // Load buff if empty
    163             if buff_bits == 0 {
    164                 buff = *iter.next().ok_or(DecodeErr::UnexpectedEOF)?;
    165                 buff_bits = 8;
    166             }
    167             // Amount of bits we can read from buff
    168             let readable = buff_bits.min(nb_bits);
    169             // Remove non writable bits
    170             let rmv_left = buff << (8 - buff_bits);
    171             // Align remaining bits with nb blank bits
    172             let align = rmv_left >> (8 - readable);
    173             // Read bits from buff
    174             nb = (nb << readable) | align;
    175             buff_bits -= readable;
    176             nb_bits -= readable;
    177         }
    178         Ok(nb)
    179     };
    180 
    181     loop {
    182         let encoded = match read_nb(5)? {
    183             TERMINATOR => break,
    184             EXTENDED => read_nb(6)? + EXTENDED,
    185             nb => nb,
    186         };
    187         buf.push(unpack_ascii(encoded) as char);
    188     }
    189 
    190     Ok(buf)
    191 }
    192 
    193 #[cfg(test)]
    194 #[macro_use(quickcheck)]
    195 extern crate quickcheck_macros;
    196 
    197 #[cfg(test)]
    198 mod test {
    199     use std::str::FromStr;
    200 
    201     use serde_json::Value;
    202 
    203     use crate::{EXTENDED, pack_ascii, pack_uri, supported_ascii, unpack_ascii, unpack_uri};
    204 
    205     /// Ascii char that can be packed into 5 bits
    206     const PACKED: [u8; 30] = [
    207         b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o',
    208         b'p', b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'.', b'/', b'-', b'%',
    209     ];
    210 
    211     #[test]
    212     /// Check support every packable ascii character is packed
    213     fn packed() {
    214         for c in PACKED {
    215             assert!(pack_ascii(c) < EXTENDED);
    216         }
    217     }
    218 
    219     #[test]
    220     /// Check support every ascii graphic character and space
    221     fn supported() {
    222         for c in (0..=255u8).filter(supported_ascii) {
    223             assert_eq!(unpack_ascii(pack_ascii(c)), c);
    224         }
    225     }
    226 
    227     #[test]
    228     /// Check error on unsupported char
    229     fn unsupported() {
    230         for c in (0..=255u8).filter(|c| !supported_ascii(c)) {
    231             let string = String::from(c as char);
    232             assert!(pack_uri(&string).is_err());
    233         }
    234     }
    235 
    236     #[test]
    237     fn url_simple() {
    238         let mut majestic =
    239             csv::Reader::from_reader(include_str!("majestic_million.csv").as_bytes());
    240         for record in majestic.records() {
    241             let domain = &record.unwrap()[2];
    242             let encoded = pack_uri(domain).unwrap();
    243             let decoded = unpack_uri(&encoded).unwrap();
    244             assert_eq!(domain, decoded);
    245         }
    246     }
    247 
    248     #[test]
    249     fn url_complex() {
    250         let mut json = Value::from_str(include_str!("urltestdata.json"))
    251             .expect("JSON parse error in urltestdata.json");
    252         for entry in json.as_array_mut().unwrap() {
    253             if entry.is_string() {
    254                 continue; // ignore comments
    255             }
    256 
    257             let href = entry.get("href").and_then(|it| it.as_str()).unwrap_or("");
    258             if href.chars().any(|c| !c.is_ascii_graphic() || c != ' ') {
    259                 continue; // extended ascii
    260             }
    261             let encoded = pack_uri(&href).expect(&format!("Failed to encode {}", &href));
    262             let decoded =
    263                 unpack_uri(&encoded).expect(&format!("Failed to decode encoded {}", &href));
    264             assert_eq!(href, decoded);
    265         }
    266     }
    267 
    268     #[quickcheck]
    269     fn fuzz(input: String) -> bool {
    270         if input.as_bytes().iter().all(supported_ascii) {
    271             let packed = pack_uri(&input).unwrap();
    272             let unpacked = unpack_uri(&packed).unwrap();
    273             input == unpacked
    274         } else {
    275             true
    276         }
    277     }
    278 }