lib.rs (9239B)
/*
  This file is part of TALER
  Copyright (C) 2022-2025 Taler Systems SA

  TALER is free software; you can redistribute it and/or modify it under the
  terms of the GNU Affero General Public License as published by the Free Software
  Foundation; either version 3, or (at your option) any later version.

  TALER is distributed in the hope that it will be useful, but WITHOUT ANY
  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more details.

  You should have received a copy of the GNU Affero General Public License along with
  TALER; see the file COPYING.  If not, see <http://www.gnu.org/licenses/>
*/
//!
//! uri-pack is an efficient binary format for URIs
//!
//! ## Format
//!
//! The most commonly used characters (a-z . / - %) are encoded using 5 bits,
//! the remaining ASCII characters are encoded using 11 bits. If more than half
//! of the characters in a URI are encoded with 5 bits, the encoded size is
//! smaller than a simple ASCII format.
//!
//! On the majestic_million database, 98.77% of the domain names were smaller,
//! going from an average of 14 bytes to an average of 10 bytes.
//!
//! ## Usage
//!
//! ``` rust
//! use uri_pack::{pack_uri, unpack_uri};
//!
//! let domain = "http://example.com/static_file/image.png";
//! let encoded = pack_uri(domain).unwrap();
//! let decoded = unpack_uri(&encoded).unwrap();
//! assert_eq!(domain, decoded);
//! ```
//!
40 41 /// Pack an URI ascii char 42 /// Panic if char not supported 43 fn pack_ascii(c: u8) -> u8 { 44 [ 45 67, 68, 69, 70, 29, 71, 72, 73, 74, 75, 76, 77, 28, 26, 27, 57, 58, 59, 60, 61, 62, 63, 64, 46 65, 66, 78, 79, 80, 81, 82, 83, 84, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 47 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 85, 86, 87, 88, 30, 89, 0, 1, 2, 3, 4, 5, 48 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 90, 91, 92, 93, 49 ][(c - b'!') as usize] 50 } 51 52 /// Unpack an URI ascii char 53 /// Panic if char not supported 54 fn unpack_ascii(c: u8) -> u8 { 55 [ 56 b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', 57 b'p', b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'.', b'/', b'-', b'%', 58 b'_', b'A', b'B', b'C', b'D', b'E', b'F', b'G', b'H', b'I', b'J', b'K', b'L', b'M', b'N', 59 b'O', b'P', b'Q', b'R', b'S', b'T', b'U', b'V', b'W', b'X', b'Y', b'Z', b'0', b'1', b'2', 60 b'3', b'4', b'5', b'6', b'7', b'8', b'9', b'!', b'"', b'#', b'$', b'&', b'\'', b'(', b')', 61 b'*', b'+', b',', b':', b';', b'<', b'=', b'>', b'?', b'@', b'[', b'\\', b']', b'^', b'`', 62 b'{', b'|', b'}', b'~', 63 ][c as usize] 64 } 65 66 /// Check if an ascii char is supported by the encoding 67 fn supported_ascii(c: &u8) -> bool { 68 (b'!'..=b'~').contains(c) 69 } 70 71 /// Extended packing limit 72 const EXTENDED: u8 = 30; 73 /// EOF u5 encoding 74 const TERMINATOR: u8 = 31; 75 76 #[derive(Debug, Clone, Copy, thiserror::Error)] 77 pub enum EncodeErr { 78 #[error("{0} is not a valid uri char")] 79 UnsupportedChar(u8), 80 } 81 82 #[derive(Debug, Clone, Copy, thiserror::Error)] 83 pub enum DecodeErr { 84 #[error("An extended encoded char have been passed as an simple one")] 85 ExpectedExtended, 86 #[error("{0} is not an simple encoded char")] 87 UnexpectedSimpleChar(u8), 88 #[error("{0} is not an extended encoded char")] 89 UnexpectedExtendedChar(u8), 90 #[error("Missing bits")] 91 
UnexpectedEOF, 92 } 93 94 /// Pack an uri string into an optimized binary format 95 pub fn pack_uri(uri: &str) -> Result<Vec<u8>, EncodeErr> { 96 let len = uri.len(); 97 let mut vec = Vec::with_capacity(len); 98 99 if let Some(char) = uri.as_bytes().iter().find(|c| !supported_ascii(c)) { 100 return Err(EncodeErr::UnsupportedChar(*char)); 101 } 102 103 // Holds [buff_bits] pending bits beginning from the most significant bits 104 let (mut buff, mut buff_bits) = (0u8, 0u8); 105 106 // Write [nb_bits] less significant bits from [nb] to [buff] 107 let mut write_bits = |nb: u8, mut nb_bits: u8| { 108 while nb_bits > 0 { 109 // Amount of bits we can write in buffer 110 let writable = (8 - buff_bits).min(nb_bits); 111 // Remove non writable bits 112 let rmv_right = nb >> (nb_bits - writable); 113 let rmv_left = rmv_right << (8 - writable); 114 // Align remaining bits with buff blank bits 115 let align = rmv_left >> buff_bits; 116 117 // Write bits in buffer 118 buff |= align; 119 buff_bits += writable; 120 nb_bits -= writable; 121 122 // Store buffer if full 123 if buff_bits == 8 { 124 vec.push(buff); 125 buff = 0; 126 buff_bits = 0; 127 } 128 } 129 }; 130 131 for c in uri.bytes() { 132 let nb = pack_ascii(c); 133 if nb < EXTENDED { 134 write_bits(nb, 5) 135 } else { 136 write_bits(EXTENDED, 5); 137 write_bits(nb - EXTENDED, 6); 138 } 139 } 140 write_bits(TERMINATOR, 5); 141 142 // Push pending buffer if not empty 143 if buff_bits > 0 { 144 vec.push(buff); 145 } 146 147 Ok(vec) 148 } 149 150 /// Unpack an uri string from its optimized binary format 151 pub fn unpack_uri(bytes: &[u8]) -> Result<String, DecodeErr> { 152 let mut buf = String::with_capacity(bytes.len()); 153 let mut iter = bytes.iter(); 154 155 // Holds [buff_bits] pending bits beginning from the most significant bits 156 let (mut buff, mut buff_bits) = (0u8, 0u8); 157 158 // Write [nb_bits] less significant bits from [buff] to [nb] 159 let mut read_nb = |mut nb_bits: u8| -> Result<u8, DecodeErr> { 160 let 
mut nb = 0; 161 while nb_bits > 0 { 162 // Load buff if empty 163 if buff_bits == 0 { 164 buff = *iter.next().ok_or(DecodeErr::UnexpectedEOF)?; 165 buff_bits = 8; 166 } 167 // Amount of bits we can read from buff 168 let readable = buff_bits.min(nb_bits); 169 // Remove non writable bits 170 let rmv_left = buff << (8 - buff_bits); 171 // Align remaining bits with nb blank bits 172 let align = rmv_left >> (8 - readable); 173 // Read bits from buff 174 nb = (nb << readable) | align; 175 buff_bits -= readable; 176 nb_bits -= readable; 177 } 178 Ok(nb) 179 }; 180 181 loop { 182 let encoded = match read_nb(5)? { 183 TERMINATOR => break, 184 EXTENDED => read_nb(6)? + EXTENDED, 185 nb => nb, 186 }; 187 buf.push(unpack_ascii(encoded) as char); 188 } 189 190 Ok(buf) 191 } 192 193 #[cfg(test)] 194 #[macro_use(quickcheck)] 195 extern crate quickcheck_macros; 196 197 #[cfg(test)] 198 mod test { 199 use std::str::FromStr; 200 201 use serde_json::Value; 202 203 use crate::{EXTENDED, pack_ascii, pack_uri, supported_ascii, unpack_ascii, unpack_uri}; 204 205 /// Ascii char that can be packed into 5 bits 206 const PACKED: [u8; 30] = [ 207 b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', 208 b'p', b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'.', b'/', b'-', b'%', 209 ]; 210 211 #[test] 212 /// Check support every packable ascii character is packed 213 fn packed() { 214 for c in PACKED { 215 assert!(pack_ascii(c) < EXTENDED); 216 } 217 } 218 219 #[test] 220 /// Check support every ascii graphic character and space 221 fn supported() { 222 for c in (0..=255u8).filter(supported_ascii) { 223 assert_eq!(unpack_ascii(pack_ascii(c)), c); 224 } 225 } 226 227 #[test] 228 /// Check error on unsupported char 229 fn unsupported() { 230 for c in (0..=255u8).filter(|c| !supported_ascii(c)) { 231 let string = String::from(c as char); 232 assert!(pack_uri(&string).is_err()); 233 } 234 } 235 236 #[test] 237 fn url_simple() { 238 let mut 
majestic = 239 csv::Reader::from_reader(include_str!("majestic_million.csv").as_bytes()); 240 for record in majestic.records() { 241 let domain = &record.unwrap()[2]; 242 let encoded = pack_uri(domain).unwrap(); 243 let decoded = unpack_uri(&encoded).unwrap(); 244 assert_eq!(domain, decoded); 245 } 246 } 247 248 #[test] 249 fn url_complex() { 250 let mut json = Value::from_str(include_str!("urltestdata.json")) 251 .expect("JSON parse error in urltestdata.json"); 252 for entry in json.as_array_mut().unwrap() { 253 if entry.is_string() { 254 continue; // ignore comments 255 } 256 257 let href = entry.get("href").and_then(|it| it.as_str()).unwrap_or(""); 258 if href.chars().any(|c| !c.is_ascii_graphic() || c != ' ') { 259 continue; // extended ascii 260 } 261 let encoded = pack_uri(&href).expect(&format!("Failed to encode {}", &href)); 262 let decoded = 263 unpack_uri(&encoded).expect(&format!("Failed to decode encoded {}", &href)); 264 assert_eq!(href, decoded); 265 } 266 } 267 268 #[quickcheck] 269 fn fuzz(input: String) -> bool { 270 if input.as_bytes().iter().all(supported_ascii) { 271 let packed = pack_uri(&input).unwrap(); 272 let unpacked = unpack_uri(&packed).unwrap(); 273 input == unpacked 274 } else { 275 true 276 } 277 } 278 }