commit 99f474072cebe75081d835c02838553b734b87fc
parent 676a7ee88d82f734c729270a7ccefad957b90549
Author: Antoine A <>
Date: Thu, 25 Nov 2021 20:55:03 +0100
pack bits for real
Diffstat:
| M | uri-pack/src/main.rs | | | 86 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------- |
1 file changed, 70 insertions(+), 16 deletions(-)
diff --git a/uri-pack/src/main.rs b/uri-pack/src/main.rs
@@ -154,29 +154,72 @@ pub fn encode_str(str: &str) -> Result<Vec<u8>, EncodeErr> {
assert!(str.as_bytes().iter().all(|c| supported_char(*c as char)));
+ // Amount of pending bits stored in buffer.
+ let mut buffer_bits = 0u8;
+ // Holds pending bits beginning from the most significant bits
+ let mut buffer: u8 = 0;
+    // Appends the `nb_bits` lowest bits of `nb` to the output, most significant
+    // bit first. Bits accumulate in `buffer`; every time it holds 8 bits the
+    // byte is pushed to `vec` and the buffer is reset.
+    let mut write_bits = |nb: u8, mut nb_bits: u8| {
+        while nb_bits > 0 {
+            // Free slots left in the pending byte.
+            let free = 8 - buffer_bits;
+            // Number of bits taken from `nb` on this pass.
+            let chunk = nb_bits.min(free);
+            // Drop the low bits that do not fit yet, then align the chunk on the
+            // top of the byte, which also discards bits written on earlier passes.
+            let top_aligned = (nb >> (nb_bits - chunk)) << (8 - chunk);
+            // Slide the chunk right behind the bits that are already pending.
+            buffer |= top_aligned >> buffer_bits;
+            buffer_bits += chunk;
+            nb_bits -= chunk;
+            if buffer_bits == 8 {
+                // The byte is complete: emit it and start a fresh one.
+                vec.push(buffer);
+                buffer = 0;
+                buffer_bits = 0;
+            }
+        }
+    };
+
for c in str.bytes() {
- let nb = match encode_ascii(c) {
- Encoded::Simple(nb) => nb,
+ match encode_ascii(c) {
+ Encoded::Simple(nb) => write_bits(nb, 5),
Encoded::Extended(nb) => {
- vec.push(EXTENDED);
- nb
+ write_bits(EXTENDED, 5);
+ write_bits(nb, 6);
}
- };
- vec.push(nb);
+ }
+ }
+
+ if buffer_bits > 0 {
+ vec.push(buffer);
}
return Ok(vec);
}
-pub fn decode_str(bytes: &[u8]) -> Result<String, DecodeErr> {
- let mut buf = String::new();
+pub fn decode_str(bytes: &[u8], len: usize) -> Result<String, DecodeErr> {
+ let mut buf = String::with_capacity(len);
let mut iter = bytes.iter();
+ // Amount of pending bits stored in buffer.
+ let mut buffer_bits = 0u8;
+ // Holds pending bits beginning from the most significant bits
+ let mut buffer: u8 = 0;
+    // Reads the next `nb_bits` bits from the input, most significant bit first,
+    // and returns them right-aligned in a `u8`. A new byte is pulled from `iter`
+    // whenever `buffer` has no unread bits left.
+    //
+    // Panics (through `unwrap`) when the input has no byte left to read.
+    // NOTE(review): reporting truncated input as a `DecodeErr` would likely be
+    // preferable; the error type is not visible from here.
+    let mut read_nb = |mut nb_bits: u8| -> u8 {
+        // Start from an empty accumulator so that only bits read from the input
+        // end up in the result, whatever `nb_bits` is.
+        let mut nb = 0;
+        while nb_bits > 0 {
+            if buffer_bits == 0 {
+                buffer = *iter.next().unwrap();
+                buffer_bits = 8;
+            }
+            let readable = buffer_bits.min(nb_bits);
+            // Shift left to drop the bits already consumed, then shift right so
+            // the next `readable` bits sit in the lowest positions.
+            let mask = (buffer << (8 - buffer_bits)) >> (8 - readable);
+            nb = (nb << readable) | mask;
+            buffer_bits -= readable;
+            nb_bits -= readable;
+        }
+        nb
+    };
- while let Some(next) = iter.next() {
- let encoded = if *next == EXTENDED {
- Encoded::Extended(*iter.next().unwrap())
- } else {
- Encoded::Simple(*next)
+ for _ in 0..len {
+ let encoded = match read_nb(5) {
+ EXTENDED => Encoded::Extended(read_nb(6)),
+ nb => Encoded::Simple(nb),
};
buf.push(decode_ascii(encoded) as char);
}
@@ -205,7 +248,18 @@ mod test {
}
#[test]
- fn url_test() {
+    fn url_simple() {
+        // Round-trip the third column of every record in the bundled CSV
+        // through the encoder and the decoder.
+        let csv_bytes = include_str!("majestic_million.csv").as_bytes();
+        let mut reader = csv::Reader::from_reader(csv_bytes);
+        for row in reader.records() {
+            let row = row.unwrap();
+            let domain = &row[2];
+            let packed = encode_str(domain).unwrap();
+            let decoded = decode_str(&packed, domain.len()).unwrap();
+            assert_eq!(domain, decoded);
+        }
+    }
+
+ #[test]
+ fn url_complex() {
let mut json = Value::from_str(include_str!("urltestdata.json"))
.expect("JSON parse error in urltestdata.json");
for entry in json.as_array_mut().unwrap() {
@@ -218,8 +272,8 @@ mod test {
continue; // extended ascii
}
let encoded = encode_str(&href).expect(&format!("Failed to encode {}", &href));
- let decoded =
- decode_str(&encoded).expect(&format!("Failed to decode encoded {}", &href));
+ let decoded = decode_str(&encoded, href.len())
+ .expect(&format!("Failed to decode encoded {}", &href));
assert_eq!(href, decoded);
}
}