robocop

Checks KYC attributes against sanction lists
Log | Files | Refs | Submodules | README | LICENSE

commit 09e679666a8461e79a5f32eec3341947e9624985
parent fb27b3edb21d058307ce9031670056ac29e6e698
Author: Christian Grothoff <christian@grothoff.org>
Date:   Sun,  8 Jun 2025 01:12:55 +0200

rewrite robocop in rust

Diffstat:
A Cargo.toml | 7 +++++++
A src/main.rs | 281 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 288 insertions(+), 0 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml @@ -0,0 +1,7 @@ +[package] +name = "robocop" +version = "0.1.0" +edition = "2021" + +[dependencies] +serde_json = "1.0" diff --git a/src/main.rs b/src/main.rs @@ -0,0 +1,281 @@ +// This file is part of Robocop +// +// Robocop is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Robocop is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see <https://www.gnu.org/licenses/>. +// +// Copyright (C) 2025 Taler Systems SA + +use std::collections::HashMap; +use std::env; +use std::fs; +use std::io::{self, BufRead, BufReader}; +use serde_json::{Value, Map}; + +// Finite State Machine for efficient string matching +#[derive(Debug, Clone)] +struct Matching { + // Maps (state, char) -> new_state + transitions: HashMap<(usize, char), usize>, + // Final states with their associated values and costs + final_states: HashMap<usize, (String, usize)>, + max_state: usize, +} + +impl Matching { + fn new() -> Self { + Self { + transitions: HashMap::new(), + final_states: HashMap::new(), + max_state: 0, + } + } + + + fn add_string(&mut self, s: &str) { + let chars: Vec<char> = s.chars().collect(); + let mut state_ids = Vec::new(); + + // Pre-allocate state IDs to avoid multiple mutable borrows + for _ in 0..=chars.len() { + let state_id = self.max_state; + self.max_state += 1; + state_ids.push(state_id); + } + + // Add final states + for (i, &state_id) in state_ids.iter().enumerate() { + self.final_states.insert( + state_id, + (s.to_string(), chars.len() - i) + ); + } + + // 
Build transitions for exact matches + for (i, &ch) in chars.iter().enumerate() { + let current_state = state_ids[i]; + let next_state = state_ids[i + 1]; + self.transitions.insert((current_state, ch), next_state); + } + } + + + fn find_best_match(&self, input: &str) -> Option<(String, f64)> { + let mut best_match = None; + let mut best_score = 0.0; + + for (_, (candidate, _)) in &self.final_states { + let distance = levenshtein_distance(input, candidate); + let max_len = input.len().max(candidate.len()); + let score = if max_len == 0 { + 1.0 + } else { + 1.0 - (distance as f64 / max_len as f64) + }; + + if score > best_score { + best_score = score; + best_match = Some((candidate.clone(), score)); + } + } + + best_match + } +} + +// Record structure for matching +#[derive(Debug, Clone)] +struct Record { + ssid: String, + fields: HashMap<String, Matching>, +} + +impl Record { + fn new(ssid: String) -> Self { + Self { + ssid, + fields: HashMap::new(), + } + } + + fn add_field_values(&mut self, key: &str, values: &[String]) { + let mut fsm = Matching::new(); + for value in values { + fsm.add_string(value); + } + self.fields.insert(key.to_string(), fsm); + } +} + +// Matching engine +struct MatchingEngine { + records: Vec<Record>, +} + +impl MatchingEngine { + fn new() -> Self { + Self { + records: Vec::new(), + } + } + + fn load_from_json(&mut self, filename: &str) -> Result<(), Box<dyn std::error::Error>> { + let content = fs::read_to_string(filename)?; + let json_array: Vec<Value> = serde_json::from_str(&content)?; + + for (idx, item) in json_array.iter().enumerate() { + if let Value::Object(obj) = item { + let ssid = obj.get("ssid") + .and_then(|v| v.as_str()) + .unwrap_or(&format!("record_{}", idx)) + .to_string(); + + let mut record = Record::new(ssid); + + for (key, value) in obj { + if key == "ssid" { + continue; + } + + // Only process arrays + if let Value::Array(arr) = value { + let string_values: Vec<String> = arr + .iter() + .filter_map(|v| 
v.as_str().map(|s| s.to_string())) + .collect(); + + if !string_values.is_empty() { + record.add_field_values(key, &string_values); + } + } + } + + self.records.push(record); + } + } + + Ok(()) + } + + fn find_best_match(&self, input: &Map<String, Value>) -> (f64, f64, String) { + let mut best_overall_score = 0.0; + let mut best_ssid = String::new(); + let mut best_avg_score = 0.0; + let mut best_confidence = 0; + let mut max_fields = 0; + + for record in &self.records { + let mut total_score = 0.0; + let mut matching_fields = 0; + let total_fields = record.fields.len(); + + for (key, input_value) in input { + if let Some(input_str) = input_value.as_str() { + + if let Some(fsm) = record.fields.get(key) { + if let Some((_, score)) = fsm.find_best_match(input_str) { + total_score += score; + matching_fields += 1; + } + } + } + } + max_fields = max_fields.max(total_fields); + if total_fields > 0 { + if total_score > best_overall_score { + best_overall_score = total_score; + best_avg_score = total_score / matching_fields as f64; + best_confidence = matching_fields; + best_ssid = record.ssid.clone(); + } + } + } + + (best_avg_score, best_confidence as f64 / max_fields as f64, best_ssid) + } +} + +// Levenshtein distance implementation +fn levenshtein_distance(s1: &str, s2: &str) -> usize { + let chars1: Vec<char> = s1.chars().collect(); + let chars2: Vec<char> = s2.chars().collect(); + let len1 = chars1.len(); + let len2 = chars2.len(); + + let mut matrix = vec![vec![0; len2 + 1]; len1 + 1]; + + // Initialize first row and column + for i in 0..=len1 { + matrix[i][0] = i; + } + for j in 0..=len2 { + matrix[0][j] = j; + } + + // Fill the matrix + for i in 1..=len1 { + for j in 1..=len2 { + let cost = if chars1[i - 1] == chars2[j - 1] { 0 } else { 1 }; + matrix[i][j] = std::cmp::min( + std::cmp::min( + matrix[i - 1][j] + 1, // deletion + matrix[i][j - 1] + 1 // insertion + ), + matrix[i - 1][j - 1] + cost // substitution + ); + } + } + + matrix[len1][len2] +} + +fn main() 
-> Result<(), Box<dyn std::error::Error>> { + let args: Vec<String> = env::args().collect(); + if args.len() != 2 { + eprintln!("Usage: {} <json_file>", args[0]); + std::process::exit(1); + } + + let filename = &args[1]; + + // Load and pre-process the JSON database + let mut engine = MatchingEngine::new(); + engine.load_from_json(filename)?; + + // Read JSON objects from stdin + let stdin = io::stdin(); + let reader = BufReader::new(stdin); + + for line in reader.lines() { + let line = line?; + if line.trim().is_empty() { + continue; + } + + match serde_json::from_str::<Value>(&line) { + Ok(Value::Object(obj)) => { + let (quality, confidence, ssid) = engine.find_best_match(&obj); + println!("{:.6} {:.6} {}", quality, confidence, ssid); + } + Ok(_) => { + eprintln!("Warning: Skipping non-object JSON: {}", line); + std::process::exit(1); + } + Err(e) => { + eprintln!("Warning: Failed to parse JSON: {} - {}", line, e); + std::process::exit(1); + } + } + } + + Ok(()) +}