From 5c0397c4562a420bb933f179f51770a71b407dd0 Mon Sep 17 00:00:00 2001
From: xffxff <1247714429@qq.com>
Date: Thu, 24 Aug 2023 09:29:08 +0800
Subject: [PATCH 1/7] add some comments to filter_with_lists and add a basic test

---
 Cargo.lock                |  1 +
 Cargo.toml                |  1 +
 src/results/aggregator.rs | 66 ++++++++++++++++++++++++++++++++++++++-
 3 files changed, 67 insertions(+), 1 deletion(-)

diff --git a/Cargo.lock b/Cargo.lock
index eccdff7..1af829d 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3566,6 +3566,7 @@ dependencies = [
  "scraper",
  "serde",
  "serde_json",
+ "tempfile",
  "tokio 1.32.0",
 ]
 
diff --git a/Cargo.toml b/Cargo.toml
index c5f9013..9e92f5b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -27,6 +27,7 @@ once_cell = {version="1.18.0"}
 error-stack = {version="0.3.1"}
 async-trait = {version="0.1.73"}
 regex = {version="1.9.3", features=["perf"]}
+tempfile = "3.8.0"
 
 [dev-dependencies]
 rusty-hook = "^0.11.2"
diff --git a/src/results/aggregator.rs b/src/results/aggregator.rs
index 23ed091..e753aea 100644
--- a/src/results/aggregator.rs
+++ b/src/results/aggregator.rs
@@ -175,22 +175,86 @@ pub async fn aggregate(
     ))
 }
 
-fn filter_with_lists(
+/// Filters a map of search results using a list of regex patterns.
+///
+/// # Arguments
+///
+/// * `map_to_be_filtered` - A mutable reference to a `HashMap` of search results to filter; entries that match a pattern are removed from this map.
+/// * `resultant_map` - A mutable reference to a `HashMap` to hold the filtered results.
+/// * `file_path` - A `&str` representing the path to a file containing regex patterns to use for filtering.
+///
+/// # Errors
+///
+/// Returns an error if the file at `file_path` cannot be opened or read, or if a regex pattern is invalid.
+pub fn filter_with_lists(
     map_to_be_filtered: &mut HashMap<String, SearchResult>,
     resultant_map: &mut HashMap<String, SearchResult>,
     file_path: &str,
 ) -> Result<(), Box<dyn std::error::Error>> {
     let mut reader = BufReader::new(File::open(file_path)?);
+
     for line in reader.by_ref().lines() {
         let re = Regex::new(&line?)?;
+
+        // Iterate over each search result in the map and check if it matches the regex pattern
         for (url, search_result) in map_to_be_filtered.clone().into_iter() {
             if re.is_match(&url.to_lowercase())
                 || re.is_match(&search_result.title.to_lowercase())
                 || re.is_match(&search_result.description.to_lowercase())
             {
+                // If the search result matches the regex pattern, move it from the original map to the resultant map
                 resultant_map.insert(url.clone(), map_to_be_filtered.remove(&url).unwrap());
             }
         }
     }
+
     Ok(())
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::collections::HashMap;
+    use std::io::Write;
+    use tempfile::NamedTempFile;
+
+    #[test]
+    fn test_filter_with_lists() -> Result<(), Box<dyn std::error::Error>> {
+        // Create a map of search results to filter
+        let mut map_to_be_filtered = HashMap::new();
+        map_to_be_filtered.insert(
+            "https://www.example.com".to_string(),
+            SearchResult {
+                title: "Example Domain".to_string(),
+                url: "https://www.example.com".to_string(),
+                description: "This domain is for use in illustrative examples in documents.".to_string(),
+                engine: vec!["Google".to_string(), "Bing".to_string()],
+            },
+        );
+        map_to_be_filtered.insert(
+            "https://www.rust-lang.org/".to_string(),
+            SearchResult {
+                title: "Rust Programming Language".to_string(),
+                url: "https://www.rust-lang.org/".to_string(),
+                description: "A systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.".to_string(),
+                engine: vec!["Google".to_string(), "DuckDuckGo".to_string()],
+            },
+        );
+
+        // Create a temporary file with regex patterns
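+        // `NamedTempFile` deletes the file automatically when the handle is dropped
+        // at the end of the test, so no explicit cleanup is needed.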
+        let mut file = NamedTempFile::new()?;
+        writeln!(file, "example")?;
+        writeln!(file, "rust")?;
+        file.flush()?;
+
+        let mut resultant_map = HashMap::new();
+        filter_with_lists(&mut map_to_be_filtered, &mut resultant_map, file.path().to_str().unwrap())?;
+
+        assert_eq!(resultant_map.len(), 2);
+        assert!(resultant_map.contains_key("https://www.example.com"));
+        assert!(resultant_map.contains_key("https://www.rust-lang.org/"));
+        assert_eq!(map_to_be_filtered.len(), 0);
+
+        Ok(())
+    }
+}
\ No newline at end of file

From 4280545e8c1078187486958209b5f82ed660c615 Mon Sep 17 00:00:00 2001
From: xffxff <1247714429@qq.com>
Date: Thu, 24 Aug 2023 09:32:22 +0800
Subject: [PATCH 2/7] add a test for non-existent file

---
 src/results/aggregator.rs | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/results/aggregator.rs b/src/results/aggregator.rs
index e753aea..c2d6885 100644
--- a/src/results/aggregator.rs
+++ b/src/results/aggregator.rs
@@ -257,4 +257,16 @@ mod tests {
 
         Ok(())
     }
+
+    #[test]
+    fn test_filter_with_lists_file_not_found() {
+        let mut map_to_be_filtered = HashMap::new();
+
+        let mut resultant_map = HashMap::new();
+
+        // Call the `filter_with_lists` function with a non-existent file path
+        let result = filter_with_lists(&mut map_to_be_filtered, &mut resultant_map, "non-existent-file.txt");
+
+        assert!(result.is_err());
+    }
 }
\ No newline at end of file

From a2fc10ca3943b920d63b668d5a8935f4ea842d0b Mon Sep 17 00:00:00 2001
From: xffxff <1247714429@qq.com>
Date: Thu, 24 Aug 2023 09:36:08 +0800
Subject: [PATCH 3/7] add a test for invalid regex

---
 src/results/aggregator.rs | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/src/results/aggregator.rs b/src/results/aggregator.rs
index c2d6885..c2a2cdb 100644
--- a/src/results/aggregator.rs
+++ b/src/results/aggregator.rs
@@ -269,4 +269,29 @@ mod tests {
 
         assert!(result.is_err());
     }
+
+    #[test]
+    fn test_filter_with_lists_invalid_regex() {
+        let mut map_to_be_filtered = HashMap::new();
+        map_to_be_filtered.insert(
+            "https://www.example.com".to_string(),
+            SearchResult {
+                title: "Example Domain".to_string(),
+                url: "https://www.example.com".to_string(),
+                description: "This domain is for use in illustrative examples in documents.".to_string(),
+                engine: vec!["Google".to_string(), "Bing".to_string()],
+            },
+        );
+
+        let mut resultant_map = HashMap::new();
+
+        // Create a temporary file with an invalid regex pattern
+        let mut file = NamedTempFile::new().unwrap();
+        writeln!(file, "example(").unwrap();
+        file.flush().unwrap();
+
+        let result = filter_with_lists(&mut map_to_be_filtered, &mut resultant_map, file.path().to_str().unwrap());
+
+        assert!(result.is_err());
+}
 }
\ No newline at end of file

From 23ff24bdf3b6067aa38f2ffb61112c424c1d0a49 Mon Sep 17 00:00:00 2001
From: xffxff <1247714429@qq.com>
Date: Thu, 24 Aug 2023 09:46:01 +0800
Subject: [PATCH 4/7] add a test to check if the regex wildcard .* matches any character

---
 src/results/aggregator.rs | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/src/results/aggregator.rs b/src/results/aggregator.rs
index c2a2cdb..9b93f18 100644
--- a/src/results/aggregator.rs
+++ b/src/results/aggregator.rs
@@ -258,6 +258,45 @@ mod tests {
         Ok(())
     }
 
+    #[test]
+    fn test_filter_with_lists_wildcard() -> Result<(), Box<dyn std::error::Error>> {
+        let mut map_to_be_filtered = HashMap::new();
+        map_to_be_filtered.insert(
+            "https://www.example.com".to_string(),
+            SearchResult {
+                title: "Example Domain".to_string(),
+                url: "https://www.example.com".to_string(),
"https://www.example.com".to_string(), + description: "This domain is for use in illustrative examples in documents.".to_string(), + engine: vec!["Google".to_string(), "Bing".to_string()], + }, + ); + map_to_be_filtered.insert( + "https://www.rust-lang.org/".to_string(), + SearchResult { + title: "Rust Programming Language".to_string(), + url: "https://www.rust-lang.org/".to_string(), + description: "A systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.".to_string(), + engine: vec!["Google".to_string(), "DuckDuckGo".to_string()], + }, + ); + + // Create a temporary file with a regex pattern containing a wildcard + let mut file = NamedTempFile::new()?; + writeln!(file, "ex.*le")?; + file.flush()?; + + let mut resultant_map = HashMap::new(); + + filter_with_lists(&mut map_to_be_filtered, &mut resultant_map, file.path().to_str().unwrap())?; + + assert_eq!(resultant_map.len(), 1); + assert!(resultant_map.contains_key("https://www.example.com")); + assert_eq!(map_to_be_filtered.len(), 1); + assert!(map_to_be_filtered.contains_key("https://www.rust-lang.org/")); + + Ok(()) + } + #[test] fn test_filter_with_lists_file_not_found() { let mut map_to_be_filtered = HashMap::new(); From c3a7c917f66969f0e77dbc8ac47ec36de9881b26 Mon Sep 17 00:00:00 2001 From: xffxff <1247714429@qq.com> Date: Thu, 24 Aug 2023 09:50:19 +0800 Subject: [PATCH 5/7] make format happy --- src/results/aggregator.rs | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/src/results/aggregator.rs b/src/results/aggregator.rs index 9b93f18..3f06ecb 100644 --- a/src/results/aggregator.rs +++ b/src/results/aggregator.rs @@ -227,7 +227,8 @@ mod tests { SearchResult { title: "Example Domain".to_string(), url: "https://www.example.com".to_string(), - description: "This domain is for use in illustrative examples in documents.".to_string(), + description: "This domain is for use in illustrative examples in documents." + .to_string(), engine: vec!["Google".to_string(), "Bing".to_string()], }, ); @@ -248,7 +249,11 @@ mod tests { file.flush()?; let mut resultant_map = HashMap::new(); - filter_with_lists(&mut map_to_be_filtered, &mut resultant_map, file.path().to_str().unwrap())?; + filter_with_lists( + &mut map_to_be_filtered, + &mut resultant_map, + file.path().to_str().unwrap(), + )?; assert_eq!(resultant_map.len(), 2); assert!(resultant_map.contains_key("https://www.example.com")); @@ -266,7 +271,8 @@ mod tests { SearchResult { title: "Example Domain".to_string(), url: "https://www.example.com".to_string(), - description: "This domain is for use in illustrative examples in documents.".to_string(), + description: "This domain is for use in illustrative examples in documents." 
+                    .to_string(),
                 engine: vec!["Google".to_string(), "Bing".to_string()],
             },
         );
@@ -287,7 +293,11 @@ mod tests {
 
         let mut resultant_map = HashMap::new();
 
-        filter_with_lists(&mut map_to_be_filtered, &mut resultant_map, file.path().to_str().unwrap())?;
+        filter_with_lists(
+            &mut map_to_be_filtered,
+            &mut resultant_map,
+            file.path().to_str().unwrap(),
+        )?;
 
         assert_eq!(resultant_map.len(), 1);
         assert!(resultant_map.contains_key("https://www.example.com"));
@@ -304,7 +314,11 @@ mod tests {
         let mut resultant_map = HashMap::new();
 
         // Call the `filter_with_lists` function with a non-existent file path
-        let result = filter_with_lists(&mut map_to_be_filtered, &mut resultant_map, "non-existent-file.txt");
+        let result = filter_with_lists(
+            &mut map_to_be_filtered,
+            &mut resultant_map,
+            "non-existent-file.txt",
+        );
 
         assert!(result.is_err());
     }
@@ -317,7 +331,8 @@ mod tests {
             SearchResult {
                 title: "Example Domain".to_string(),
                 url: "https://www.example.com".to_string(),
-                description: "This domain is for use in illustrative examples in documents.".to_string(),
+                description: "This domain is for use in illustrative examples in documents."
+                    .to_string(),
                 engine: vec!["Google".to_string(), "Bing".to_string()],
             },
         );
@@ -329,8 +344,12 @@ mod tests {
         writeln!(file, "example(").unwrap();
         file.flush().unwrap();
 
-        let result = filter_with_lists(&mut map_to_be_filtered, &mut resultant_map, file.path().to_str().unwrap());
+        let result = filter_with_lists(
+            &mut map_to_be_filtered,
+            &mut resultant_map,
+            file.path().to_str().unwrap(),
+        );
 
         assert!(result.is_err());
+    }
 }
-}
\ No newline at end of file

From e5a022776246ce068aca69f913c4ab49ca57c3b2 Mon Sep 17 00:00:00 2001
From: xffxff <1247714429@qq.com>
Date: Thu, 24 Aug 2023 17:10:40 +0800
Subject: [PATCH 6/7] put `tempfile` under `dev-dependencies`

---
 Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cargo.toml b/Cargo.toml
index 9e92f5b..31f29cd 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -27,11 +27,11 @@ once_cell = {version="1.18.0"}
 error-stack = {version="0.3.1"}
 async-trait = {version="0.1.73"}
 regex = {version="1.9.3", features=["perf"]}
-tempfile = "3.8.0"
 
 [dev-dependencies]
 rusty-hook = "^0.11.2"
 criterion = "0.5.1"
+tempfile = "3.8.0"
 
 [profile.dev]
 opt-level = 0

From 64948b84f1eb5ccd66af6ff8da808d05ad66864e Mon Sep 17 00:00:00 2001
From: xffxff <1247714429@qq.com>
Date: Thu, 24 Aug 2023 17:11:09 +0800
Subject: [PATCH 7/7] bump version

---
 Cargo.lock | 2 +-
 Cargo.toml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 1af829d..412ae83 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3543,7 +3543,7 @@
 
 [[package]]
 name = "websurfx"
-version = "0.18.0"
+version = "0.18.1"
 dependencies = [
  "actix-cors",
  "actix-files",

diff --git a/Cargo.toml b/Cargo.toml
index 31f29cd..d36117b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "websurfx"
-version = "0.18.0"
+version = "0.18.1"
 edition = "2021"
 description = "An open-source alternative to Searx that provides clean, ad-free, and organic results with incredible speed while keeping privacy and security in mind."
 repository = "https://github.com/neon-mmd/websurfx"
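
Note for reviewers: the tests added in patches 1-4 can be run on their own with cargo's test-name substring filter, e.g. `cargo test filter_with_lists` from the repository root (assuming a standard Rust toolchain); the filter matches every `test_filter_with_lists*` function introduced above.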