From 52f27655b88ed7797bf49a48380cb0f8ad337378 Mon Sep 17 00:00:00 2001 From: neon_arch Date: Tue, 6 Feb 2024 22:28:19 +0300 Subject: [PATCH] :zap: perf: replace hashmaps with vectors for fetching & aggregating results (#486) - replace hashmaps with vectors for fetching, collecting & aggregating results as it tends to be a contiguous & cache efficient data structure. - refactor & redesign algorithms for fetching & aggregating results centered around vectors in aggregate function. --- src/engines/bing.rs | 2 +- src/engines/brave.rs | 2 +- src/engines/duckduckgo.rs | 2 +- src/engines/librex.rs | 2 +- src/engines/mojeek.rs | 2 +- src/engines/search_result_parser.rs | 3 +- src/engines/searx.rs | 2 +- src/engines/startpage.rs | 2 +- src/models/engine_models.rs | 4 +- src/results/aggregator.rs | 107 +++++++++++++--------------- 10 files changed, 61 insertions(+), 67 deletions(-) diff --git a/src/engines/bing.rs b/src/engines/bing.rs index 84dbf93..ec582e4 100644 --- a/src/engines/bing.rs +++ b/src/engines/bing.rs @@ -48,7 +48,7 @@ impl SearchEngine for Bing { user_agent: &str, client: &Client, _safe_search: u8, - ) -> Result, EngineError> { + ) -> Result, EngineError> { // Bing uses `start results from this number` convention // So, for 10 results per page, page 0 starts at 1, page 1 // starts at 11, and so on. 
diff --git a/src/engines/brave.rs b/src/engines/brave.rs index 49626e3..65067fc 100644 --- a/src/engines/brave.rs +++ b/src/engines/brave.rs @@ -44,7 +44,7 @@ impl SearchEngine for Brave { user_agent: &str, client: &Client, safe_search: u8, - ) -> Result, EngineError> { + ) -> Result, EngineError> { let url = format!("https://search.brave.com/search?q={query}&offset={page}"); let safe_search_level = match safe_search { diff --git a/src/engines/duckduckgo.rs b/src/engines/duckduckgo.rs index c48522f..02ee481 100644 --- a/src/engines/duckduckgo.rs +++ b/src/engines/duckduckgo.rs @@ -47,7 +47,7 @@ impl SearchEngine for DuckDuckGo { user_agent: &str, client: &Client, _safe_search: u8, - ) -> Result, EngineError> { + ) -> Result, EngineError> { // Page number can be missing or empty string and so appropriate handling is required // so that upstream server recieves valid page number. let url: String = match page { diff --git a/src/engines/librex.rs b/src/engines/librex.rs index b34393f..69e4611 100644 --- a/src/engines/librex.rs +++ b/src/engines/librex.rs @@ -62,7 +62,7 @@ impl SearchEngine for LibreX { user_agent: &str, client: &Client, _safe_search: u8, - ) -> Result, EngineError> { + ) -> Result, EngineError> { // Page number can be missing or empty string and so appropriate handling is required // so that upstream server recieves valid page number. let url: String = format!( diff --git a/src/engines/mojeek.rs b/src/engines/mojeek.rs index 3f7fbb1..7718c59 100644 --- a/src/engines/mojeek.rs +++ b/src/engines/mojeek.rs @@ -47,7 +47,7 @@ impl SearchEngine for Mojeek { user_agent: &str, client: &Client, safe_search: u8, - ) -> Result, EngineError> { + ) -> Result, EngineError> { // Mojeek uses `start results from this number` convention // So, for 10 results per page, page 0 starts at 1, page 1 // starts at 11, and so on. 
diff --git a/src/engines/search_result_parser.rs b/src/engines/search_result_parser.rs index 0512bdd..8c16b65 100644 --- a/src/engines/search_result_parser.rs +++ b/src/engines/search_result_parser.rs @@ -1,5 +1,4 @@ //! This modules provides helper functionalities for parsing a html document into internal SearchResult. -use std::collections::HashMap; use crate::models::{aggregation_models::SearchResult, engine_models::EngineError}; use error_stack::{Report, Result}; @@ -47,7 +46,7 @@ impl SearchResultParser { &self, document: &Html, builder: impl Fn(&ElementRef<'_>, &ElementRef<'_>, &ElementRef<'_>) -> Option, - ) -> Result, EngineError> { + ) -> Result, EngineError> { let res = document .select(&self.results) .filter_map(|result| { diff --git a/src/engines/searx.rs b/src/engines/searx.rs index 9bb297c..324087c 100644 --- a/src/engines/searx.rs +++ b/src/engines/searx.rs @@ -43,7 +43,7 @@ impl SearchEngine for Searx { user_agent: &str, client: &Client, mut safe_search: u8, - ) -> Result, EngineError> { + ) -> Result, EngineError> { // Page number can be missing or empty string and so appropriate handling is required // so that upstream server recieves valid page number. if safe_search == 3 { diff --git a/src/engines/startpage.rs b/src/engines/startpage.rs index 540a0ce..97b7a40 100644 --- a/src/engines/startpage.rs +++ b/src/engines/startpage.rs @@ -47,7 +47,7 @@ impl SearchEngine for Startpage { user_agent: &str, client: &Client, _safe_search: u8, - ) -> Result, EngineError> { + ) -> Result, EngineError> { // Page number can be missing or empty string and so appropriate handling is required // so that upstream server recieves valid page number. 
let url: String = format!( diff --git a/src/models/engine_models.rs b/src/models/engine_models.rs index 4d56836..932afce 100644 --- a/src/models/engine_models.rs +++ b/src/models/engine_models.rs @@ -4,7 +4,7 @@ use super::aggregation_models::SearchResult; use error_stack::{Report, Result, ResultExt}; use reqwest::Client; -use std::{collections::HashMap, fmt}; +use std::fmt; /// A custom error type used for handle engine associated errors. #[derive(Debug)] @@ -147,7 +147,7 @@ pub trait SearchEngine: Sync + Send { user_agent: &str, client: &Client, safe_search: u8, - ) -> Result, EngineError>; + ) -> Result, EngineError>; } /// A named struct which stores the engine struct with the name of the associated engine. diff --git a/src/results/aggregator.rs b/src/results/aggregator.rs index e7fc090..67fff56 100644 --- a/src/results/aggregator.rs +++ b/src/results/aggregator.rs @@ -11,19 +11,18 @@ use error_stack::Report; use regex::Regex; use reqwest::{Client, ClientBuilder}; use std::time::{SystemTime, UNIX_EPOCH}; +use std::{fs::File, io::BufRead}; use std::{ - collections::HashMap, io::{BufReader, Read}, time::Duration, }; -use std::{fs::File, io::BufRead}; use tokio::task::JoinHandle; /// A constant for holding the prebuilt Client globally in the app. static CLIENT: std::sync::OnceLock = std::sync::OnceLock::new(); /// Aliases for long type annotations -type FutureVec = Vec, Report>>>; +type FutureVec = Vec, Report>>>; /// The function aggregates the scraped results from the user-selected upstream search engines. /// These engines can be chosen either from the user interface (UI) or from the configuration file. 
@@ -36,7 +35,7 @@ type FutureVec = Vec, Report = HashMap::new(); + let mut result_map: Vec<(String, SearchResult)> = Vec::new(); let mut engine_errors_info: Vec = Vec::new(); let mut handle_error = |error: &Report, engine_name: &'static str| { @@ -134,35 +133,27 @@ pub async fn aggregate( if result_map.is_empty() { match response { - Ok(results) => { - result_map = results.clone(); - } - Err(error) => { - handle_error(&error, engine); - } - } + Ok(results) => result_map = results, + Err(error) => handle_error(&error, engine), + }; continue; } match response { Ok(result) => { result.into_iter().for_each(|(key, value)| { - result_map - .entry(key) - .and_modify(|result| { - result.add_engines(engine); - }) - .or_insert_with(|| -> SearchResult { value }); + match result_map.iter_mut().find(|(key_s, _)| key_s == &key) { + Some(found) => found.1.add_engines(engine), + None => result_map.push((key, value)), + }; }); } - Err(error) => { - handle_error(&error, engine); - } - } + Err(error) => handle_error(&error, engine), + }; } if safe_search >= 3 { - let mut blacklist_map: HashMap = HashMap::new(); + let mut blacklist_map: Vec<(String, SearchResult)> = Vec::new(); filter_with_lists( &mut result_map, &mut blacklist_map, @@ -178,7 +169,7 @@ pub async fn aggregate( drop(blacklist_map); } - let results: Vec = result_map.into_values().collect(); + let results: Vec = result_map.iter().map(|(_, value)| value.clone()).collect(); Ok(SearchResults::new(results, &engine_errors_info)) } @@ -187,16 +178,16 @@ pub async fn aggregate( /// /// # Arguments /// -/// * `map_to_be_filtered` - A mutable reference to a `HashMap` of search results to filter, where the filtered results will be removed from. -/// * `resultant_map` - A mutable reference to a `HashMap` to hold the filtered results. +/// * `map_to_be_filtered` - A mutable reference to a `Vec` of search results to filter, where the filtered results will be removed from. 
+/// * `resultant_map` - A mutable reference to a `Vec` to hold the filtered results. /// * `file_path` - A `&str` representing the path to a file containing regex patterns to use for filtering. /// /// # Errors /// /// Returns an error if the file at `file_path` cannot be opened or read, or if a regex pattern is invalid. pub fn filter_with_lists( - map_to_be_filtered: &mut HashMap, - resultant_map: &mut HashMap, + map_to_be_filtered: &mut Vec<(String, SearchResult)>, + resultant_map: &mut Vec<(String, SearchResult)>, file_path: &str, ) -> Result<(), Box> { let mut reader = BufReader::new(File::open(file_path)?); @@ -205,16 +196,13 @@ pub fn filter_with_lists( let re = Regex::new(line?.trim())?; // Iterate over each search result in the map and check if it matches the regex pattern - for (url, search_result) in map_to_be_filtered.clone().into_iter() { + for (index, (url, search_result)) in map_to_be_filtered.clone().into_iter().enumerate().rev() { if re.is_match(&url.to_lowercase()) || re.is_match(&search_result.title.to_lowercase()) || re.is_match(&search_result.description.to_lowercase()) { // If the search result matches the regex pattern, move it from the original map to the resultant map - resultant_map.insert( - url.to_owned(), - map_to_be_filtered.remove(&url.to_owned()).unwrap(), - ); + resultant_map.push(map_to_be_filtered.remove(index)); } } } @@ -226,15 +214,14 @@ mod tests { use super::*; use smallvec::smallvec; - use std::collections::HashMap; use std::io::Write; use tempfile::NamedTempFile; #[test] fn test_filter_with_lists() -> Result<(), Box> { // Create a map of search results to filter - let mut map_to_be_filtered = HashMap::new(); - map_to_be_filtered.insert( + let mut map_to_be_filtered = Vec::new(); + map_to_be_filtered.push(( "https://www.example.com".to_owned(), SearchResult { title: "Example Domain".to_owned(), url: "https://www.example.com".to_owned(), description: "This domain is for use in illustrative examples in documents." .to_owned(), engine: smallvec!["Google".to_owned(), "Bing".to_owned()], }, - 
); - map_to_be_filtered.insert( + )); + map_to_be_filtered.push(( "https://www.rust-lang.org/".to_owned(), SearchResult { title: "Rust Programming Language".to_owned(), url: "https://www.rust-lang.org/".to_owned(), description: "A systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.".to_owned(), engine: smallvec!["Google".to_owned(), "DuckDuckGo".to_owned()], - }, + },) ); // Create a temporary file with regex patterns @@ -260,7 +247,7 @@ mod tests { writeln!(file, "rust")?; file.flush()?; - let mut resultant_map = HashMap::new(); + let mut resultant_map = Vec::new(); filter_with_lists( &mut map_to_be_filtered, &mut resultant_map, @@ -268,8 +255,12 @@ mod tests { )?; assert_eq!(resultant_map.len(), 2); - assert!(resultant_map.contains_key("https://www.example.com")); - assert!(resultant_map.contains_key("https://www.rust-lang.org/")); + assert!(resultant_map + .iter() + .any(|(key, _)| key == "https://www.example.com")); + assert!(resultant_map + .iter() + .any(|(key, _)| key == "https://www.rust-lang.org/")); assert_eq!(map_to_be_filtered.len(), 0); Ok(()) @@ -277,8 +268,8 @@ mod tests { #[test] fn test_filter_with_lists_wildcard() -> Result<(), Box> { - let mut map_to_be_filtered = HashMap::new(); - map_to_be_filtered.insert( + let mut map_to_be_filtered = Vec::new(); + map_to_be_filtered.push(( "https://www.example.com".to_owned(), SearchResult { title: "Example Domain".to_owned(), @@ -287,8 +278,8 @@ mod tests { .to_owned(), engine: smallvec!["Google".to_owned(), "Bing".to_owned()], }, - ); - map_to_be_filtered.insert( + )); + map_to_be_filtered.push(( "https://www.rust-lang.org/".to_owned(), SearchResult { title: "Rust Programming Language".to_owned(), @@ -296,14 +287,14 @@ mod tests { description: "A systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.".to_owned(), engine: smallvec!["Google".to_owned(), "DuckDuckGo".to_owned()], }, - ); + )); // Create a 
temporary file with a regex pattern containing a wildcard let mut file = NamedTempFile::new()?; writeln!(file, "ex.*le")?; file.flush()?; - let mut resultant_map = HashMap::new(); + let mut resultant_map = Vec::new(); filter_with_lists( &mut map_to_be_filtered, @@ -312,18 +303,22 @@ mod tests { )?; assert_eq!(resultant_map.len(), 1); - assert!(resultant_map.contains_key("https://www.example.com")); + assert!(resultant_map + .iter() + .any(|(key, _)| key == "https://www.example.com")); assert_eq!(map_to_be_filtered.len(), 1); - assert!(map_to_be_filtered.contains_key("https://www.rust-lang.org/")); + assert!(map_to_be_filtered + .iter() + .any(|(key, _)| key == "https://www.rust-lang.org/")); Ok(()) } #[test] fn test_filter_with_lists_file_not_found() { - let mut map_to_be_filtered = HashMap::new(); + let mut map_to_be_filtered = Vec::new(); - let mut resultant_map = HashMap::new(); + let mut resultant_map = Vec::new(); // Call the `filter_with_lists` function with a non-existent file path let result = filter_with_lists( @@ -337,8 +332,8 @@ mod tests { #[test] fn test_filter_with_lists_invalid_regex() { - let mut map_to_be_filtered = HashMap::new(); - map_to_be_filtered.insert( + let mut map_to_be_filtered = Vec::new(); + map_to_be_filtered.push(( "https://www.example.com".to_owned(), SearchResult { title: "Example Domain".to_owned(), @@ -347,9 +342,9 @@ mod tests { .to_owned(), engine: smallvec!["Google".to_owned(), "Bing".to_owned()], }, - ); + )); - let mut resultant_map = HashMap::new(); + let mut resultant_map = Vec::new(); // Create a temporary file with an invalid regex pattern let mut file = NamedTempFile::new().unwrap();