0
0
mirror of https://github.com/neon-mmd/websurfx.git synced 2024-11-23 06:28:23 -05:00

perf: replace hashmaps with vectors for fetching & aggregating results (#486)

- replace hashmaps with vectors for fetching, collecting & aggregating results, as a vector tends to be a contiguous & cache-efficient data structure.
- refactor & redesign algorithms for fetching & aggregating results
  centered around vectors in aggregate function.
This commit is contained in:
neon_arch 2024-02-06 22:28:19 +03:00
parent 1a7675f779
commit 52f27655b8
10 changed files with 61 additions and 67 deletions

View File

@ -48,7 +48,7 @@ impl SearchEngine for Bing {
user_agent: &str, user_agent: &str,
client: &Client, client: &Client,
_safe_search: u8, _safe_search: u8,
) -> Result<HashMap<String, SearchResult>, EngineError> { ) -> Result<Vec<(String, SearchResult)>, EngineError> {
// Bing uses `start results from this number` convention // Bing uses `start results from this number` convention
// So, for 10 results per page, page 0 starts at 1, page 1 // So, for 10 results per page, page 0 starts at 1, page 1
// starts at 11, and so on. // starts at 11, and so on.

View File

@ -44,7 +44,7 @@ impl SearchEngine for Brave {
user_agent: &str, user_agent: &str,
client: &Client, client: &Client,
safe_search: u8, safe_search: u8,
) -> Result<HashMap<String, SearchResult>, EngineError> { ) -> Result<Vec<(String, SearchResult)>, EngineError> {
let url = format!("https://search.brave.com/search?q={query}&offset={page}"); let url = format!("https://search.brave.com/search?q={query}&offset={page}");
let safe_search_level = match safe_search { let safe_search_level = match safe_search {

View File

@ -47,7 +47,7 @@ impl SearchEngine for DuckDuckGo {
user_agent: &str, user_agent: &str,
client: &Client, client: &Client,
_safe_search: u8, _safe_search: u8,
) -> Result<HashMap<String, SearchResult>, EngineError> { ) -> Result<Vec<(String, SearchResult)>, EngineError> {
// Page number can be missing or empty string and so appropriate handling is required // Page number can be missing or empty string and so appropriate handling is required
// so that upstream server recieves valid page number. // so that upstream server recieves valid page number.
let url: String = match page { let url: String = match page {

View File

@ -62,7 +62,7 @@ impl SearchEngine for LibreX {
user_agent: &str, user_agent: &str,
client: &Client, client: &Client,
_safe_search: u8, _safe_search: u8,
) -> Result<HashMap<String, SearchResult>, EngineError> { ) -> Result<Vec<(String, SearchResult)>, EngineError> {
// Page number can be missing or empty string and so appropriate handling is required // Page number can be missing or empty string and so appropriate handling is required
// so that upstream server recieves valid page number. // so that upstream server recieves valid page number.
let url: String = format!( let url: String = format!(

View File

@ -47,7 +47,7 @@ impl SearchEngine for Mojeek {
user_agent: &str, user_agent: &str,
client: &Client, client: &Client,
safe_search: u8, safe_search: u8,
) -> Result<HashMap<String, SearchResult>, EngineError> { ) -> Result<Vec<(String, SearchResult)>, EngineError> {
// Mojeek uses `start results from this number` convention // Mojeek uses `start results from this number` convention
// So, for 10 results per page, page 0 starts at 1, page 1 // So, for 10 results per page, page 0 starts at 1, page 1
// starts at 11, and so on. // starts at 11, and so on.

View File

@ -1,5 +1,4 @@
//! This modules provides helper functionalities for parsing a html document into internal SearchResult. //! This modules provides helper functionalities for parsing a html document into internal SearchResult.
use std::collections::HashMap;
use crate::models::{aggregation_models::SearchResult, engine_models::EngineError}; use crate::models::{aggregation_models::SearchResult, engine_models::EngineError};
use error_stack::{Report, Result}; use error_stack::{Report, Result};
@ -47,7 +46,7 @@ impl SearchResultParser {
&self, &self,
document: &Html, document: &Html,
builder: impl Fn(&ElementRef<'_>, &ElementRef<'_>, &ElementRef<'_>) -> Option<SearchResult>, builder: impl Fn(&ElementRef<'_>, &ElementRef<'_>, &ElementRef<'_>) -> Option<SearchResult>,
) -> Result<HashMap<String, SearchResult>, EngineError> { ) -> Result<Vec<(String, SearchResult)>, EngineError> {
let res = document let res = document
.select(&self.results) .select(&self.results)
.filter_map(|result| { .filter_map(|result| {

View File

@ -43,7 +43,7 @@ impl SearchEngine for Searx {
user_agent: &str, user_agent: &str,
client: &Client, client: &Client,
mut safe_search: u8, mut safe_search: u8,
) -> Result<HashMap<String, SearchResult>, EngineError> { ) -> Result<Vec<(String, SearchResult)>, EngineError> {
// Page number can be missing or empty string and so appropriate handling is required // Page number can be missing or empty string and so appropriate handling is required
// so that upstream server recieves valid page number. // so that upstream server recieves valid page number.
if safe_search == 3 { if safe_search == 3 {

View File

@ -47,7 +47,7 @@ impl SearchEngine for Startpage {
user_agent: &str, user_agent: &str,
client: &Client, client: &Client,
_safe_search: u8, _safe_search: u8,
) -> Result<HashMap<String, SearchResult>, EngineError> { ) -> Result<Vec<(String, SearchResult)>, EngineError> {
// Page number can be missing or empty string and so appropriate handling is required // Page number can be missing or empty string and so appropriate handling is required
// so that upstream server recieves valid page number. // so that upstream server recieves valid page number.
let url: String = format!( let url: String = format!(

View File

@ -4,7 +4,7 @@
use super::aggregation_models::SearchResult; use super::aggregation_models::SearchResult;
use error_stack::{Report, Result, ResultExt}; use error_stack::{Report, Result, ResultExt};
use reqwest::Client; use reqwest::Client;
use std::{collections::HashMap, fmt}; use std::fmt;
/// A custom error type used for handle engine associated errors. /// A custom error type used for handle engine associated errors.
#[derive(Debug)] #[derive(Debug)]
@ -147,7 +147,7 @@ pub trait SearchEngine: Sync + Send {
user_agent: &str, user_agent: &str,
client: &Client, client: &Client,
safe_search: u8, safe_search: u8,
) -> Result<HashMap<String, SearchResult>, EngineError>; ) -> Result<Vec<(String, SearchResult)>, EngineError>;
} }
/// A named struct which stores the engine struct with the name of the associated engine. /// A named struct which stores the engine struct with the name of the associated engine.

View File

@ -11,19 +11,18 @@ use error_stack::Report;
use regex::Regex; use regex::Regex;
use reqwest::{Client, ClientBuilder}; use reqwest::{Client, ClientBuilder};
use std::time::{SystemTime, UNIX_EPOCH}; use std::time::{SystemTime, UNIX_EPOCH};
use std::{fs::File, io::BufRead};
use std::{ use std::{
collections::HashMap,
io::{BufReader, Read}, io::{BufReader, Read},
time::Duration, time::Duration,
}; };
use std::{fs::File, io::BufRead};
use tokio::task::JoinHandle; use tokio::task::JoinHandle;
/// A constant for holding the prebuilt Client globally in the app. /// A constant for holding the prebuilt Client globally in the app.
static CLIENT: std::sync::OnceLock<Client> = std::sync::OnceLock::new(); static CLIENT: std::sync::OnceLock<Client> = std::sync::OnceLock::new();
/// Aliases for long type annotations /// Aliases for long type annotations
type FutureVec = Vec<JoinHandle<Result<HashMap<String, SearchResult>, Report<EngineError>>>>; type FutureVec = Vec<JoinHandle<Result<Vec<(String, SearchResult)>, Report<EngineError>>>>;
/// The function aggregates the scraped results from the user-selected upstream search engines. /// The function aggregates the scraped results from the user-selected upstream search engines.
/// These engines can be chosen either from the user interface (UI) or from the configuration file. /// These engines can be chosen either from the user interface (UI) or from the configuration file.
@ -36,7 +35,7 @@ type FutureVec = Vec<JoinHandle<Result<HashMap<String, SearchResult>, Report<Eng
/// ///
/// Additionally, the function eliminates duplicate results. If two results are identified as coming from /// Additionally, the function eliminates duplicate results. If two results are identified as coming from
/// multiple engines, their names are combined to indicate that the results were fetched from these upstream /// multiple engines, their names are combined to indicate that the results were fetched from these upstream
/// engines. After this, all the data in the `HashMap` is removed and placed into a struct that contains all /// engines. After this, all the data in the `Vec` is removed and placed into a struct that contains all
/// the aggregated results in a vector. Furthermore, the query used is also added to the struct. This step is /// the aggregated results in a vector. Furthermore, the query used is also added to the struct. This step is
/// necessary to ensure that the search bar in the search remains populated even when searched from the query URL. /// necessary to ensure that the search bar in the search remains populated even when searched from the query URL.
/// ///
@ -117,7 +116,7 @@ pub async fn aggregate(
} }
// aggregate search results, removing duplicates and handling errors the upstream engines returned // aggregate search results, removing duplicates and handling errors the upstream engines returned
let mut result_map: HashMap<String, SearchResult> = HashMap::new(); let mut result_map: Vec<(String, SearchResult)> = Vec::new();
let mut engine_errors_info: Vec<EngineErrorInfo> = Vec::new(); let mut engine_errors_info: Vec<EngineErrorInfo> = Vec::new();
let mut handle_error = |error: &Report<EngineError>, engine_name: &'static str| { let mut handle_error = |error: &Report<EngineError>, engine_name: &'static str| {
@ -134,35 +133,27 @@ pub async fn aggregate(
if result_map.is_empty() { if result_map.is_empty() {
match response { match response {
Ok(results) => { Ok(results) => result_map = results,
result_map = results.clone(); Err(error) => handle_error(&error, engine),
} };
Err(error) => {
handle_error(&error, engine);
}
}
continue; continue;
} }
match response { match response {
Ok(result) => { Ok(result) => {
result.into_iter().for_each(|(key, value)| { result.into_iter().for_each(|(key, value)| {
result_map match result_map.iter().find(|(key_s, _)| key_s == &key) {
.entry(key) Some(value) => value.1.to_owned().add_engines(engine),
.and_modify(|result| { None => result_map.push((key, value)),
result.add_engines(engine); };
})
.or_insert_with(|| -> SearchResult { value });
}); });
} }
Err(error) => { Err(error) => handle_error(&error, engine),
handle_error(&error, engine); };
}
}
} }
if safe_search >= 3 { if safe_search >= 3 {
let mut blacklist_map: HashMap<String, SearchResult> = HashMap::new(); let mut blacklist_map: Vec<(String, SearchResult)> = Vec::new();
filter_with_lists( filter_with_lists(
&mut result_map, &mut result_map,
&mut blacklist_map, &mut blacklist_map,
@ -178,7 +169,7 @@ pub async fn aggregate(
drop(blacklist_map); drop(blacklist_map);
} }
let results: Vec<SearchResult> = result_map.into_values().collect(); let results: Vec<SearchResult> = result_map.iter().map(|(_, value)| value.clone()).collect();
Ok(SearchResults::new(results, &engine_errors_info)) Ok(SearchResults::new(results, &engine_errors_info))
} }
@ -187,16 +178,16 @@ pub async fn aggregate(
/// ///
/// # Arguments /// # Arguments
/// ///
/// * `map_to_be_filtered` - A mutable reference to a `HashMap` of search results to filter, where the filtered results will be removed from. /// * `map_to_be_filtered` - A mutable reference to a `Vec` of search results to filter, where the filtered results will be removed from.
/// * `resultant_map` - A mutable reference to a `HashMap` to hold the filtered results. /// * `resultant_map` - A mutable reference to a `Vec` to hold the filtered results.
/// * `file_path` - A `&str` representing the path to a file containing regex patterns to use for filtering. /// * `file_path` - A `&str` representing the path to a file containing regex patterns to use for filtering.
/// ///
/// # Errors /// # Errors
/// ///
/// Returns an error if the file at `file_path` cannot be opened or read, or if a regex pattern is invalid. /// Returns an error if the file at `file_path` cannot be opened or read, or if a regex pattern is invalid.
pub fn filter_with_lists( pub fn filter_with_lists(
map_to_be_filtered: &mut HashMap<String, SearchResult>, map_to_be_filtered: &mut Vec<(String, SearchResult)>,
resultant_map: &mut HashMap<String, SearchResult>, resultant_map: &mut Vec<(String, SearchResult)>,
file_path: &str, file_path: &str,
) -> Result<(), Box<dyn std::error::Error>> { ) -> Result<(), Box<dyn std::error::Error>> {
let mut reader = BufReader::new(File::open(file_path)?); let mut reader = BufReader::new(File::open(file_path)?);
@ -205,16 +196,13 @@ pub fn filter_with_lists(
let re = Regex::new(line?.trim())?; let re = Regex::new(line?.trim())?;
// Iterate over each search result in the map and check if it matches the regex pattern // Iterate over each search result in the map and check if it matches the regex pattern
for (url, search_result) in map_to_be_filtered.clone().into_iter() { for (index, (url, search_result)) in map_to_be_filtered.clone().into_iter().enumerate() {
if re.is_match(&url.to_lowercase()) if re.is_match(&url.to_lowercase())
|| re.is_match(&search_result.title.to_lowercase()) || re.is_match(&search_result.title.to_lowercase())
|| re.is_match(&search_result.description.to_lowercase()) || re.is_match(&search_result.description.to_lowercase())
{ {
// If the search result matches the regex pattern, move it from the original map to the resultant map // If the search result matches the regex pattern, move it from the original map to the resultant map
resultant_map.insert( resultant_map.push(map_to_be_filtered.remove(index));
url.to_owned(),
map_to_be_filtered.remove(&url.to_owned()).unwrap(),
);
} }
} }
} }
@ -226,15 +214,14 @@ pub fn filter_with_lists(
mod tests { mod tests {
use super::*; use super::*;
use smallvec::smallvec; use smallvec::smallvec;
use std::collections::HashMap;
use std::io::Write; use std::io::Write;
use tempfile::NamedTempFile; use tempfile::NamedTempFile;
#[test] #[test]
fn test_filter_with_lists() -> Result<(), Box<dyn std::error::Error>> { fn test_filter_with_lists() -> Result<(), Box<dyn std::error::Error>> {
// Create a map of search results to filter // Create a map of search results to filter
let mut map_to_be_filtered = HashMap::new(); let mut map_to_be_filtered = Vec::new();
map_to_be_filtered.insert( map_to_be_filtered.push((
"https://www.example.com".to_owned(), "https://www.example.com".to_owned(),
SearchResult { SearchResult {
title: "Example Domain".to_owned(), title: "Example Domain".to_owned(),
@ -243,15 +230,15 @@ mod tests {
.to_owned(), .to_owned(),
engine: smallvec!["Google".to_owned(), "Bing".to_owned()], engine: smallvec!["Google".to_owned(), "Bing".to_owned()],
}, },
); ));
map_to_be_filtered.insert( map_to_be_filtered.push((
"https://www.rust-lang.org/".to_owned(), "https://www.rust-lang.org/".to_owned(),
SearchResult { SearchResult {
title: "Rust Programming Language".to_owned(), title: "Rust Programming Language".to_owned(),
url: "https://www.rust-lang.org/".to_owned(), url: "https://www.rust-lang.org/".to_owned(),
description: "A systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.".to_owned(), description: "A systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.".to_owned(),
engine: smallvec!["Google".to_owned(), "DuckDuckGo".to_owned()], engine: smallvec!["Google".to_owned(), "DuckDuckGo".to_owned()],
}, },)
); );
// Create a temporary file with regex patterns // Create a temporary file with regex patterns
@ -260,7 +247,7 @@ mod tests {
writeln!(file, "rust")?; writeln!(file, "rust")?;
file.flush()?; file.flush()?;
let mut resultant_map = HashMap::new(); let mut resultant_map = Vec::new();
filter_with_lists( filter_with_lists(
&mut map_to_be_filtered, &mut map_to_be_filtered,
&mut resultant_map, &mut resultant_map,
@ -268,8 +255,12 @@ mod tests {
)?; )?;
assert_eq!(resultant_map.len(), 2); assert_eq!(resultant_map.len(), 2);
assert!(resultant_map.contains_key("https://www.example.com")); assert!(resultant_map
assert!(resultant_map.contains_key("https://www.rust-lang.org/")); .iter()
.any(|(key, _)| key == "https://www.example.com"));
assert!(resultant_map
.iter()
.any(|(key, _)| key == "https://www.rust-lang.org/"));
assert_eq!(map_to_be_filtered.len(), 0); assert_eq!(map_to_be_filtered.len(), 0);
Ok(()) Ok(())
@ -277,8 +268,8 @@ mod tests {
#[test] #[test]
fn test_filter_with_lists_wildcard() -> Result<(), Box<dyn std::error::Error>> { fn test_filter_with_lists_wildcard() -> Result<(), Box<dyn std::error::Error>> {
let mut map_to_be_filtered = HashMap::new(); let mut map_to_be_filtered = Vec::new();
map_to_be_filtered.insert( map_to_be_filtered.push((
"https://www.example.com".to_owned(), "https://www.example.com".to_owned(),
SearchResult { SearchResult {
title: "Example Domain".to_owned(), title: "Example Domain".to_owned(),
@ -287,8 +278,8 @@ mod tests {
.to_owned(), .to_owned(),
engine: smallvec!["Google".to_owned(), "Bing".to_owned()], engine: smallvec!["Google".to_owned(), "Bing".to_owned()],
}, },
); ));
map_to_be_filtered.insert( map_to_be_filtered.push((
"https://www.rust-lang.org/".to_owned(), "https://www.rust-lang.org/".to_owned(),
SearchResult { SearchResult {
title: "Rust Programming Language".to_owned(), title: "Rust Programming Language".to_owned(),
@ -296,14 +287,14 @@ mod tests {
description: "A systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.".to_owned(), description: "A systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.".to_owned(),
engine: smallvec!["Google".to_owned(), "DuckDuckGo".to_owned()], engine: smallvec!["Google".to_owned(), "DuckDuckGo".to_owned()],
}, },
); ));
// Create a temporary file with a regex pattern containing a wildcard // Create a temporary file with a regex pattern containing a wildcard
let mut file = NamedTempFile::new()?; let mut file = NamedTempFile::new()?;
writeln!(file, "ex.*le")?; writeln!(file, "ex.*le")?;
file.flush()?; file.flush()?;
let mut resultant_map = HashMap::new(); let mut resultant_map = Vec::new();
filter_with_lists( filter_with_lists(
&mut map_to_be_filtered, &mut map_to_be_filtered,
@ -312,18 +303,22 @@ mod tests {
)?; )?;
assert_eq!(resultant_map.len(), 1); assert_eq!(resultant_map.len(), 1);
assert!(resultant_map.contains_key("https://www.example.com")); assert!(resultant_map
.iter()
.any(|(key, _)| key == "https://www.example.com"));
assert_eq!(map_to_be_filtered.len(), 1); assert_eq!(map_to_be_filtered.len(), 1);
assert!(map_to_be_filtered.contains_key("https://www.rust-lang.org/")); assert!(map_to_be_filtered
.iter()
.any(|(key, _)| key == "https://www.rust-lang.org/"));
Ok(()) Ok(())
} }
#[test] #[test]
fn test_filter_with_lists_file_not_found() { fn test_filter_with_lists_file_not_found() {
let mut map_to_be_filtered = HashMap::new(); let mut map_to_be_filtered = Vec::new();
let mut resultant_map = HashMap::new(); let mut resultant_map = Vec::new();
// Call the `filter_with_lists` function with a non-existent file path // Call the `filter_with_lists` function with a non-existent file path
let result = filter_with_lists( let result = filter_with_lists(
@ -337,8 +332,8 @@ mod tests {
#[test] #[test]
fn test_filter_with_lists_invalid_regex() { fn test_filter_with_lists_invalid_regex() {
let mut map_to_be_filtered = HashMap::new(); let mut map_to_be_filtered = Vec::new();
map_to_be_filtered.insert( map_to_be_filtered.push((
"https://www.example.com".to_owned(), "https://www.example.com".to_owned(),
SearchResult { SearchResult {
title: "Example Domain".to_owned(), title: "Example Domain".to_owned(),
@ -347,9 +342,9 @@ mod tests {
.to_owned(), .to_owned(),
engine: smallvec!["Google".to_owned(), "Bing".to_owned()], engine: smallvec!["Google".to_owned(), "Bing".to_owned()],
}, },
); ));
let mut resultant_map = HashMap::new(); let mut resultant_map = Vec::new();
// Create a temporary file with an invalid regex pattern // Create a temporary file with an invalid regex pattern
let mut file = NamedTempFile::new().unwrap(); let mut file = NamedTempFile::new().unwrap();