0
0
mirror of https://github.com/neon-mmd/websurfx.git synced 2024-11-22 14:08:23 -05:00

feat: add code to filter aggregated search results using lists (#163)

This commit is contained in:
neon_arch 2023-08-22 19:16:37 +03:00
parent 9d2fb6c946
commit 4f28711218

View File

@ -1,18 +1,22 @@
//! This module provides the functionality to scrape and gathers all the results from the upstream //! This module provides the functionality to scrape and gathers all the results from the upstream
//! search engines and then removes duplicate results. //! search engines and then removes duplicate results.
use std::{collections::HashMap, time::Duration}; use std::{collections::HashMap, io::BufReader, time::Duration};
use error_stack::Report;
use rand::Rng;
use tokio::task::JoinHandle;
use super::{ use super::{
aggregation_models::{EngineErrorInfo, SearchResult, SearchResults}, aggregation_models::{EngineErrorInfo, SearchResult, SearchResults},
user_agent::random_user_agent, user_agent::random_user_agent,
}; };
use error_stack::Report;
use rand::Rng;
use regex::Regex;
use std::{fs::File, io::BufRead};
use tokio::task::JoinHandle;
use crate::engines::engine_models::{EngineError, EngineHandler}; use crate::{
engines::engine_models::{EngineError, EngineHandler},
handler::paths::{file_path, FileType},
};
/// Aliases for long type annotations /// Aliases for long type annotations
type FutureVec = Vec<JoinHandle<Result<HashMap<String, SearchResult>, Report<EngineError>>>>; type FutureVec = Vec<JoinHandle<Result<HashMap<String, SearchResult>, Report<EngineError>>>>;
@ -106,7 +110,7 @@ pub async fn aggregate(
log::error!("Engine Error: {:?}", error); log::error!("Engine Error: {:?}", error);
engine_errors_info.push(EngineErrorInfo::new( engine_errors_info.push(EngineErrorInfo::new(
error.downcast_ref::<EngineError>().unwrap(), error.downcast_ref::<EngineError>().unwrap(),
engine_name.to_string(), engine_name,
)); ));
}; };
@ -143,7 +147,22 @@ pub async fn aggregate(
} }
} }
let results = result_map.into_values().collect(); let mut blacklist_map: HashMap<String, SearchResult> = HashMap::new();
filter_with_lists(
&mut result_map,
&mut blacklist_map,
&file_path(FileType::BlockList)?,
)?;
filter_with_lists(
&mut blacklist_map,
&mut result_map,
&file_path(FileType::AllowList)?,
)?;
drop(blacklist_map);
let results: Vec<SearchResult> = result_map.into_values().collect();
Ok(SearchResults::new( Ok(SearchResults::new(
results, results,
@ -151,3 +170,23 @@ pub async fn aggregate(
engine_errors_info, engine_errors_info,
)) ))
} }
fn filter_with_lists(
map_to_be_filtered: &mut HashMap<String, SearchResult>,
resultant_map: &mut HashMap<String, SearchResult>,
file_path: &str,
) -> Result<(), Box<dyn std::error::Error>> {
for (url, search_result) in map_to_be_filtered.clone().into_iter() {
let reader = BufReader::new(File::open(file_path)?);
for line in reader.lines() {
let re = Regex::new(&line?)?;
if re.is_match(&url.to_lowercase())
|| re.is_match(&search_result.title.to_lowercase())
|| re.is_match(&search_result.description.to_lowercase())
{
resultant_map.insert(url.clone(), map_to_be_filtered.remove(&url).unwrap());
}
}
}
Ok(())
}