mirror of
https://github.com/neon-mmd/websurfx.git
synced 2024-12-22 20:38:22 -05:00
✨ feat: add code to filter aggregated search results using lists (#163)
This commit is contained in:
parent
9d2fb6c946
commit
4f28711218
@ -1,18 +1,22 @@
|
||||
//! This module provides the functionality to scrape and gather all the results from the upstream
|
||||
//! search engines and then removes duplicate results.
|
||||
|
||||
use std::{collections::HashMap, time::Duration};
|
||||
|
||||
use error_stack::Report;
|
||||
use rand::Rng;
|
||||
use tokio::task::JoinHandle;
|
||||
use std::{collections::HashMap, io::BufReader, time::Duration};
|
||||
|
||||
use super::{
|
||||
aggregation_models::{EngineErrorInfo, SearchResult, SearchResults},
|
||||
user_agent::random_user_agent,
|
||||
};
|
||||
use error_stack::Report;
|
||||
use rand::Rng;
|
||||
use regex::Regex;
|
||||
use std::{fs::File, io::BufRead};
|
||||
use tokio::task::JoinHandle;
|
||||
|
||||
use crate::engines::engine_models::{EngineError, EngineHandler};
|
||||
use crate::{
|
||||
engines::engine_models::{EngineError, EngineHandler},
|
||||
handler::paths::{file_path, FileType},
|
||||
};
|
||||
|
||||
/// Aliases for long type annotations
|
||||
type FutureVec = Vec<JoinHandle<Result<HashMap<String, SearchResult>, Report<EngineError>>>>;
|
||||
@ -106,7 +110,7 @@ pub async fn aggregate(
|
||||
log::error!("Engine Error: {:?}", error);
|
||||
engine_errors_info.push(EngineErrorInfo::new(
|
||||
error.downcast_ref::<EngineError>().unwrap(),
|
||||
engine_name.to_string(),
|
||||
engine_name,
|
||||
));
|
||||
};
|
||||
|
||||
@ -143,7 +147,22 @@ pub async fn aggregate(
|
||||
}
|
||||
}
|
||||
|
||||
let results = result_map.into_values().collect();
|
||||
let mut blacklist_map: HashMap<String, SearchResult> = HashMap::new();
|
||||
filter_with_lists(
|
||||
&mut result_map,
|
||||
&mut blacklist_map,
|
||||
&file_path(FileType::BlockList)?,
|
||||
)?;
|
||||
|
||||
filter_with_lists(
|
||||
&mut blacklist_map,
|
||||
&mut result_map,
|
||||
&file_path(FileType::AllowList)?,
|
||||
)?;
|
||||
|
||||
drop(blacklist_map);
|
||||
|
||||
let results: Vec<SearchResult> = result_map.into_values().collect();
|
||||
|
||||
Ok(SearchResults::new(
|
||||
results,
|
||||
@ -151,3 +170,23 @@ pub async fn aggregate(
|
||||
engine_errors_info,
|
||||
))
|
||||
}
|
||||
|
||||
fn filter_with_lists(
|
||||
map_to_be_filtered: &mut HashMap<String, SearchResult>,
|
||||
resultant_map: &mut HashMap<String, SearchResult>,
|
||||
file_path: &str,
|
||||
) -> Result<(), Box<dyn std::error::Error>> {
|
||||
for (url, search_result) in map_to_be_filtered.clone().into_iter() {
|
||||
let reader = BufReader::new(File::open(file_path)?);
|
||||
for line in reader.lines() {
|
||||
let re = Regex::new(&line?)?;
|
||||
if re.is_match(&url.to_lowercase())
|
||||
|| re.is_match(&search_result.title.to_lowercase())
|
||||
|| re.is_match(&search_result.description.to_lowercase())
|
||||
{
|
||||
resultant_map.insert(url.clone(), map_to_be_filtered.remove(&url).unwrap());
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user