mirror of
https://github.com/neon-mmd/websurfx.git
synced 2024-11-22 05:58:21 -05:00
✨ feat: add code to filter aggregated search results using lists (#163)
This commit is contained in:
parent
9d2fb6c946
commit
4f28711218
@ -1,18 +1,22 @@
|
|||||||
//! This module provides the functionality to scrape and gather all the results from the upstream
|
//! This module provides the functionality to scrape and gather all the results from the upstream
|
||||||
//! search engines and then removes duplicate results.
|
//! search engines and then removes duplicate results.
|
||||||
|
|
||||||
use std::{collections::HashMap, time::Duration};
|
use std::{collections::HashMap, io::BufReader, time::Duration};
|
||||||
|
|
||||||
use error_stack::Report;
|
|
||||||
use rand::Rng;
|
|
||||||
use tokio::task::JoinHandle;
|
|
||||||
|
|
||||||
use super::{
|
use super::{
|
||||||
aggregation_models::{EngineErrorInfo, SearchResult, SearchResults},
|
aggregation_models::{EngineErrorInfo, SearchResult, SearchResults},
|
||||||
user_agent::random_user_agent,
|
user_agent::random_user_agent,
|
||||||
};
|
};
|
||||||
|
use error_stack::Report;
|
||||||
|
use rand::Rng;
|
||||||
|
use regex::Regex;
|
||||||
|
use std::{fs::File, io::BufRead};
|
||||||
|
use tokio::task::JoinHandle;
|
||||||
|
|
||||||
use crate::engines::engine_models::{EngineError, EngineHandler};
|
use crate::{
|
||||||
|
engines::engine_models::{EngineError, EngineHandler},
|
||||||
|
handler::paths::{file_path, FileType},
|
||||||
|
};
|
||||||
|
|
||||||
/// Aliases for long type annotations
|
/// Aliases for long type annotations
|
||||||
type FutureVec = Vec<JoinHandle<Result<HashMap<String, SearchResult>, Report<EngineError>>>>;
|
type FutureVec = Vec<JoinHandle<Result<HashMap<String, SearchResult>, Report<EngineError>>>>;
|
||||||
@ -106,7 +110,7 @@ pub async fn aggregate(
|
|||||||
log::error!("Engine Error: {:?}", error);
|
log::error!("Engine Error: {:?}", error);
|
||||||
engine_errors_info.push(EngineErrorInfo::new(
|
engine_errors_info.push(EngineErrorInfo::new(
|
||||||
error.downcast_ref::<EngineError>().unwrap(),
|
error.downcast_ref::<EngineError>().unwrap(),
|
||||||
engine_name.to_string(),
|
engine_name,
|
||||||
));
|
));
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -143,7 +147,22 @@ pub async fn aggregate(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let results = result_map.into_values().collect();
|
let mut blacklist_map: HashMap<String, SearchResult> = HashMap::new();
|
||||||
|
filter_with_lists(
|
||||||
|
&mut result_map,
|
||||||
|
&mut blacklist_map,
|
||||||
|
&file_path(FileType::BlockList)?,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
filter_with_lists(
|
||||||
|
&mut blacklist_map,
|
||||||
|
&mut result_map,
|
||||||
|
&file_path(FileType::AllowList)?,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
drop(blacklist_map);
|
||||||
|
|
||||||
|
let results: Vec<SearchResult> = result_map.into_values().collect();
|
||||||
|
|
||||||
Ok(SearchResults::new(
|
Ok(SearchResults::new(
|
||||||
results,
|
results,
|
||||||
@ -151,3 +170,23 @@ pub async fn aggregate(
|
|||||||
engine_errors_info,
|
engine_errors_info,
|
||||||
))
|
))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn filter_with_lists(
|
||||||
|
map_to_be_filtered: &mut HashMap<String, SearchResult>,
|
||||||
|
resultant_map: &mut HashMap<String, SearchResult>,
|
||||||
|
file_path: &str,
|
||||||
|
) -> Result<(), Box<dyn std::error::Error>> {
|
||||||
|
for (url, search_result) in map_to_be_filtered.clone().into_iter() {
|
||||||
|
let reader = BufReader::new(File::open(file_path)?);
|
||||||
|
for line in reader.lines() {
|
||||||
|
let re = Regex::new(&line?)?;
|
||||||
|
if re.is_match(&url.to_lowercase())
|
||||||
|
|| re.is_match(&search_result.title.to_lowercase())
|
||||||
|
|| re.is_match(&search_result.description.to_lowercase())
|
||||||
|
{
|
||||||
|
resultant_map.insert(url.clone(), map_to_be_filtered.remove(&url).unwrap());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user