diff --git a/src/config/parser.rs b/src/config/parser.rs
index c6ca37a..3b50622 100644
--- a/src/config/parser.rs
+++ b/src/config/parser.rs
@@ -34,7 +34,7 @@ pub struct Config {
     pub aggregator: AggregatorConfig,
     pub logging: bool,
     pub debug: bool,
-    pub upstream_search_engines: Vec<String>,
+    pub upstream_search_engines: Vec<crate::engines::engine_models::EngineHandler>,
     pub request_timeout: u8,
     pub threads: u8,
 }
@@ -107,6 +107,7 @@ impl Config {
                 .get::<_, HashMap<String, bool>>("upstream_search_engines")?
                 .into_iter()
                 .filter_map(|(key, value)| value.then_some(key))
+                .filter_map(|engine| crate::engines::engine_models::EngineHandler::new(&engine))
                 .collect(),
             request_timeout: globals.get::<_, u8>("request_timeout")?,
             threads,
diff --git a/src/engines/duckduckgo.rs b/src/engines/duckduckgo.rs
index 8958319..11b7d86 100644
--- a/src/engines/duckduckgo.rs
+++ b/src/engines/duckduckgo.rs
@@ -7,7 +7,7 @@ use std::collections::HashMap;
 use reqwest::header::{HeaderMap, CONTENT_TYPE, COOKIE, REFERER, USER_AGENT};
 use scraper::{Html, Selector};
 
-use crate::results::aggregation_models::RawSearchResult;
+use crate::results::aggregation_models::SearchResult;
 
 use super::engine_models::{EngineError, SearchEngine};
 
@@ -43,7 +43,7 @@ impl SearchEngine for DuckDuckGo {
         page: u32,
         user_agent: String,
         request_timeout: u8,
-    ) -> Result<HashMap<String, RawSearchResult>, EngineError> {
+    ) -> Result<HashMap<String, SearchResult>, EngineError> {
         // Page number can be missing or an empty string, so appropriate handling is required
         // so that the upstream server receives a valid page number.
         let url: String = match page {
@@ -120,7 +120,7 @@ impl SearchEngine for DuckDuckGo {
         Ok(document
             .select(&results)
             .map(|result| {
-                RawSearchResult::new(
+                SearchResult::new(
                     result
                         .select(&result_title)
                         .next()
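With this change, engine selection is validated while the config is parsed: names read from the Lua table are kept only if they are enabled, then mapped through `EngineHandler::new`, so unknown names are dropped at load time instead of hitting the `panic!` that `aggregate` used to contain. A minimal, self-contained sketch of the `filter_map`/`then_some` idiom (the `globals` map here is a stand-in for the Lua table, not code from this patch):

```rust
use std::collections::HashMap;

fn main() {
    // Stand-in for the `HashMap<String, bool>` read from the config file.
    let globals: HashMap<String, bool> = HashMap::from([
        ("duckduckgo".to_string(), true),
        ("searx".to_string(), false),
    ]);

    // `value.then_some(key)` yields `Some(key)` only when `value` is true,
    // so this keeps just the enabled engine names.
    let enabled: Vec<String> = globals
        .into_iter()
        .filter_map(|(key, value)| value.then_some(key))
        .collect();

    // The real parser chains a second `filter_map` through
    // `EngineHandler::new`, silently discarding unknown names.
    println!("{enabled:?}"); // ["duckduckgo"]
}
```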
diff --git a/src/engines/engine_models.rs b/src/engines/engine_models.rs
index c6aa030..d33d13c 100644
--- a/src/engines/engine_models.rs
+++ b/src/engines/engine_models.rs
@@ -1,7 +1,7 @@
 //! This module provides the error enum to handle different errors associated while requesting data from
 //! the upstream search engines with the search query provided by the user.
 
-use crate::results::aggregation_models::RawSearchResult;
+use crate::results::aggregation_models::SearchResult;
 
 use error_stack::{IntoReport, Result, ResultExt};
 use std::{collections::HashMap, fmt, time::Duration};
@@ -45,7 +45,7 @@ impl error_stack::Context for EngineError {}
 
 /// A trait to define common behavior for all search engines.
 #[async_trait::async_trait]
-pub trait SearchEngine {
+pub trait SearchEngine: Sync + Send {
     async fn fetch_html_from_upstream(
         &self,
         url: String,
@@ -73,5 +73,37 @@ pub trait SearchEngine {
         page: u32,
         user_agent: String,
         request_timeout: u8,
-    ) -> Result<HashMap<String, RawSearchResult>, EngineError>;
+    ) -> Result<HashMap<String, SearchResult>, EngineError>;
+}
+
+pub struct EngineHandler {
+    engine: Box<dyn SearchEngine>,
+    name: &'static str,
+}
+
+impl Clone for EngineHandler {
+    fn clone(&self) -> Self {
+        Self::new(self.name).unwrap()
+    }
+}
+
+impl EngineHandler {
+    /// Parses an engine name into an engine handler; returns `None` if the engine is unknown.
+    pub fn new(engine_name: &str) -> Option<Self> {
+        let engine: (&'static str, Box<dyn SearchEngine>) =
+            match engine_name.to_lowercase().as_str() {
+                "duckduckgo" => ("duckduckgo", Box::new(super::duckduckgo::DuckDuckGo)),
+                "searx" => ("searx", Box::new(super::searx::Searx)),
+                _ => return None,
+            };
+
+        Some(Self {
+            engine: engine.1,
+            name: engine.0,
+        })
+    }
+
+    pub fn into_name_engine(self) -> (&'static str, Box<dyn SearchEngine>) {
+        (self.name, self.engine)
+    }
 }
diff --git a/src/engines/searx.rs b/src/engines/searx.rs
index 1caca17..4ad41f5 100644
--- a/src/engines/searx.rs
+++ b/src/engines/searx.rs
@@ -6,7 +6,7 @@ use reqwest::header::{HeaderMap, CONTENT_TYPE, COOKIE, REFERER, USER_AGENT};
 use scraper::{Html, Selector};
 use std::collections::HashMap;
 
-use crate::results::aggregation_models::RawSearchResult;
+use crate::results::aggregation_models::SearchResult;
 
 use super::engine_models::{EngineError, SearchEngine};
 use error_stack::{IntoReport, Report, Result, ResultExt};
@@ -42,7 +42,7 @@ impl SearchEngine for Searx {
         page: u32,
         user_agent: String,
         request_timeout: u8,
-    ) -> Result<HashMap<String, RawSearchResult>, EngineError> {
+    ) -> Result<HashMap<String, SearchResult>, EngineError> {
         // Page number can be missing or an empty string, so appropriate handling is required
         // so that the upstream server receives a valid page number.
         let url: String = match page {
@@ -111,7 +111,7 @@ impl SearchEngine for Searx {
         Ok(document
             .select(&results)
             .map(|result| {
-                RawSearchResult::new(
+                SearchResult::new(
                     result
                         .select(&result_title)
                         .next()
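`EngineHandler` pairs an engine's canonical name with a boxed `SearchEngine` trait object, so call sites no longer match on raw strings. A hedged sketch of how the new API composes (the `demo` function is illustrative and not part of the patch):

```rust
use crate::engines::engine_models::EngineHandler;

// Illustrative call site, assuming this crate's modules are in scope.
fn demo() {
    // Name matching is case-insensitive; unknown names return `None`.
    let handler = EngineHandler::new("DuckDuckGo").expect("known engine");

    // `aggregate` splits each handler back into its parts ...
    let (name, search_engine) = handler.into_name_engine();
    assert_eq!(name, "duckduckgo");

    // ... and moves the `Box<dyn SearchEngine>` into a spawned task, which
    // is why the trait now requires `Sync + Send`.
    drop(search_engine);
}
```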
diff --git a/src/results/aggregation_models.rs b/src/results/aggregation_models.rs
index 2f1b7b4..b4a780c 100644
--- a/src/results/aggregation_models.rs
+++ b/src/results/aggregation_models.rs
@@ -5,54 +5,6 @@ use serde::{Deserialize, Serialize};
 
 use crate::{config::parser_models::Style, engines::engine_models::EngineError};
 
-/// A named struct to store, serialize and deserializes the individual search result from all the
-/// scraped and aggregated search results from the upstream search engines.
-///
-/// # Fields
-///
-/// * `title` - The title of the search result.
-/// * `url` - The url to be displayed below the search result title in html.
-/// * `description` - The description of the search result.
-/// * `engine` - The names of the upstream engines from which this results were provided.
-#[derive(Serialize, Deserialize)]
-#[serde(rename_all = "camelCase")]
-pub struct SearchResult {
-    pub title: String,
-    pub url: String,
-    pub description: String,
-    pub engine: Vec<String>,
-}
-
-impl SearchResult {
-    /// Constructs a new `SearchResult` with the given arguments needed for the struct.
-    ///
-    /// # Arguments
-    ///
-    /// * `title` - The title of the search result.
-    /// * `visiting_url` - The url which is accessed when clicked on it
-    /// (href url in html in simple words).
-    /// * `url` - The url to be displayed below the search result title in html.
-    /// * `description` - The description of the search result.
-    /// * `engine` - The names of the upstream engines from which this results were provided.
-    pub fn new(title: String, url: String, description: String, engine: Vec<String>) -> Self {
-        SearchResult {
-            title,
-            url,
-            description,
-            engine,
-        }
-    }
-
-    pub fn from_raw(raw: RawSearchResult) -> Self {
-        SearchResult {
-            title: raw.title,
-            url: raw.url,
-            description: raw.description,
-            engine: raw.engine,
-        }
-    }
-}
-
 /// A named struct to store the raw scraped search results from the upstream
 /// search engines before aggregating them. It derives the Clone trait which is
 /// needed to write idiomatic rust using `Iterators`.
@@ -64,15 +16,16 @@ impl SearchResult {
 /// (href url in html in simple words).
 /// * `description` - The description of the search result.
 /// * `engine` - The names of the upstream engines from which these results were provided.
-#[derive(Clone)]
-pub struct RawSearchResult {
+#[derive(Clone, Serialize, Deserialize)]
+#[serde(rename_all = "camelCase")]
+pub struct SearchResult {
     pub title: String,
     pub url: String,
     pub description: String,
     pub engine: Vec<String>,
 }
 
-impl RawSearchResult {
+impl SearchResult {
     /// Constructs a new `RawSearchResult` with the given arguments needed for the struct.
     ///
     /// # Arguments
     ///
     /// * `title` - The title of the search result.
     /// * `visiting_url` - The url which is accessed when clicked on it
     /// (href url in html in simple words).
@@ -83,7 +36,7 @@ impl SearchResult {
     /// * `description` - The description of the search result.
     /// * `engine` - The names of the upstream engines from which these results were provided.
     pub fn new(title: String, url: String, description: String, engine: Vec<String>) -> Self {
-        RawSearchResult {
+        SearchResult {
             title,
             url,
             description,
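Deriving `Serialize`/`Deserialize` directly on the renamed `SearchResult` is what lets the old wrapper struct and its `from_raw` copy step disappear. A standalone sketch of the serialized shape (field values are invented; the field names here are single words, so `rename_all = "camelCase"` only becomes visible on multi-word fields):

```rust
use serde::{Deserialize, Serialize};

#[derive(Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct SearchResult {
    pub title: String,
    pub url: String,
    pub description: String,
    pub engine: Vec<String>,
}

fn main() -> Result<(), serde_json::Error> {
    let result = SearchResult {
        title: "Example Domain".into(),
        url: "https://example.org".into(),
        description: "An illustrative result.".into(),
        engine: vec!["duckduckgo".into(), "searx".into()],
    };

    // {"title":"Example Domain","url":"https://example.org",...}
    println!("{}", serde_json::to_string(&result)?);
    Ok(())
}
```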
diff --git a/src/results/aggregator.rs b/src/results/aggregator.rs
index 4d0c708..16586c0 100644
--- a/src/results/aggregator.rs
+++ b/src/results/aggregator.rs
@@ -8,18 +8,14 @@ use rand::Rng;
 use tokio::task::JoinHandle;
 
 use super::{
-    aggregation_models::{EngineErrorInfo, RawSearchResult, SearchResult, SearchResults},
+    aggregation_models::{EngineErrorInfo, SearchResult, SearchResults},
     user_agent::random_user_agent,
 };
-use crate::engines::{
-    duckduckgo,
-    engine_models::{EngineError, SearchEngine},
-    searx,
-};
+use crate::engines::engine_models::{EngineError, EngineHandler};
 
 /// Aliases for long type annotations
-type FutureVec = Vec<JoinHandle<Result<HashMap<String, RawSearchResult>, Report<EngineError>>>>;
+type FutureVec = Vec<JoinHandle<Result<HashMap<String, SearchResult>, Report<EngineError>>>>;
 
 /// The function aggregates the scraped results from the user-selected upstream search engines.
 /// These engines can be chosen either from the user interface (UI) or from the configuration file.
@@ -64,7 +60,7 @@ pub async fn aggregate(
     page: u32,
     random_delay: bool,
     debug: bool,
-    mut upstream_search_engines: Vec<String>,
+    upstream_search_engines: Vec<EngineHandler>,
     request_timeout: u8,
 ) -> Result<SearchResults, Box<dyn std::error::Error>> {
     let user_agent: String = random_user_agent();
@@ -76,24 +72,22 @@ pub async fn aggregate(
         tokio::time::sleep(Duration::from_secs(delay_secs)).await;
     }
 
+    let mut names: Vec<&str> = vec![];
+
     // create tasks for upstream result fetching
-    let tasks: FutureVec = upstream_search_engines
-        .iter()
-        .map(|engine| match engine.to_lowercase().as_str() {
-            "duckduckgo" => Box::new(duckduckgo::DuckDuckGo) as Box<dyn SearchEngine>,
-            "searx" => Box::new(searx::Searx) as Box<dyn SearchEngine>,
-            &_ => panic!("Config Error: Incorrect config file option provided"),
-        })
-        .map(|search_engine| {
-            let query: String = query.clone();
-            let user_agent: String = user_agent.clone();
-            tokio::spawn(async move {
-                search_engine
-                    .results(query, page, user_agent.clone(), request_timeout)
-                    .await
-            })
-        })
-        .collect();
+    let mut tasks: FutureVec = FutureVec::new();
+
+    for engine_handler in upstream_search_engines {
+        let (name, search_engine) = engine_handler.into_name_engine();
+        names.push(name);
+        let query: String = query.clone();
+        let user_agent: String = user_agent.clone();
+        tasks.push(tokio::spawn(async move {
+            search_engine
+                .results(query, page, user_agent.clone(), request_timeout)
+                .await
+        }));
+    }
 
     // get upstream responses
     let mut responses = Vec::with_capacity(tasks.len());
@@ -105,20 +99,20 @@ pub async fn aggregate(
     }
 
     // aggregate search results, removing duplicates and handling errors the upstream engines returned
-    let mut result_map: HashMap<String, RawSearchResult> = HashMap::new();
+    let mut result_map: HashMap<String, SearchResult> = HashMap::new();
     let mut engine_errors_info: Vec<EngineErrorInfo> = Vec::new();
 
     let mut handle_error = |error: Report<EngineError>, engine_name: String| {
         log::error!("Engine Error: {:?}", error);
         engine_errors_info.push(EngineErrorInfo::new(
             error.downcast_ref::<EngineError>().unwrap(),
-            engine_name,
+            engine_name.to_string(),
         ));
     };
 
     for _ in 0..responses.len() {
         let response = responses.pop().unwrap();
-        let engine_name = upstream_search_engines.pop().unwrap();
+        let engine = names.pop().unwrap().to_string();
 
         if result_map.is_empty() {
             match response {
@@ -126,7 +120,7 @@ pub async fn aggregate(
                     result_map = results.clone();
                 }
                 Err(error) => {
-                    handle_error(error, engine_name.clone());
+                    handle_error(error, engine);
                 }
             }
             continue;
@@ -138,21 +132,18 @@ pub async fn aggregate(
                     result_map
                         .entry(key)
                         .and_modify(|result| {
-                            result.add_engines(engine_name.clone());
+                            result.add_engines(engine.clone());
                         })
-                        .or_insert_with(|| -> RawSearchResult { value });
+                        .or_insert_with(|| -> SearchResult { value });
                 });
             }
             Err(error) => {
-                handle_error(error, engine_name.clone());
+                handle_error(error, engine);
             }
         }
     }
 
-    let mut results = Vec::with_capacity(result_map.len());
-    for (_, result) in result_map {
-        results.push(SearchResult::from_raw(result))
-    }
+    let results = result_map.into_values().collect();
 
     Ok(SearchResults::new(
         results,
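The rewritten loop dedups by URL with the `HashMap` entry API: the first engine to return a given URL inserts the result, and each later engine returning the same URL is appended to that result's `engine` list; the final list is then just `into_values().collect()`. A self-contained sketch of that merge step with a pared-down result type (not the crate's full struct):

```rust
use std::collections::HashMap;

#[derive(Clone, Debug)]
struct SearchResult {
    url: String,
    engine: Vec<String>,
}

fn main() {
    let mut result_map: HashMap<String, SearchResult> = HashMap::new();

    // Two engines returning the same URL, as overlapping upstreams do.
    for engine in ["duckduckgo", "searx"] {
        let value = SearchResult {
            url: "https://example.org".to_string(),
            engine: vec![engine.to_string()],
        };
        result_map
            .entry(value.url.clone())
            // Duplicate URL: credit the extra engine instead of overwriting.
            .and_modify(|result| result.engine.push(engine.to_string()))
            .or_insert(value);
    }

    // As in `aggregate`, the final result list is just the map's values.
    let results: Vec<SearchResult> = result_map.into_values().collect();
    println!("{results:?}"); // one result credited to both engines
}
```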
diff --git a/src/server/routes.rs b/src/server/routes.rs
index c203e20..77210b2 100644
--- a/src/server/routes.rs
+++ b/src/server/routes.rs
@@ -7,6 +7,7 @@ use std::fs::read_to_string;
 use crate::{
     cache::cacher::RedisCache,
     config::parser::Config,
+    engines::engine_models::EngineHandler,
     handler::public_paths::public_path,
     results::{aggregation_models::SearchResults, aggregator::aggregate},
 };
@@ -175,12 +176,19 @@ async fn results(
     {
         Some(cookie_value) => {
             let cookie_value: Cookie = serde_json::from_str(cookie_value.name_value().1)?;
+
+            let engines = cookie_value
+                .engines
+                .iter()
+                .filter_map(|name| EngineHandler::new(name))
+                .collect();
+
             aggregate(
                 query,
                 page,
                 config.aggregator.random_delay,
                 config.debug,
-                cookie_value.engines,
+                engines,
                 config.request_timeout,
             )
             .await?
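One behavioral consequence worth noting: engine names arriving from the user's cookie now go through `EngineHandler::new` as well, so an unrecognized name is silently filtered out rather than passed to `aggregate`. A suggested unit test pinning that down (not part of the patch):

```rust
#[cfg(test)]
mod tests {
    use crate::engines::engine_models::EngineHandler;

    #[test]
    fn unknown_engine_names_are_dropped() {
        // Mirrors the cookie path in `results`: parse failures are skipped.
        let names = ["duckduckgo", "not-a-real-engine", "searx"];
        let engines: Vec<EngineHandler> = names
            .iter()
            .filter_map(|name| EngineHandler::new(name))
            .collect();

        assert_eq!(engines.len(), 2);
    }
}
```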