From 5aca5c0d0d51d4517b00ff6412b4d7bd91420571 Mon Sep 17 00:00:00 2001 From: Milim <81323548+MilimTheTrueOne@users.noreply.github.com> Date: Fri, 18 Aug 2023 10:43:53 +0200 Subject: [PATCH] Improve aggregation Adds the EngineHandler struct Removes vulnerability where an attacker could send requests cookies with fake engine names and crash the server. Merged RawSearchResult and SearchResult, as they were functionally identical. --- src/config/parser.rs | 3 +- src/engines/duckduckgo.rs | 6 +-- src/engines/engine_models.rs | 38 +++++++++++++++++-- src/engines/searx.rs | 6 +-- src/results/aggregation_models.rs | 57 +++------------------------- src/results/aggregator.rs | 63 +++++++++++++------------------ src/server/routes.rs | 10 ++++- 7 files changed, 84 insertions(+), 99 deletions(-) diff --git a/src/config/parser.rs b/src/config/parser.rs index c6ca37a..3b50622 100644 --- a/src/config/parser.rs +++ b/src/config/parser.rs @@ -34,7 +34,7 @@ pub struct Config { pub aggregator: AggregatorConfig, pub logging: bool, pub debug: bool, - pub upstream_search_engines: Vec<String>, + pub upstream_search_engines: Vec<EngineHandler>, pub request_timeout: u8, pub threads: u8, } @@ -107,6 +107,7 @@ impl Config { .get::<_, HashMap<String, bool>>("upstream_search_engines")? 
.into_iter() .filter_map(|(key, value)| value.then_some(key)) + .filter_map(|engine| crate::engines::engine_models::EngineHandler::new(&engine)) .collect(), request_timeout: globals.get::<_, u8>("request_timeout")?, threads, diff --git a/src/engines/duckduckgo.rs b/src/engines/duckduckgo.rs index 8958319..11b7d86 100644 --- a/src/engines/duckduckgo.rs +++ b/src/engines/duckduckgo.rs @@ -7,7 +7,7 @@ use std::collections::HashMap; use reqwest::header::{HeaderMap, CONTENT_TYPE, COOKIE, REFERER, USER_AGENT}; use scraper::{Html, Selector}; -use crate::results::aggregation_models::RawSearchResult; +use crate::results::aggregation_models::SearchResult; use super::engine_models::{EngineError, SearchEngine}; @@ -43,7 +43,7 @@ impl SearchEngine for DuckDuckGo { page: u32, user_agent: String, request_timeout: u8, - ) -> Result<HashMap<String, RawSearchResult>, EngineError> { + ) -> Result<HashMap<String, SearchResult>, EngineError> { // Page number can be missing or empty string and so appropriate handling is required // so that upstream server recieves valid page number. let url: String = match page { @@ -120,7 +120,7 @@ impl SearchEngine for DuckDuckGo { Ok(document .select(&results) .map(|result| { - RawSearchResult::new( + SearchResult::new( result .select(&result_title) .next() diff --git a/src/engines/engine_models.rs b/src/engines/engine_models.rs index c6aa030..d33d13c 100644 --- a/src/engines/engine_models.rs +++ b/src/engines/engine_models.rs @@ -1,7 +1,7 @@ //! This module provides the error enum to handle different errors associated while requesting data from //! the upstream search engines with the search query provided by the user. -use crate::results::aggregation_models::RawSearchResult; +use crate::results::aggregation_models::SearchResult; use error_stack::{IntoReport, Result, ResultExt}; use std::{collections::HashMap, fmt, time::Duration}; @@ -45,7 +45,7 @@ impl error_stack::Context for EngineError {} /// A trait to define common behavior for all search engines. 
#[async_trait::async_trait] -pub trait SearchEngine { +pub trait SearchEngine: Sync + Send { async fn fetch_html_from_upstream( &self, url: String, @@ -73,5 +73,37 @@ pub trait SearchEngine { page: u32, user_agent: String, request_timeout: u8, - ) -> Result<HashMap<String, RawSearchResult>, EngineError>; + ) -> Result<HashMap<String, SearchResult>, EngineError>; +} + +pub struct EngineHandler { + engine: Box<dyn SearchEngine>, + name: &'static str, +} + +impl Clone for EngineHandler { + fn clone(&self) -> Self { + Self::new(self.name).unwrap() + } +} + +impl EngineHandler { + /// Parses an engine name into an engine handler, returns None if the engine is unknown + pub fn new(engine_name: &str) -> Option<Self> { + let engine: (&'static str, Box<dyn SearchEngine>) = + match engine_name.to_lowercase().as_str() { + "duckduckgo" => ("duckduckgo", Box::new(super::duckduckgo::DuckDuckGo)), + "searx" => ("searx", Box::new(super::searx::Searx)), + _ => return None, + }; + + Some(Self { + engine: engine.1, + name: engine.0, + }) + } + + pub fn into_name_engine(self) -> (&'static str, Box<dyn SearchEngine>) { + (self.name, self.engine) + } } diff --git a/src/engines/searx.rs b/src/engines/searx.rs index 1caca17..4ad41f5 100644 --- a/src/engines/searx.rs +++ b/src/engines/searx.rs @@ -6,7 +6,7 @@ use reqwest::header::{HeaderMap, CONTENT_TYPE, COOKIE, REFERER, USER_AGENT}; use scraper::{Html, Selector}; use std::collections::HashMap; -use crate::results::aggregation_models::RawSearchResult; +use crate::results::aggregation_models::SearchResult; use super::engine_models::{EngineError, SearchEngine}; use error_stack::{IntoReport, Report, Result, ResultExt}; @@ -42,7 +42,7 @@ impl SearchEngine for Searx { page: u32, user_agent: String, request_timeout: u8, - ) -> Result<HashMap<String, RawSearchResult>, EngineError> { + ) -> Result<HashMap<String, SearchResult>, EngineError> { // Page number can be missing or empty string and so appropriate handling is required // so that upstream server recieves valid page number. 
let url: String = match page { @@ -111,7 +111,7 @@ impl SearchEngine for Searx { Ok(document .select(&results) .map(|result| { - RawSearchResult::new( + SearchResult::new( result .select(&result_title) .next() diff --git a/src/results/aggregation_models.rs b/src/results/aggregation_models.rs index 2f1b7b4..b4a780c 100644 --- a/src/results/aggregation_models.rs +++ b/src/results/aggregation_models.rs @@ -5,54 +5,6 @@ use serde::{Deserialize, Serialize}; use crate::{config::parser_models::Style, engines::engine_models::EngineError}; -/// A named struct to store, serialize and deserializes the individual search result from all the -/// scraped and aggregated search results from the upstream search engines. -/// -/// # Fields -/// -/// * `title` - The title of the search result. -/// * `url` - The url to be displayed below the search result title in html. -/// * `description` - The description of the search result. -/// * `engine` - The names of the upstream engines from which this results were provided. -#[derive(Serialize, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct SearchResult { -    pub title: String, -    pub url: String, -    pub description: String, -    pub engine: Vec<String>, -} - -impl SearchResult { -    /// Constructs a new `SearchResult` with the given arguments needed for the struct. -    /// -    /// # Arguments -    /// -    /// * `title` - The title of the search result. -    /// * `visiting_url` - The url which is accessed when clicked on it -    /// (href url in html in simple words). -    /// * `url` - The url to be displayed below the search result title in html. -    /// * `description` - The description of the search result. -    /// * `engine` - The names of the upstream engines from which this results were provided. 
-    pub fn new(title: String, url: String, description: String, engine: Vec<String>) -> Self { -        SearchResult { -            title, -            url, -            description, -            engine, -        } -    } - -    pub fn from_raw(raw: RawSearchResult) -> Self { -        SearchResult { -            title: raw.title, -            url: raw.url, -            description: raw.description, -            engine: raw.engine, -        } -    } -} - /// A named struct to store the raw scraped search results scraped search results from the /// upstream search engines before aggregating it.It derives the Clone trait which is needed /// to write idiomatic rust using `Iterators`. @@ -64,15 +16,16 @@ impl SearchResult { /// /// # Fields /// /// * `title` - The title of the search result. /// * `url` - The url which is accessed when clicked on it /// (href url in html in simple words). /// * `description` - The description of the search result. /// * `engine` - The names of the upstream engines from which this results were provided. -#[derive(Clone)] -pub struct RawSearchResult { +#[derive(Clone, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct SearchResult { pub title: String, pub url: String, pub description: String, pub engine: Vec<String>, } -impl RawSearchResult { +impl SearchResult { /// Constructs a new `RawSearchResult` with the given arguments needed for the struct. /// /// # Arguments /// /// * `title` - The title of the search result. /// * `visiting_url` - The url which is accessed when clicked on it /// (href url in html in simple words). /// * `description` - The description of the search result. /// * `engine` - The names of the upstream engines from which this results were provided. @@ -83,7 +36,7 @@ impl SearchResult { 
pub fn new(title: String, url: String, description: String, engine: Vec<String>) -> Self { -        RawSearchResult { +        SearchResult { title, url, description, diff --git a/src/results/aggregator.rs b/src/results/aggregator.rs index 4d0c708..16586c0 100644 --- a/src/results/aggregator.rs +++ b/src/results/aggregator.rs @@ -8,18 +8,14 @@ use rand::Rng; use tokio::task::JoinHandle; use super::{ -    aggregation_models::{EngineErrorInfo, RawSearchResult, SearchResult, SearchResults}, +    aggregation_models::{EngineErrorInfo, SearchResult, SearchResults}, user_agent::random_user_agent, }; -use crate::engines::{ -    duckduckgo, -    engine_models::{EngineError, SearchEngine}, -    searx, -}; +use crate::engines::engine_models::{EngineError, EngineHandler}; /// Aliases for long type annotations -type FutureVec = Vec<JoinHandle<Result<HashMap<String, RawSearchResult>, Report<EngineError>>>>; +type FutureVec = Vec<JoinHandle<Result<HashMap<String, SearchResult>, Report<EngineError>>>>; /// The function aggregates the scraped results from the user-selected upstream search engines. /// These engines can be chosen either from the user interface (UI) or from the configuration file. 
@@ -64,7 +60,7 @@ pub async fn aggregate( page: u32, random_delay: bool, debug: bool, -    mut upstream_search_engines: Vec<String>, +    upstream_search_engines: Vec<EngineHandler>, request_timeout: u8, ) -> Result<SearchResults, Box<dyn std::error::Error>> { let user_agent: String = random_user_agent(); @@ -76,24 +72,22 @@ pub async fn aggregate( tokio::time::sleep(Duration::from_secs(delay_secs)).await; } +    let mut names: Vec<&str> = vec![]; + // create tasks for upstream result fetching -    let tasks: FutureVec = upstream_search_engines -        .iter() -        .map(|engine| match engine.to_lowercase().as_str() { -            "duckduckgo" => Box::new(duckduckgo::DuckDuckGo) as Box<dyn SearchEngine>, -            "searx" => Box::new(searx::Searx) as Box<dyn SearchEngine>, -            &_ => panic!("Config Error: Incorrect config file option provided"), -        }) -        .map(|search_engine| { -            let query: String = query.clone(); -            let user_agent: String = user_agent.clone(); -            tokio::spawn(async move { -                search_engine -                    .results(query, page, user_agent.clone(), request_timeout) -                    .await -            }) -        }) -        .collect(); +    let mut tasks: FutureVec = FutureVec::new(); + +    for engine_handler in upstream_search_engines { +        let (name, search_engine) = engine_handler.into_name_engine(); +        names.push(name); +        let query: String = query.clone(); +        let user_agent: String = user_agent.clone(); +        tasks.push(tokio::spawn(async move { +            search_engine +                .results(query, page, user_agent.clone(), request_timeout) +                .await +        })); +    } // get upstream responses let mut responses = Vec::with_capacity(tasks.len()); @@ -105,20 +99,20 @@ pub async fn aggregate( } // aggregate search results, removing duplicates and handling errors the upstream engines returned -    let mut result_map: HashMap<String, RawSearchResult> = HashMap::new(); +    let mut result_map: HashMap<String, SearchResult> = HashMap::new(); let mut engine_errors_info: Vec<EngineErrorInfo> = Vec::new(); let mut handle_error = |error: Report<EngineError>, engine_name: String| { log::error!("Engine Error: {:?}", error); engine_errors_info.push(EngineErrorInfo::new( error.downcast_ref::<EngineError>().unwrap(), -            engine_name, +            engine_name.to_string(), )); }; for _ in 0..responses.len() { 
let response = responses.pop().unwrap(); - let engine_name = upstream_search_engines.pop().unwrap(); + let engine = names.pop().unwrap().to_string(); if result_map.is_empty() { match response { @@ -126,7 +120,7 @@ pub async fn aggregate( result_map = results.clone(); } Err(error) => { - handle_error(error, engine_name.clone()); + handle_error(error, engine); } } continue; @@ -138,21 +132,18 @@ pub async fn aggregate( result_map .entry(key) .and_modify(|result| { - result.add_engines(engine_name.clone()); + result.add_engines(engine.clone()); }) - .or_insert_with(|| -> RawSearchResult { value }); + .or_insert_with(|| -> SearchResult { value }); }); } Err(error) => { - handle_error(error, engine_name.clone()); + handle_error(error, engine); } } } - let mut results = Vec::with_capacity(result_map.len()); - for (_, result) in result_map { - results.push(SearchResult::from_raw(result)) - } + let results = result_map.into_values().collect(); Ok(SearchResults::new( results, diff --git a/src/server/routes.rs b/src/server/routes.rs index c203e20..77210b2 100644 --- a/src/server/routes.rs +++ b/src/server/routes.rs @@ -7,6 +7,7 @@ use std::fs::read_to_string; use crate::{ cache::cacher::RedisCache, config::parser::Config, + engines::engine_models::EngineHandler, handler::public_paths::public_path, results::{aggregation_models::SearchResults, aggregator::aggregate}, }; @@ -175,12 +176,19 @@ async fn results( { Some(cookie_value) => { let cookie_value: Cookie = serde_json::from_str(cookie_value.name_value().1)?; + + let engines = cookie_value + .engines + .iter() + .filter_map(|name| EngineHandler::new(name)) + .collect(); + aggregate( query, page, config.aggregator.random_delay, config.debug, - cookie_value.engines, + engines, config.request_timeout, ) .await?