Improve aggregation

Adds the EngineHandler struct Removes vulnerability where an attacker could send requests cookies with fake engine names and crash the server. Merged RawSearchResult and SearchResult, as they were functionally identical.
2024-12-22 20:38:22 -05:00 · 2023-08-18 10:43:53 +02:00 · 2023-08-18 10:43:53 +02:00 · 5aca5c0d0d
commit 5aca5c0d0d
parent 15dfda6ea9
7 changed files with 84 additions and 99 deletions
--- a/src/config/parser.rs
+++ b/src/config/parser.rs
@ -34,7 +34,7 @@ pub struct Config {
    pub aggregator: AggregatorConfig,
    pub logging: bool,
    pub debug: bool,
-    pub upstream_search_engines: Vec<String>,
+    pub upstream_search_engines: Vec<crate::engines::engine_models::EngineHandler>,
    pub request_timeout: u8,
    pub threads: u8,
 }
@ -107,6 +107,7 @@ impl Config {
                    .get::<_, HashMap<String, bool>>("upstream_search_engines")?
                    .into_iter()
                    .filter_map(|(key, value)| value.then_some(key))
+                    .filter_map(|engine| crate::engines::engine_models::EngineHandler::new(&engine))
                    .collect(),
                request_timeout: globals.get::<_, u8>("request_timeout")?,
                threads,
--- a/src/engines/duckduckgo.rs
+++ b/src/engines/duckduckgo.rs
@ -7,7 +7,7 @@ use std::collections::HashMap;
 use reqwest::header::{HeaderMap, CONTENT_TYPE, COOKIE, REFERER, USER_AGENT};
 use scraper::{Html, Selector};

-use crate::results::aggregation_models::RawSearchResult;
+use crate::results::aggregation_models::SearchResult;

 use super::engine_models::{EngineError, SearchEngine};

@ -43,7 +43,7 @@ impl SearchEngine for DuckDuckGo {
        page: u32,
        user_agent: String,
        request_timeout: u8,
-    ) -> Result<HashMap<String, RawSearchResult>, EngineError> {
+    ) -> Result<HashMap<String, SearchResult>, EngineError> {
        // Page number can be missing or empty string and so appropriate handling is required
        // so that upstream server recieves valid page number.
        let url: String = match page {
@ -120,7 +120,7 @@ impl SearchEngine for DuckDuckGo {
        Ok(document
            .select(&results)
            .map(|result| {
-                RawSearchResult::new(
+                SearchResult::new(
                    result
                        .select(&result_title)
                        .next()
--- a/src/engines/engine_models.rs
+++ b/src/engines/engine_models.rs
@ -1,7 +1,7 @@
 //! This module provides the error enum to handle different errors associated while requesting data from
 //! the upstream search engines with the search query provided by the user.

-use crate::results::aggregation_models::RawSearchResult;
+use crate::results::aggregation_models::SearchResult;
 use error_stack::{IntoReport, Result, ResultExt};
 use std::{collections::HashMap, fmt, time::Duration};

@ -45,7 +45,7 @@ impl error_stack::Context for EngineError {}

 /// A trait to define common behavior for all search engines.
 #[async_trait::async_trait]
-pub trait SearchEngine {
+pub trait SearchEngine: Sync + Send {
    async fn fetch_html_from_upstream(
        &self,
        url: String,
@ -73,5 +73,37 @@ pub trait SearchEngine {
        page: u32,
        user_agent: String,
        request_timeout: u8,
-    ) -> Result<HashMap<String, RawSearchResult>, EngineError>;
+    ) -> Result<HashMap<String, SearchResult>, EngineError>;
+}
+
+pub struct EngineHandler {
+    engine: Box<dyn SearchEngine>,
+    name: &'static str,
+}
+
+impl Clone for EngineHandler {
+    fn clone(&self) -> Self {
+        Self::new(self.name).unwrap()
+    }
+}
+
+impl EngineHandler {
+    /// parses an engine name into an engine handler, returns none if the engine is unknown
+    pub fn new(engine_name: &str) -> Option<Self> {
+        let engine: (&'static str, Box<dyn SearchEngine>) =
+            match engine_name.to_lowercase().as_str() {
+                "duckduckgo" => ("duckduckgo", Box::new(super::duckduckgo::DuckDuckGo)),
+                "searx" => ("searx", Box::new(super::searx::Searx)),
+                _ => return None,
+            };
+
+        Some(Self {
+            engine: engine.1,
+            name: engine.0,
+        })
+    }
+
+    pub fn into_name_engine(self) -> (&'static str, Box<dyn SearchEngine>) {
+        (self.name, self.engine)
+    }
 }
--- a/src/engines/searx.rs
+++ b/src/engines/searx.rs
@ -6,7 +6,7 @@ use reqwest::header::{HeaderMap, CONTENT_TYPE, COOKIE, REFERER, USER_AGENT};
 use scraper::{Html, Selector};
 use std::collections::HashMap;

-use crate::results::aggregation_models::RawSearchResult;
+use crate::results::aggregation_models::SearchResult;

 use super::engine_models::{EngineError, SearchEngine};
 use error_stack::{IntoReport, Report, Result, ResultExt};
@ -42,7 +42,7 @@ impl SearchEngine for Searx {
        page: u32,
        user_agent: String,
        request_timeout: u8,
-    ) -> Result<HashMap<String, RawSearchResult>, EngineError> {
+    ) -> Result<HashMap<String, SearchResult>, EngineError> {
        // Page number can be missing or empty string and so appropriate handling is required
        // so that upstream server recieves valid page number.
        let url: String = match page {
@ -111,7 +111,7 @@ impl SearchEngine for Searx {
        Ok(document
            .select(&results)
            .map(|result| {
-                RawSearchResult::new(
+                SearchResult::new(
                    result
                        .select(&result_title)
                        .next()
--- a/src/results/aggregation_models.rs
+++ b/src/results/aggregation_models.rs
@ -5,54 +5,6 @@ use serde::{Deserialize, Serialize};

 use crate::{config::parser_models::Style, engines::engine_models::EngineError};

-/// A named struct to store, serialize and deserializes the individual search result from all the
-/// scraped and aggregated search results from the upstream search engines.
-///
-/// # Fields
-///
-/// * `title` - The title of the search result.
-/// * `url` - The url to be displayed below the search result title in html.
-/// * `description` - The description of the search result.
-/// * `engine` - The names of the upstream engines from which this results were provided.
-#[derive(Serialize, Deserialize)]
-#[serde(rename_all = "camelCase")]
-pub struct SearchResult {
-    pub title: String,
-    pub url: String,
-    pub description: String,
-    pub engine: Vec<String>,
-}
-
-impl SearchResult {
-    /// Constructs a new `SearchResult` with the given arguments needed for the struct.
-    ///
-    /// # Arguments
-    ///
-    /// * `title` - The title of the search result.
-    /// * `visiting_url` - The url which is accessed when clicked on it
-    /// (href url in html in simple words).
-    /// * `url` - The url to be displayed below the search result title in html.
-    /// * `description` - The description of the search result.
-    /// * `engine` - The names of the upstream engines from which this results were provided.
-    pub fn new(title: String, url: String, description: String, engine: Vec<String>) -> Self {
-        SearchResult {
-            title,
-            url,
-            description,
-            engine,
-        }
-    }
-
-    pub fn from_raw(raw: RawSearchResult) -> Self {
-        SearchResult {
-            title: raw.title,
-            url: raw.url,
-            description: raw.description,
-            engine: raw.engine,
-        }
-    }
-}
-
 /// A named struct to store the raw scraped search results scraped search results from the
 /// upstream search engines before aggregating it.It derives the Clone trait which is needed
 /// to write idiomatic rust using `Iterators`.
@ -64,15 +16,16 @@ impl SearchResult {
 /// (href url in html in simple words).
 /// * `description` - The description of the search result.
 /// * `engine` - The names of the upstream engines from which this results were provided.
-#[derive(Clone)]
-pub struct RawSearchResult {
+#[derive(Clone, Serialize, Deserialize)]
+#[serde(rename_all = "camelCase")]
+pub struct SearchResult {
    pub title: String,
    pub url: String,
    pub description: String,
    pub engine: Vec<String>,
 }

-impl RawSearchResult {
+impl SearchResult {
    /// Constructs a new `RawSearchResult` with the given arguments needed for the struct.
    ///
    /// # Arguments
@ -83,7 +36,7 @@ impl RawSearchResult {
    /// * `description` - The description of the search result.
    /// * `engine` - The names of the upstream engines from which this results were provided.
    pub fn new(title: String, url: String, description: String, engine: Vec<String>) -> Self {
-        RawSearchResult {
+        SearchResult {
            title,
            url,
            description,
--- a/src/results/aggregator.rs
+++ b/src/results/aggregator.rs
@ -8,18 +8,14 @@ use rand::Rng;
 use tokio::task::JoinHandle;

 use super::{
-    aggregation_models::{EngineErrorInfo, RawSearchResult, SearchResult, SearchResults},
+    aggregation_models::{EngineErrorInfo, SearchResult, SearchResults},
    user_agent::random_user_agent,
 };

-use crate::engines::{
-    duckduckgo,
-    engine_models::{EngineError, SearchEngine},
-    searx,
-};
+use crate::engines::engine_models::{EngineError, EngineHandler};

 /// Aliases for long type annotations
-type FutureVec = Vec<JoinHandle<Result<HashMap<String, RawSearchResult>, Report<EngineError>>>>;
+type FutureVec = Vec<JoinHandle<Result<HashMap<String, SearchResult>, Report<EngineError>>>>;

 /// The function aggregates the scraped results from the user-selected upstream search engines.
 /// These engines can be chosen either from the user interface (UI) or from the configuration file.
@ -64,7 +60,7 @@ pub async fn aggregate(
    page: u32,
    random_delay: bool,
    debug: bool,
-    mut upstream_search_engines: Vec<String>,
+    upstream_search_engines: Vec<EngineHandler>,
    request_timeout: u8,
 ) -> Result<SearchResults, Box<dyn std::error::Error>> {
    let user_agent: String = random_user_agent();
@ -76,24 +72,22 @@ pub async fn aggregate(
        tokio::time::sleep(Duration::from_secs(delay_secs)).await;
    }

+    let mut names: Vec<&str> = vec![];
+
    // create tasks for upstream result fetching
-    let tasks: FutureVec = upstream_search_engines
-        .iter()
-        .map(|engine| match engine.to_lowercase().as_str() {
-            "duckduckgo" => Box::new(duckduckgo::DuckDuckGo) as Box<dyn SearchEngine + Send + Sync>,
-            "searx" => Box::new(searx::Searx) as Box<dyn SearchEngine + Send + Sync>,
-            &_ => panic!("Config Error: Incorrect config file option provided"),
-        })
-        .map(|search_engine| {
-            let query: String = query.clone();
-            let user_agent: String = user_agent.clone();
-            tokio::spawn(async move {
-                search_engine
-                    .results(query, page, user_agent.clone(), request_timeout)
-                    .await
-            })
-        })
-        .collect();
+    let mut tasks: FutureVec = FutureVec::new();
+
+    for engine_handler in upstream_search_engines {
+        let (name, search_engine) = engine_handler.into_name_engine();
+        names.push(name);
+        let query: String = query.clone();
+        let user_agent: String = user_agent.clone();
+        tasks.push(tokio::spawn(async move {
+            search_engine
+                .results(query, page, user_agent.clone(), request_timeout)
+                .await
+        }));
+    }

    // get upstream responses
    let mut responses = Vec::with_capacity(tasks.len());
@ -105,20 +99,20 @@ pub async fn aggregate(
    }

    // aggregate search results, removing duplicates and handling errors the upstream engines returned
-    let mut result_map: HashMap<String, RawSearchResult> = HashMap::new();
+    let mut result_map: HashMap<String, SearchResult> = HashMap::new();
    let mut engine_errors_info: Vec<EngineErrorInfo> = Vec::new();

    let mut handle_error = |error: Report<EngineError>, engine_name: String| {
        log::error!("Engine Error: {:?}", error);
        engine_errors_info.push(EngineErrorInfo::new(
            error.downcast_ref::<EngineError>().unwrap(),
-            engine_name,
+            engine_name.to_string(),
        ));
    };

    for _ in 0..responses.len() {
        let response = responses.pop().unwrap();
-        let engine_name = upstream_search_engines.pop().unwrap();
+        let engine = names.pop().unwrap().to_string();

        if result_map.is_empty() {
            match response {
@ -126,7 +120,7 @@ pub async fn aggregate(
                    result_map = results.clone();
                }
                Err(error) => {
-                    handle_error(error, engine_name.clone());
+                    handle_error(error, engine);
                }
            }
            continue;
@ -138,21 +132,18 @@ pub async fn aggregate(
                    result_map
                        .entry(key)
                        .and_modify(|result| {
-                            result.add_engines(engine_name.clone());
+                            result.add_engines(engine.clone());
                        })
-                        .or_insert_with(|| -> RawSearchResult { value });
+                        .or_insert_with(|| -> SearchResult { value });
                });
            }
            Err(error) => {
-                handle_error(error, engine_name.clone());
+                handle_error(error, engine);
            }
        }
    }

-    let mut results = Vec::with_capacity(result_map.len());
-    for (_, result) in result_map {
-        results.push(SearchResult::from_raw(result))
-    }
+    let results = result_map.into_values().collect();

    Ok(SearchResults::new(
        results,
--- a/src/server/routes.rs
+++ b/src/server/routes.rs
@ -7,6 +7,7 @@ use std::fs::read_to_string;
 use crate::{
    cache::cacher::RedisCache,
    config::parser::Config,
+    engines::engine_models::EngineHandler,
    handler::public_paths::public_path,
    results::{aggregation_models::SearchResults, aggregator::aggregate},
 };
@ -175,12 +176,19 @@ async fn results(
            {
                Some(cookie_value) => {
                    let cookie_value: Cookie = serde_json::from_str(cookie_value.name_value().1)?;
+
+                    let engines = cookie_value
+                        .engines
+                        .iter()
+                        .filter_map(|name| EngineHandler::new(name))
+                        .collect();
+
                    aggregate(
                        query,
                        page,
                        config.aggregator.random_delay,
                        config.debug,
-                        cookie_value.engines,
+                        engines,
                        config.request_timeout,
                    )
                    .await?