websurfx/src/search_results_handler/aggregator.rs

//! This module provides the functionality to scrape and gather all the results from the
//! upstream search engines and then remove duplicate results.

use std::{collections::HashMap, time::Duration};

use error_stack::Report;
use rand::Rng;
use tokio::task::JoinHandle;

use super::{
    aggregation_models::{RawSearchResult, SearchResult, SearchResults},
    user_agent::random_user_agent,
};

use crate::engines::{
    duckduckgo,
    engine_models::{EngineError, SearchEngine},
    searx,
};

/// A function that aggregates all the scraped results from the above upstream engines and
/// then removes duplicate results and if two results are found to be from two or more engines
/// then puts their names together to show the results are fetched from these upstream engines
/// and then removes all data from the HashMap and puts into a struct of all results aggregated
/// into a vector and also adds the query used into the struct this is neccessory because
/// otherwise the search bar in search remains empty if searched from the query url
///
/// # Example:
///
/// If you search from the url like `https://127.0.0.1/search?q=huston` then the search bar should
/// contain the word huston and not remain empty.
///
/// # Arguments
///
/// * `query` - Accepts a string to query with the above upstream search engines.
/// * `page` - Accepts an u32 page number.
/// * `random_delay` - Accepts a boolean value to add a random delay before making the request.
///
/// # Error
///
/// Returns an error a reqwest and scraping selector errors if any error occurs in the results
/// function in either `searx` or `duckduckgo` or both otherwise returns a `SearchResults struct`
/// containing appropriate values.
pub async fn aggregate(
    query: String,
    page: u32,
    random_delay: bool,
    debug: bool,
    upstream_search_engines: Vec<String>,
) -> Result<SearchResults, Box<dyn std::error::Error>> {
    let user_agent: String = random_user_agent();
    let mut result_map: HashMap<String, RawSearchResult> = HashMap::new();

    // Add a random delay before making the request.
    if random_delay || !debug {
        let mut rng = rand::thread_rng();
        let delay_secs = rng.gen_range(1..10);
        std::thread::sleep(Duration::from_secs(delay_secs));
    }

    // fetch results from upstream search engines simultaneously/concurrently.
    let search_engines: Vec<Box<dyn SearchEngine>> = upstream_search_engines
        .iter()
        .map(|engine| match engine.to_lowercase().as_str() {
            "duckduckgo" => Box::new(duckduckgo::DuckDuckGo) as Box<dyn SearchEngine>,
            "searx " => Box::new(searx::Searx) as Box<dyn SearchEngine>,
        })
        .collect();

    let tasks: Vec<JoinHandle<Result<HashMap<String, RawSearchResult>, Report<EngineError>>>> =
        search_engines
            .iter()
            .map(|search_engine| {
                tokio::spawn(search_engine.results(query.clone(), page, user_agent.clone()))
            })
            .collect();

    let mut outputs = Vec::with_capacity(search_engines.len());

    for task in tasks {
        outputs.push(task.await.ok())
    }

    let mut initial: bool = true;
    let mut counter: usize = 0;
    outputs.iter().for_each(|results| {
        if initial {
            match results {
                Some(result) => {
                    let new_result = result.clone();
                    result_map.extend(new_result.as_ref().unwrap().clone());
                    counter += 1;
                    initial = false
                }
                None => {
                    if debug {
                        log::error!(
                            "Error fetching results from {}",
                            upstream_search_engines[counter]
                        );
                    };
                    counter += 1
                }
            }
        } else {
            match results {
                Some(result) => {
                    let new_result = result.clone();
                    new_result
                        .as_ref()
                        .unwrap()
                        .clone()
                        .into_iter()
                        .for_each(|(key, value)| {
                            result_map
                                .entry(key)
                                .and_modify(|result| {
                                    result.add_engines(value.clone().engine());
                                })
                                .or_insert_with(|| -> RawSearchResult {
                                    RawSearchResult::new(
                                        value.title.clone(),
                                        value.visiting_url.clone(),
                                        value.description.clone(),
                                        value.engine.clone(),
                                    )
                                });
                        });
                    counter += 1
                }
                None => {
                    if debug {
                        log::error!(
                            "Error fetching results from {}",
                            upstream_search_engines[counter]
                        );
                    };
                    counter += 1
                }
            }
        }
    });

    Ok(SearchResults::new(
        result_map
            .into_iter()
            .map(|(key, value)| {
                SearchResult::new(
                    value.title,
                    value.visiting_url,
                    key,
                    value.description,
                    value.engine,
                )
            })
            .collect(),
        query.to_string(),
    ))
}
Improving source code documentation. 2023-04-27 10:53:28 -04:00			`//! This module provides the functionality to scrape and gathers all the results from the upstream`
			`//! search engines and then removes duplicate results.`

optimise code for large scale server use and closes #7 2023-05-07 14:18:19 -04:00			`use std::{collections::HashMap, time::Duration};`

✨ feat: implement async multithreading and engine selection code 2023-07-11 12:44:38 -04:00			`use error_stack::Report;`
optimise code for large scale server use and closes #7 2023-05-07 14:18:19 -04:00			`use rand::Rng;`
✨ feat: implement async multithreading and engine selection code 2023-07-11 12:44:38 -04:00			`use tokio::task::JoinHandle;`
initial commit 2023-04-22 07:35:07 -04:00
Refactoring code and separating code into files for better maintainability 2023-04-25 09:30:04 -04:00			`use super::{`
			`aggregation_models::{RawSearchResult, SearchResult, SearchResults},`
			`user_agent::random_user_agent,`
			`};`
initial commit 2023-04-22 07:35:07 -04:00
✨ feat: implement async multithreading and engine selection code 2023-07-11 12:44:38 -04:00			`use crate::engines::{`
			`duckduckgo,`
			`engine_models::{EngineError, SearchEngine},`
			`searx,`
			`};`
initial commit 2023-04-22 07:35:07 -04:00
Improving source code documentation. 2023-04-27 10:53:28 -04:00			`/// A function that aggregates all the scraped results from the above upstream engines and`
			`/// then removes duplicate results and if two results are found to be from two or more engines`
			`/// then puts their names together to show the results are fetched from these upstream engines`
			`/// and then removes all data from the HashMap and puts into a struct of all results aggregated`
optimise code for large scale server use and closes #7 2023-05-07 14:18:19 -04:00			`/// into a vector and also adds the query used into the struct this is neccessory because`
Improving source code documentation. 2023-04-27 10:53:28 -04:00			`/// otherwise the search bar in search remains empty if searched from the query url`
			`///`
			`/// # Example:`
			`///`
			/// If you search from the url like `https://127.0.0.1/search?q=huston` then the search bar should
			`/// contain the word huston and not remain empty.`
optimise code for large scale server use and closes #7 2023-05-07 14:18:19 -04:00			`///`
Improving source code documentation. 2023-04-27 10:53:28 -04:00			`/// # Arguments`
			`///`
			/// * `query` - Accepts a string to query with the above upstream search engines.
add code to evade ip blocking, improve pagination code and fix documentation 2023-05-02 04:58:21 -04:00			/// * `page` - Accepts an u32 page number.
supports the option to add a random delay 2023-05-21 21:13:06 -04:00			/// * `random_delay` - Accepts a boolean value to add a random delay before making the request.
Improving source code documentation. 2023-04-27 10:53:28 -04:00			`///`
			`/// # Error`
			`///`
optimise code for large scale server use and closes #7 2023-05-07 14:18:19 -04:00			`/// Returns an error a reqwest and scraping selector errors if any error occurs in the results`
Improving source code documentation. 2023-04-27 10:53:28 -04:00			/// function in either `searx` or `duckduckgo` or both otherwise returns a `SearchResults struct`
			`/// containing appropriate values.`
initial commit 2023-04-22 07:35:07 -04:00			`pub async fn aggregate(`
✨ feat: implement async multithreading and engine selection code 2023-07-11 12:44:38 -04:00			`query: String,`
add code to evade ip blocking, improve pagination code and fix documentation 2023-05-02 04:58:21 -04:00			`page: u32,`
supports the option to add a random delay 2023-05-21 21:13:06 -04:00			`random_delay: bool,`
feat: remove random delays when debug is set to true 2023-05-29 14:28:09 -04:00			`debug: bool,`
✨ feat: implement async multithreading and engine selection code 2023-07-11 12:44:38 -04:00			`upstream_search_engines: Vec<String>,`
initial commit 2023-04-22 07:35:07 -04:00			`) -> Result<SearchResults, Box<dyn std::error::Error>> {`
Refactoring code and separating code into files for better maintainability 2023-04-25 09:30:04 -04:00			`let user_agent: String = random_user_agent();`
initial commit 2023-04-22 07:35:07 -04:00			`let mut result_map: HashMap<String, RawSearchResult> = HashMap::new();`

optimise code for large scale server use and closes #7 2023-05-07 14:18:19 -04:00			`// Add a random delay before making the request.`
feat: remove random delays when debug is set to true 2023-05-29 14:28:09 -04:00			`if random_delay \|\| !debug {`
supports the option to add a random delay 2023-05-21 21:13:06 -04:00			`let mut rng = rand::thread_rng();`
			`let delay_secs = rng.gen_range(1..10);`
			`std::thread::sleep(Duration::from_secs(delay_secs));`
			`}`
optimise code for large scale server use and closes #7 2023-05-07 14:18:19 -04:00
			`// fetch results from upstream search engines simultaneously/concurrently.`
✨ feat: implement async multithreading and engine selection code 2023-07-11 12:44:38 -04:00			`let search_engines: Vec<Box<dyn SearchEngine>> = upstream_search_engines`
			`.iter()`
			`.map(\|engine\| match engine.to_lowercase().as_str() {`
			`"duckduckgo" => Box::new(duckduckgo::DuckDuckGo) as Box<dyn SearchEngine>,`
			`"searx " => Box::new(searx::Searx) as Box<dyn SearchEngine>,`
			`})`
			`.collect();`
optimise code for large scale server use and closes #7 2023-05-07 14:18:19 -04:00
✨ feat: implement async multithreading and engine selection code 2023-07-11 12:44:38 -04:00			`let tasks: Vec<JoinHandle<Result<HashMap<String, RawSearchResult>, Report<EngineError>>>> =`
			`search_engines`
			`.iter()`
			`.map(\|search_engine\| {`
			`tokio::spawn(search_engine.results(query.clone(), page, user_agent.clone()))`
			`})`
			`.collect();`
log the error from engines 2023-06-14 18:27:45 -04:00
✨ feat: implement async multithreading and engine selection code 2023-07-11 12:44:38 -04:00			`let mut outputs = Vec::with_capacity(search_engines.len());`
initial commit 2023-04-22 07:35:07 -04:00
✨ feat: implement async multithreading and engine selection code 2023-07-11 12:44:38 -04:00			`for task in tasks {`
			`outputs.push(task.await.ok())`
			`}`
initial commit 2023-04-22 07:35:07 -04:00
✨ feat: implement async multithreading and engine selection code 2023-07-11 12:44:38 -04:00			`let mut initial: bool = true;`
			`let mut counter: usize = 0;`
			`outputs.iter().for_each(\|results\| {`
			`if initial {`
			`match results {`
			`Some(result) => {`
			`let new_result = result.clone();`
			`result_map.extend(new_result.as_ref().unwrap().clone());`
			`counter += 1;`
			`initial = false`
			`}`
			`None => {`
			`if debug {`
			`log::error!(`
			`"Error fetching results from {}",`
			`upstream_search_engines[counter]`
			`);`
			`};`
			`counter += 1`
			`}`
			`}`
			`} else {`
			`match results {`
			`Some(result) => {`
			`let new_result = result.clone();`
			`new_result`
			`.as_ref()`
			`.unwrap()`
			`.clone()`
			`.into_iter()`
			`.for_each(\|(key, value)\| {`
			`result_map`
			`.entry(key)`
			`.and_modify(\|result\| {`
			`result.add_engines(value.clone().engine());`
			`})`
			`.or_insert_with(\|\| -> RawSearchResult {`
			`RawSearchResult::new(`
			`value.title.clone(),`
			`value.visiting_url.clone(),`
			`value.description.clone(),`
			`value.engine.clone(),`
			`)`
			`});`
			`});`
			`counter += 1`
			`}`
			`None => {`
			`if debug {`
			`log::error!(`
			`"Error fetching results from {}",`
			`upstream_search_engines[counter]`
			`);`
			`};`
			`counter += 1`
			`}`
			`}`
			`}`
Refactoring code and separating code into files for better maintainability 2023-04-25 09:30:04 -04:00			`});`
initial commit 2023-04-22 07:35:07 -04:00
Refactoring code and separating code into files for better maintainability 2023-04-25 09:30:04 -04:00			`Ok(SearchResults::new(`
			`result_map`
			`.into_iter()`
			`.map(\|(key, value)\| {`
			`SearchResult::new(`
			`value.title,`
			`value.visiting_url,`
			`key,`
			`value.description,`
			`value.engine,`
			`)`
			`})`
			`.collect(),`
			`query.to_string(),`
			`))`
initial commit 2023-04-22 07:35:07 -04:00			`}`