0
0
mirror of https://github.com/neon-mmd/websurfx.git synced 2024-11-22 14:08:23 -05:00

Improve aggregation

Adds the EngineHandler struct
Removes a vulnerability where an attacker could send request cookies with fake engine names and crash the server.
Merged RawSearchResult and SearchResult, as they were functionally identical.
This commit is contained in:
Milim 2023-08-18 10:43:53 +02:00
parent 15dfda6ea9
commit 5aca5c0d0d
7 changed files with 84 additions and 99 deletions

View File

@ -34,7 +34,7 @@ pub struct Config {
pub aggregator: AggregatorConfig, pub aggregator: AggregatorConfig,
pub logging: bool, pub logging: bool,
pub debug: bool, pub debug: bool,
pub upstream_search_engines: Vec<String>, pub upstream_search_engines: Vec<crate::engines::engine_models::EngineHandler>,
pub request_timeout: u8, pub request_timeout: u8,
pub threads: u8, pub threads: u8,
} }
@ -107,6 +107,7 @@ impl Config {
.get::<_, HashMap<String, bool>>("upstream_search_engines")? .get::<_, HashMap<String, bool>>("upstream_search_engines")?
.into_iter() .into_iter()
.filter_map(|(key, value)| value.then_some(key)) .filter_map(|(key, value)| value.then_some(key))
.filter_map(|engine| crate::engines::engine_models::EngineHandler::new(&engine))
.collect(), .collect(),
request_timeout: globals.get::<_, u8>("request_timeout")?, request_timeout: globals.get::<_, u8>("request_timeout")?,
threads, threads,

View File

@ -7,7 +7,7 @@ use std::collections::HashMap;
use reqwest::header::{HeaderMap, CONTENT_TYPE, COOKIE, REFERER, USER_AGENT}; use reqwest::header::{HeaderMap, CONTENT_TYPE, COOKIE, REFERER, USER_AGENT};
use scraper::{Html, Selector}; use scraper::{Html, Selector};
use crate::results::aggregation_models::RawSearchResult; use crate::results::aggregation_models::SearchResult;
use super::engine_models::{EngineError, SearchEngine}; use super::engine_models::{EngineError, SearchEngine};
@ -43,7 +43,7 @@ impl SearchEngine for DuckDuckGo {
page: u32, page: u32,
user_agent: String, user_agent: String,
request_timeout: u8, request_timeout: u8,
) -> Result<HashMap<String, RawSearchResult>, EngineError> { ) -> Result<HashMap<String, SearchResult>, EngineError> {
// Page number can be missing or empty string and so appropriate handling is required // Page number can be missing or empty string and so appropriate handling is required
// so that upstream server receives valid page number. // so that upstream server receives valid page number.
let url: String = match page { let url: String = match page {
@ -120,7 +120,7 @@ impl SearchEngine for DuckDuckGo {
Ok(document Ok(document
.select(&results) .select(&results)
.map(|result| { .map(|result| {
RawSearchResult::new( SearchResult::new(
result result
.select(&result_title) .select(&result_title)
.next() .next()

View File

@ -1,7 +1,7 @@
//! This module provides the error enum to handle different errors associated while requesting data from //! This module provides the error enum to handle different errors associated while requesting data from
//! the upstream search engines with the search query provided by the user. //! the upstream search engines with the search query provided by the user.
use crate::results::aggregation_models::RawSearchResult; use crate::results::aggregation_models::SearchResult;
use error_stack::{IntoReport, Result, ResultExt}; use error_stack::{IntoReport, Result, ResultExt};
use std::{collections::HashMap, fmt, time::Duration}; use std::{collections::HashMap, fmt, time::Duration};
@ -45,7 +45,7 @@ impl error_stack::Context for EngineError {}
/// A trait to define common behavior for all search engines. /// A trait to define common behavior for all search engines.
#[async_trait::async_trait] #[async_trait::async_trait]
pub trait SearchEngine { pub trait SearchEngine: Sync + Send {
async fn fetch_html_from_upstream( async fn fetch_html_from_upstream(
&self, &self,
url: String, url: String,
@ -73,5 +73,37 @@ pub trait SearchEngine {
page: u32, page: u32,
user_agent: String, user_agent: String,
request_timeout: u8, request_timeout: u8,
) -> Result<HashMap<String, RawSearchResult>, EngineError>; ) -> Result<HashMap<String, SearchResult>, EngineError>;
}
/// Bundles a boxed search-engine implementation with its canonical name,
/// so engines can be resolved once from config/cookies and passed around safely.
pub struct EngineHandler {
/// The engine implementation behind a trait object.
engine: Box<dyn SearchEngine>,
/// Canonical engine name; `&'static` because it comes from the fixed
/// match table in `EngineHandler::new`.
name: &'static str,
}
/// `Box<dyn SearchEngine>` cannot derive `Clone`, so cloning re-resolves the
/// handler from its stored name instead of copying the trait object.
impl Clone for EngineHandler {
fn clone(&self) -> Self {
// `name` was produced by `Self::new`, so re-parsing it cannot fail.
Self::new(self.name).unwrap()
}
}
impl EngineHandler {
/// parses an engine name into an engine handler, returns none if the engine is unknown
pub fn new(engine_name: &str) -> Option<Self> {
let engine: (&'static str, Box<dyn SearchEngine>) =
match engine_name.to_lowercase().as_str() {
"duckduckgo" => ("duckduckgo", Box::new(super::duckduckgo::DuckDuckGo)),
"searx" => ("searx", Box::new(super::searx::Searx)),
_ => return None,
};
Some(Self {
engine: engine.1,
name: engine.0,
})
}
pub fn into_name_engine(self) -> (&'static str, Box<dyn SearchEngine>) {
(self.name, self.engine)
}
} }

View File

@ -6,7 +6,7 @@ use reqwest::header::{HeaderMap, CONTENT_TYPE, COOKIE, REFERER, USER_AGENT};
use scraper::{Html, Selector}; use scraper::{Html, Selector};
use std::collections::HashMap; use std::collections::HashMap;
use crate::results::aggregation_models::RawSearchResult; use crate::results::aggregation_models::SearchResult;
use super::engine_models::{EngineError, SearchEngine}; use super::engine_models::{EngineError, SearchEngine};
use error_stack::{IntoReport, Report, Result, ResultExt}; use error_stack::{IntoReport, Report, Result, ResultExt};
@ -42,7 +42,7 @@ impl SearchEngine for Searx {
page: u32, page: u32,
user_agent: String, user_agent: String,
request_timeout: u8, request_timeout: u8,
) -> Result<HashMap<String, RawSearchResult>, EngineError> { ) -> Result<HashMap<String, SearchResult>, EngineError> {
// Page number can be missing or empty string and so appropriate handling is required // Page number can be missing or empty string and so appropriate handling is required
// so that upstream server receives valid page number. // so that upstream server receives valid page number.
let url: String = match page { let url: String = match page {
@ -111,7 +111,7 @@ impl SearchEngine for Searx {
Ok(document Ok(document
.select(&results) .select(&results)
.map(|result| { .map(|result| {
RawSearchResult::new( SearchResult::new(
result result
.select(&result_title) .select(&result_title)
.next() .next()

View File

@ -5,54 +5,6 @@ use serde::{Deserialize, Serialize};
use crate::{config::parser_models::Style, engines::engine_models::EngineError}; use crate::{config::parser_models::Style, engines::engine_models::EngineError};
/// A named struct to store, serialize and deserializes the individual search result from all the
/// scraped and aggregated search results from the upstream search engines.
///
/// # Fields
///
/// * `title` - The title of the search result.
/// * `url` - The url to be displayed below the search result title in html.
/// * `description` - The description of the search result.
/// * `engine` - The names of the upstream engines from which these results were provided.
#[derive(Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct SearchResult {
pub title: String,
pub url: String,
pub description: String,
pub engine: Vec<String>,
}
impl SearchResult {
/// Constructs a new `SearchResult` with the given arguments needed for the struct.
///
/// # Arguments
///
/// * `title` - The title of the search result.
/// * `visiting_url` - The url which is accessed when clicked on it
/// (href url in html in simple words).
/// * `url` - The url to be displayed below the search result title in html.
/// * `description` - The description of the search result.
/// * `engine` - The names of the upstream engines from which these results were provided.
pub fn new(title: String, url: String, description: String, engine: Vec<String>) -> Self {
SearchResult {
title,
url,
description,
engine,
}
}
pub fn from_raw(raw: RawSearchResult) -> Self {
SearchResult {
title: raw.title,
url: raw.url,
description: raw.description,
engine: raw.engine,
}
}
}
/// A named struct to store the raw scraped search results from the /// A named struct to store the raw scraped search results from the
/// upstream search engines before aggregating it. It derives the Clone trait which is needed /// upstream search engines before aggregating it. It derives the Clone trait which is needed
/// to write idiomatic rust using `Iterators`. /// to write idiomatic rust using `Iterators`.
@ -64,15 +16,16 @@ impl SearchResult {
/// (href url in html in simple words). /// (href url in html in simple words).
/// * `description` - The description of the search result. /// * `description` - The description of the search result.
/// * `engine` - The names of the upstream engines from which these results were provided. /// * `engine` - The names of the upstream engines from which these results were provided.
#[derive(Clone)] #[derive(Clone, Serialize, Deserialize)]
pub struct RawSearchResult { #[serde(rename_all = "camelCase")]
pub struct SearchResult {
pub title: String, pub title: String,
pub url: String, pub url: String,
pub description: String, pub description: String,
pub engine: Vec<String>, pub engine: Vec<String>,
} }
impl RawSearchResult { impl SearchResult {
/// Constructs a new `RawSearchResult` with the given arguments needed for the struct. /// Constructs a new `SearchResult` with the given arguments needed for the struct.
/// ///
/// # Arguments /// # Arguments
@ -83,7 +36,7 @@ impl RawSearchResult {
/// * `description` - The description of the search result. /// * `description` - The description of the search result.
/// * `engine` - The names of the upstream engines from which these results were provided. /// * `engine` - The names of the upstream engines from which these results were provided.
pub fn new(title: String, url: String, description: String, engine: Vec<String>) -> Self { pub fn new(title: String, url: String, description: String, engine: Vec<String>) -> Self {
RawSearchResult { SearchResult {
title, title,
url, url,
description, description,

View File

@ -8,18 +8,14 @@ use rand::Rng;
use tokio::task::JoinHandle; use tokio::task::JoinHandle;
use super::{ use super::{
aggregation_models::{EngineErrorInfo, RawSearchResult, SearchResult, SearchResults}, aggregation_models::{EngineErrorInfo, SearchResult, SearchResults},
user_agent::random_user_agent, user_agent::random_user_agent,
}; };
use crate::engines::{ use crate::engines::engine_models::{EngineError, EngineHandler};
duckduckgo,
engine_models::{EngineError, SearchEngine},
searx,
};
/// Aliases for long type annotations /// Aliases for long type annotations
type FutureVec = Vec<JoinHandle<Result<HashMap<String, RawSearchResult>, Report<EngineError>>>>; type FutureVec = Vec<JoinHandle<Result<HashMap<String, SearchResult>, Report<EngineError>>>>;
/// The function aggregates the scraped results from the user-selected upstream search engines. /// The function aggregates the scraped results from the user-selected upstream search engines.
/// These engines can be chosen either from the user interface (UI) or from the configuration file. /// These engines can be chosen either from the user interface (UI) or from the configuration file.
@ -64,7 +60,7 @@ pub async fn aggregate(
page: u32, page: u32,
random_delay: bool, random_delay: bool,
debug: bool, debug: bool,
mut upstream_search_engines: Vec<String>, upstream_search_engines: Vec<EngineHandler>,
request_timeout: u8, request_timeout: u8,
) -> Result<SearchResults, Box<dyn std::error::Error>> { ) -> Result<SearchResults, Box<dyn std::error::Error>> {
let user_agent: String = random_user_agent(); let user_agent: String = random_user_agent();
@ -76,24 +72,22 @@ pub async fn aggregate(
tokio::time::sleep(Duration::from_secs(delay_secs)).await; tokio::time::sleep(Duration::from_secs(delay_secs)).await;
} }
let mut names: Vec<&str> = vec![];
// create tasks for upstream result fetching // create tasks for upstream result fetching
let tasks: FutureVec = upstream_search_engines let mut tasks: FutureVec = FutureVec::new();
.iter()
.map(|engine| match engine.to_lowercase().as_str() { for engine_handler in upstream_search_engines {
"duckduckgo" => Box::new(duckduckgo::DuckDuckGo) as Box<dyn SearchEngine + Send + Sync>, let (name, search_engine) = engine_handler.into_name_engine();
"searx" => Box::new(searx::Searx) as Box<dyn SearchEngine + Send + Sync>, names.push(name);
&_ => panic!("Config Error: Incorrect config file option provided"), let query: String = query.clone();
}) let user_agent: String = user_agent.clone();
.map(|search_engine| { tasks.push(tokio::spawn(async move {
let query: String = query.clone(); search_engine
let user_agent: String = user_agent.clone(); .results(query, page, user_agent.clone(), request_timeout)
tokio::spawn(async move { .await
search_engine }));
.results(query, page, user_agent.clone(), request_timeout) }
.await
})
})
.collect();
// get upstream responses // get upstream responses
let mut responses = Vec::with_capacity(tasks.len()); let mut responses = Vec::with_capacity(tasks.len());
@ -105,20 +99,20 @@ pub async fn aggregate(
} }
// aggregate search results, removing duplicates and handling errors the upstream engines returned // aggregate search results, removing duplicates and handling errors the upstream engines returned
let mut result_map: HashMap<String, RawSearchResult> = HashMap::new(); let mut result_map: HashMap<String, SearchResult> = HashMap::new();
let mut engine_errors_info: Vec<EngineErrorInfo> = Vec::new(); let mut engine_errors_info: Vec<EngineErrorInfo> = Vec::new();
let mut handle_error = |error: Report<EngineError>, engine_name: String| { let mut handle_error = |error: Report<EngineError>, engine_name: String| {
log::error!("Engine Error: {:?}", error); log::error!("Engine Error: {:?}", error);
engine_errors_info.push(EngineErrorInfo::new( engine_errors_info.push(EngineErrorInfo::new(
error.downcast_ref::<EngineError>().unwrap(), error.downcast_ref::<EngineError>().unwrap(),
engine_name, engine_name.to_string(),
)); ));
}; };
for _ in 0..responses.len() { for _ in 0..responses.len() {
let response = responses.pop().unwrap(); let response = responses.pop().unwrap();
let engine_name = upstream_search_engines.pop().unwrap(); let engine = names.pop().unwrap().to_string();
if result_map.is_empty() { if result_map.is_empty() {
match response { match response {
@ -126,7 +120,7 @@ pub async fn aggregate(
result_map = results.clone(); result_map = results.clone();
} }
Err(error) => { Err(error) => {
handle_error(error, engine_name.clone()); handle_error(error, engine);
} }
} }
continue; continue;
@ -138,21 +132,18 @@ pub async fn aggregate(
result_map result_map
.entry(key) .entry(key)
.and_modify(|result| { .and_modify(|result| {
result.add_engines(engine_name.clone()); result.add_engines(engine.clone());
}) })
.or_insert_with(|| -> RawSearchResult { value }); .or_insert_with(|| -> SearchResult { value });
}); });
} }
Err(error) => { Err(error) => {
handle_error(error, engine_name.clone()); handle_error(error, engine);
} }
} }
} }
let mut results = Vec::with_capacity(result_map.len()); let results = result_map.into_values().collect();
for (_, result) in result_map {
results.push(SearchResult::from_raw(result))
}
Ok(SearchResults::new( Ok(SearchResults::new(
results, results,

View File

@ -7,6 +7,7 @@ use std::fs::read_to_string;
use crate::{ use crate::{
cache::cacher::RedisCache, cache::cacher::RedisCache,
config::parser::Config, config::parser::Config,
engines::engine_models::EngineHandler,
handler::public_paths::public_path, handler::public_paths::public_path,
results::{aggregation_models::SearchResults, aggregator::aggregate}, results::{aggregation_models::SearchResults, aggregator::aggregate},
}; };
@ -175,12 +176,19 @@ async fn results(
{ {
Some(cookie_value) => { Some(cookie_value) => {
let cookie_value: Cookie = serde_json::from_str(cookie_value.name_value().1)?; let cookie_value: Cookie = serde_json::from_str(cookie_value.name_value().1)?;
let engines = cookie_value
.engines
.iter()
.filter_map(|name| EngineHandler::new(name))
.collect();
aggregate( aggregate(
query, query,
page, page,
config.aggregator.random_delay, config.aggregator.random_delay,
config.debug, config.debug,
cookie_value.engines, engines,
config.request_timeout, config.request_timeout,
) )
.await? .await?