diff --git a/src/config/parser.rs b/src/config/parser.rs index fb9f8b1..0acdd25 100644 --- a/src/config/parser.rs +++ b/src/config/parser.rs @@ -3,6 +3,7 @@ use crate::handler::paths::{file_path, FileType}; +use crate::models::engine_models::{EngineError, EngineHandler}; use crate::models::parser_models::{AggregatorConfig, RateLimiter, Style}; use log::LevelFilter; use mlua::Lua; @@ -28,7 +29,7 @@ pub struct Config { /// It stores the option to whether enable or disable debug mode. pub debug: bool, /// It stores all the engine names that were enabled by the user. - pub upstream_search_engines: Vec, + pub upstream_search_engines: Vec, /// It stores the time (secs) which controls the server request timeout. pub request_timeout: u8, /// It stores the number of threads which controls the app will use to run. @@ -111,8 +112,8 @@ impl Config { .get::<_, HashMap>("upstream_search_engines")? .into_iter() .filter_map(|(key, value)| value.then_some(key)) - .filter_map(|engine| crate::models::engine_models::EngineHandler::new(&engine)) - .collect(), + .map(|engine| EngineHandler::new(&engine)) + .collect::, error_stack::Report>>()?, request_timeout: globals.get::<_, u8>("request_timeout")?, threads, rate_limiter: RateLimiter { diff --git a/src/engines/duckduckgo.rs b/src/engines/duckduckgo.rs index 0f06ea4..352a33b 100644 --- a/src/engines/duckduckgo.rs +++ b/src/engines/duckduckgo.rs @@ -5,7 +5,7 @@ use std::collections::HashMap; use reqwest::header::HeaderMap; -use scraper::{Html, Selector}; +use scraper::Html; use crate::models::aggregation_models::SearchResult; @@ -13,9 +13,29 @@ use crate::models::engine_models::{EngineError, SearchEngine}; use error_stack::{Report, Result, ResultExt}; +use super::search_result_parser::SearchResultParser; + /// A new DuckDuckGo engine type defined in-order to implement the `SearchEngine` trait which allows to /// reduce code duplication as well as allows to create vector of different search engines easily. -pub struct DuckDuckGo; +pub struct DuckDuckGo { + /// The parser, used to interpret the search result. + parser: SearchResultParser, +} + +impl DuckDuckGo { + /// Creates the DuckDuckGo parser. + pub fn new() -> Result { + Ok(Self { + parser: SearchResultParser::new( + ".no-results", + ".result", + ".result__a", + ".result__url", + ".result__snippet", + )?, + }) + } +} #[async_trait::async_trait] impl SearchEngine for DuckDuckGo { @@ -59,58 +79,19 @@ impl SearchEngine for DuckDuckGo { &DuckDuckGo::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?, ); - let no_result: Selector = Selector::parse(".no-results") - .map_err(|_| Report::new(EngineError::UnexpectedError)) - .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".no-results"))?; - - if document.select(&no_result).next().is_some() { + if self.parser.parse_for_no_results(&document).next().is_some() { return Err(Report::new(EngineError::EmptyResultSet)); } - let results: Selector = Selector::parse(".result") - .map_err(|_| Report::new(EngineError::UnexpectedError)) - .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result"))?; - let result_title: Selector = Selector::parse(".result__a") - .map_err(|_| Report::new(EngineError::UnexpectedError)) - .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__a"))?; - let result_url: Selector = Selector::parse(".result__url") - .map_err(|_| Report::new(EngineError::UnexpectedError)) - .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__url"))?; - let result_desc: Selector = Selector::parse(".result__snippet") - .map_err(|_| Report::new(EngineError::UnexpectedError)) - .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__snippet"))?; - // scrape all the results from the html - Ok(document - .select(&results) - .map(|result| { - SearchResult::new( - result - .select(&result_title) - .next() - .unwrap() - .inner_html() - .trim(), - format!( - "https://{}", - result - .select(&result_url) - .next() - .unwrap() - .inner_html() - .trim() - ) - .as_str(), - result - .select(&result_desc) - .next() - .unwrap() - .inner_html() - .trim(), + self.parser + .parse_for_results(&document, |title, url, desc| { + Some(SearchResult::new( + title.inner_html().trim(), + &format!("https://{}", url.inner_html().trim()), + desc.inner_html().trim(), &["duckduckgo"], - ) + )) }) - .map(|search_result| (search_result.url.clone(), search_result)) - .collect()) } } diff --git a/src/engines/mod.rs b/src/engines/mod.rs index 0016728..39b50c8 100644 --- a/src/engines/mod.rs +++ b/src/engines/mod.rs @@ -4,4 +4,5 @@ //! code. Moreover, it also provides a custom error for the upstream search engine handling code. pub mod duckduckgo; +pub mod search_result_parser; pub mod searx; diff --git a/src/engines/search_result_parser.rs b/src/engines/search_result_parser.rs new file mode 100644 index 0000000..0512bdd --- /dev/null +++ b/src/engines/search_result_parser.rs @@ -0,0 +1,76 @@ +//! This modules provides helper functionalities for parsing a html document into internal SearchResult. +use std::collections::HashMap; + +use crate::models::{aggregation_models::SearchResult, engine_models::EngineError}; +use error_stack::{Report, Result}; +use scraper::{html::Select, ElementRef, Html, Selector}; + +/// A html search result parser, based on a predefined CSS selectors. +pub struct SearchResultParser { + /// selector to locate the element which is displayed, if there were nothing found. + no_result: Selector, + /// selector to locate the element which contains one item from the search result. + results: Selector, + /// selector to locate the title relative to the search result item. + result_title: Selector, + /// selector to locate the url relative to the search result item. + result_url: Selector, + /// selector to locate the description relative to the search result item. + result_desc: Selector, +} + +impl SearchResultParser { + /// Creates a new parser, if all the selectors are valid, otherwise it returns an EngineError + pub fn new( + no_result_selector: &str, + results_selector: &str, + result_title_selector: &str, + result_url_selector: &str, + result_desc_selector: &str, + ) -> Result { + Ok(SearchResultParser { + no_result: new_selector(no_result_selector)?, + results: new_selector(results_selector)?, + result_title: new_selector(result_title_selector)?, + result_url: new_selector(result_url_selector)?, + result_desc: new_selector(result_desc_selector)?, + }) + } + + /// Parse the html and returns element representing the 'no result found' response. + pub fn parse_for_no_results<'a>(&'a self, document: &'a Html) -> Select<'a, 'a> { + document.select(&self.no_result) + } + + /// Parse the html, and convert the results to SearchResult with the help of the builder function + pub fn parse_for_results( + &self, + document: &Html, + builder: impl Fn(&ElementRef<'_>, &ElementRef<'_>, &ElementRef<'_>) -> Option, + ) -> Result, EngineError> { + let res = document + .select(&self.results) + .filter_map(|result| { + let title = result.select(&self.result_title).next(); + let url = result.select(&self.result_url).next(); + let desc = result.select(&self.result_desc).next(); + match (title, url, desc) { + (Some(ref t), Some(ref u), Some(ref d)) => builder(t, u, d), + _ => None, + } + }) + .map(|search_result| (search_result.url.clone(), search_result)) + .collect(); + Ok(res) + } +} + +/// Create a Selector struct, if the given parameter is a valid css expression, otherwise convert it into an EngineError. +fn new_selector(selector: &str) -> Result { + Selector::parse(selector).map_err(|err| { + Report::new(EngineError::UnexpectedError).attach_printable(format!( + "invalid CSS selector: {}, err: {:?}", + selector, err + )) + }) +} diff --git a/src/engines/searx.rs b/src/engines/searx.rs index 6ab0469..79c1e95 100644 --- a/src/engines/searx.rs +++ b/src/engines/searx.rs @@ -3,16 +3,35 @@ //! number if provided. use reqwest::header::HeaderMap; -use scraper::{Html, Selector}; +use scraper::Html; use std::collections::HashMap; +use super::search_result_parser::SearchResultParser; use crate::models::aggregation_models::SearchResult; use crate::models::engine_models::{EngineError, SearchEngine}; use error_stack::{Report, Result, ResultExt}; /// A new Searx engine type defined in-order to implement the `SearchEngine` trait which allows to /// reduce code duplication as well as allows to create vector of different search engines easily. -pub struct Searx; +pub struct Searx { + /// The parser, used to interpret the search result. + parser: SearchResultParser, +} + +impl Searx { + /// creates a Searx parser + pub fn new() -> Result { + Ok(Self { + parser: SearchResultParser::new( + "#urls>.dialog-error>p", + ".result", + "h3>a", + "h3>a", + ".content", + )?, + }) + } +} #[async_trait::async_trait] impl SearchEngine for Searx { @@ -52,13 +71,7 @@ impl SearchEngine for Searx { &Searx::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?, ); - let no_result: Selector = Selector::parse("#urls>.dialog-error>p") - .map_err(|_| Report::new(EngineError::UnexpectedError)) - .attach_printable_lazy(|| { - format!("invalid CSS selector: {}", "#urls>.dialog-error>p") - })?; - - if let Some(no_result_msg) = document.select(&no_result).nth(1) { + if let Some(no_result_msg) = self.parser.parse_for_no_results(&document).nth(1) { if no_result_msg.inner_html() == "we didn't find any results. Please use another query or search in more categories" { @@ -66,48 +79,17 @@ impl SearchEngine for Searx { } } - let results: Selector = Selector::parse(".result") - .map_err(|_| Report::new(EngineError::UnexpectedError)) - .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result"))?; - let result_title: Selector = Selector::parse("h3>a") - .map_err(|_| Report::new(EngineError::UnexpectedError)) - .attach_printable_lazy(|| format!("invalid CSS selector: {}", "h3>a"))?; - let result_url: Selector = Selector::parse("h3>a") - .map_err(|_| Report::new(EngineError::UnexpectedError)) - .attach_printable_lazy(|| format!("invalid CSS selector: {}", "h3>a"))?; - - let result_desc: Selector = Selector::parse(".content") - .map_err(|_| Report::new(EngineError::UnexpectedError)) - .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".content"))?; - // scrape all the results from the html - Ok(document - .select(&results) - .map(|result| { - SearchResult::new( - result - .select(&result_title) - .next() - .unwrap() - .inner_html() - .trim(), - result - .select(&result_url) - .next() - .unwrap() - .value() - .attr("href") - .unwrap(), - result - .select(&result_desc) - .next() - .unwrap() - .inner_html() - .trim(), - &["searx"], - ) + self.parser + .parse_for_results(&document, |title, url, desc| { + url.value().attr("href").map(|url| { + SearchResult::new( + title.inner_html().trim(), + url, + desc.inner_html().trim(), + &["searx"], + ) + }) }) - .map(|search_result| (search_result.url.clone(), search_result)) - .collect()) } } diff --git a/src/models/aggregation_models.rs b/src/models/aggregation_models.rs index 72bbf08..660804e 100644 --- a/src/models/aggregation_models.rs +++ b/src/models/aggregation_models.rs @@ -85,12 +85,14 @@ impl EngineErrorInfo { pub fn new(error: &EngineError, engine: &str) -> Self { Self { error: match error { + EngineError::NoSuchEngineFound(_) => "EngineNotFound".to_owned(), EngineError::RequestError => "RequestError".to_owned(), EngineError::EmptyResultSet => "EmptyResultSet".to_owned(), EngineError::UnexpectedError => "UnexpectedError".to_owned(), }, engine: engine.to_owned(), severity_color: match error { + EngineError::NoSuchEngineFound(_) => "red".to_owned(), EngineError::RequestError => "green".to_owned(), EngineError::EmptyResultSet => "blue".to_owned(), EngineError::UnexpectedError => "red".to_owned(), diff --git a/src/models/engine_models.rs b/src/models/engine_models.rs index d4a4e72..05b5a11 100644 --- a/src/models/engine_models.rs +++ b/src/models/engine_models.rs @@ -2,12 +2,14 @@ //! the upstream search engines with the search query provided by the user. use super::aggregation_models::SearchResult; -use error_stack::{Result, ResultExt}; +use error_stack::{Report, Result, ResultExt}; use std::{collections::HashMap, fmt, time::Duration}; /// A custom error type used for handle engine associated errors. #[derive(Debug)] pub enum EngineError { + /// No matching engine found + NoSuchEngineFound(String), /// This variant handles all request related errors like forbidden, not found, /// etc. EmptyResultSet, @@ -24,6 +26,9 @@ pub enum EngineError { impl fmt::Display for EngineError { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { + EngineError::NoSuchEngineFound(engine) => { + write!(f, "No such engine with the name '{engine}' found") + } EngineError::EmptyResultSet => { write!(f, "The upstream search engine returned an empty result set") } @@ -134,18 +139,25 @@ impl EngineHandler { /// # Returns /// /// It returns an option either containing the value or a none if the engine is unknown - pub fn new(engine_name: &str) -> Option { + pub fn new(engine_name: &str) -> Result { let engine: (&'static str, Box) = match engine_name.to_lowercase().as_str() { - "duckduckgo" => ( - "duckduckgo", - Box::new(crate::engines::duckduckgo::DuckDuckGo), - ), - "searx" => ("searx", Box::new(crate::engines::searx::Searx)), - _ => return None, + "duckduckgo" => { + let engine = crate::engines::duckduckgo::DuckDuckGo::new()?; + ("duckduckgo", Box::new(engine)) + } + "searx" => { + let engine = crate::engines::searx::Searx::new()?; + ("searx", Box::new(engine)) + } + _ => { + return Err(Report::from(EngineError::NoSuchEngineFound( + engine_name.to_string(), + ))) + } }; - Some(Self { + Ok(Self { engine: engine.1, name: engine.0, }) diff --git a/src/server/routes/search.rs b/src/server/routes/search.rs index 80db98f..9dbd1e1 100644 --- a/src/server/routes/search.rs +++ b/src/server/routes/search.rs @@ -191,7 +191,7 @@ async fn results( let engines: Vec = cookie_value .engines .iter() - .filter_map(|name| EngineHandler::new(name)) + .filter_map(|name| EngineHandler::new(name).ok()) .collect(); safe_search_level = match config.safe_search {