From 75a77d25f06f2dbf53494655f76719568d7adc9f Mon Sep 17 00:00:00 2001
From: Zsombor Gegesy
Date: Sun, 24 Sep 2023 13:54:08 +0200
Subject: [PATCH] Create separate search_result_parser

---
 src/config/parser.rs                |  2 +-
 src/engines/duckduckgo.rs           | 49 ++++++++++++++-------------
 src/engines/mod.rs                  |  1 +
 src/engines/search_result_parser.rs | 38 +++++++++++++++++++++
 src/engines/searx.rs                | 52 ++++++++++++++----------------
 src/models/aggregation_models.rs    |  2 ++
 src/models/engine_models.rs         | 26 ++++++++++-----
 src/server/routes/search.rs         |  2 +-
 8 files changed, 110 insertions(+), 62 deletions(-)
 create mode 100644 src/engines/search_result_parser.rs

diff --git a/src/config/parser.rs b/src/config/parser.rs
index fb9f8b1..d7202a6 100644
--- a/src/config/parser.rs
+++ b/src/config/parser.rs
@@ -111,7 +111,7 @@ impl Config {
                 .get::<_, HashMap<String, bool>>("upstream_search_engines")?
                 .into_iter()
                 .filter_map(|(key, value)| value.then_some(key))
-                .filter_map(|engine| crate::models::engine_models::EngineHandler::new(&engine))
+                .filter_map(|engine| crate::models::engine_models::EngineHandler::new(&engine).ok())
                 .collect(),
             request_timeout: globals.get::<_, u8>("request_timeout")?,
             threads,
diff --git a/src/engines/duckduckgo.rs b/src/engines/duckduckgo.rs
index 0f06ea4..318e764 100644
--- a/src/engines/duckduckgo.rs
+++ b/src/engines/duckduckgo.rs
@@ -5,7 +5,7 @@
 use std::collections::HashMap;
 
 use reqwest::header::HeaderMap;
-use scraper::{Html, Selector};
+use scraper::Html;
 
 use crate::models::aggregation_models::SearchResult;
 
@@ -13,9 +13,27 @@ use crate::models::engine_models::{EngineError, SearchEngine};
 
 use error_stack::{Report, Result, ResultExt};
 
+use super::search_result_parser::SearchResultParser;
+
 /// A new DuckDuckGo engine type defined in-order to implement the `SearchEngine` trait which allows to
 /// reduce code duplication as well as allows to create vector of different search engines easily.
-pub struct DuckDuckGo;
+pub struct DuckDuckGo {
+    parser: SearchResultParser,
+}
+
+impl DuckDuckGo {
+    pub fn new() -> Result<Self, EngineError> {
+        Ok(Self {
+            parser: SearchResultParser::new(
+                ".no-results",
+                ".result",
+                ".result__a",
+                ".result__url",
+                ".result__snippet",
+            )?,
+        })
+    }
+}
 
 #[async_trait::async_trait]
 impl SearchEngine for DuckDuckGo {
@@ -59,34 +77,17 @@ impl SearchEngine for DuckDuckGo {
             &DuckDuckGo::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
         );
 
-        let no_result: Selector = Selector::parse(".no-results")
-            .map_err(|_| Report::new(EngineError::UnexpectedError))
-            .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".no-results"))?;
-
-        if document.select(&no_result).next().is_some() {
+        if document.select(&self.parser.no_result).next().is_some() {
             return Err(Report::new(EngineError::EmptyResultSet));
         }
 
-        let results: Selector = Selector::parse(".result")
-            .map_err(|_| Report::new(EngineError::UnexpectedError))
-            .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result"))?;
-        let result_title: Selector = Selector::parse(".result__a")
-            .map_err(|_| Report::new(EngineError::UnexpectedError))
-            .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__a"))?;
-        let result_url: Selector = Selector::parse(".result__url")
-            .map_err(|_| Report::new(EngineError::UnexpectedError))
-            .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__url"))?;
-        let result_desc: Selector = Selector::parse(".result__snippet")
-            .map_err(|_| Report::new(EngineError::UnexpectedError))
-            .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__snippet"))?;
-
         // scrape all the results from the html
         Ok(document
-            .select(&results)
+            .select(&self.parser.results)
             .map(|result| {
                 SearchResult::new(
                     result
-                        .select(&result_title)
+                        .select(&self.parser.result_title)
                         .next()
                         .unwrap()
                         .inner_html()
@@ -94,7 +95,7 @@
                     format!(
                         "https://{}",
                         result
-                            .select(&result_url)
+                            .select(&self.parser.result_url)
                             .next()
                             .unwrap()
                             .inner_html()
@@ -102,7 +103,7 @@
                     )
                     .as_str(),
                     result
-                        .select(&result_desc)
+                        .select(&self.parser.result_desc)
                         .next()
                         .unwrap()
                         .inner_html()
diff --git a/src/engines/mod.rs b/src/engines/mod.rs
index 0016728..39b50c8 100644
--- a/src/engines/mod.rs
+++ b/src/engines/mod.rs
@@ -4,4 +4,5 @@
 //! code. Moreover, it also provides a custom error for the upstream search engine handling code.
 
 pub mod duckduckgo;
+pub mod search_result_parser;
 pub mod searx;
diff --git a/src/engines/search_result_parser.rs b/src/engines/search_result_parser.rs
new file mode 100644
index 0000000..6918917
--- /dev/null
+++ b/src/engines/search_result_parser.rs
@@ -0,0 +1,38 @@
+use crate::models::engine_models::EngineError;
+use error_stack::{Report, Result, ResultExt};
+use scraper::{Html, Selector};
+
+pub struct SearchResultParser {
+    pub no_result: Selector,
+    pub results: Selector,
+    pub result_title: Selector,
+    pub result_url: Selector,
+    pub result_desc: Selector,
+}
+
+impl SearchResultParser {
+    pub fn new(
+        no_result_selector: &str,
+        results_selector: &str,
+        result_title_selector: &str,
+        result_url_selector: &str,
+        result_desc_selector: &str,
+    ) -> Result<SearchResultParser, EngineError> {
+        Ok(SearchResultParser {
+            no_result: new_selector(no_result_selector)?,
+            results: new_selector(results_selector)?,
+            result_title: new_selector(result_title_selector)?,
+            result_url: new_selector(result_url_selector)?,
+            result_desc: new_selector(result_desc_selector)?,
+        })
+    }
+}
+
+fn new_selector(selector: &str) -> Result<Selector, EngineError> {
+    Selector::parse(selector).map_err(|err| {
+        Report::new(EngineError::UnexpectedError).attach_printable(format!(
+            "invalid CSS selector: {}, err: {:?}",
+            selector, err
+        ))
+    })
+}
diff --git a/src/engines/searx.rs b/src/engines/searx.rs
index 6ab0469..32f286e 100644
--- a/src/engines/searx.rs
+++ b/src/engines/searx.rs
@@ -3,16 +3,34 @@
 //! number if provided.
 
 use reqwest::header::HeaderMap;
-use scraper::{Html, Selector};
+use scraper::Html;
 use std::collections::HashMap;
 
+use super::search_result_parser::SearchResultParser;
 use crate::models::aggregation_models::SearchResult;
 use crate::models::engine_models::{EngineError, SearchEngine};
 use error_stack::{Report, Result, ResultExt};
 
 /// A new Searx engine type defined in-order to implement the `SearchEngine` trait which allows to
 /// reduce code duplication as well as allows to create vector of different search engines easily.
-pub struct Searx;
+pub struct Searx {
+    parser: SearchResultParser,
+}
+
+impl Searx {
+    // new Searx engine
+    pub fn new() -> Result<Searx, EngineError> {
+        Ok(Self {
+            parser: SearchResultParser::new(
+                "#urls>.dialog-error>p",
+                ".result",
+                "h3>a",
+                "h3>a",
+                ".content",
+            )?,
+        })
+    }
+}
 
 #[async_trait::async_trait]
 impl SearchEngine for Searx {
@@ -52,13 +70,7 @@
             &Searx::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
         );
 
-        let no_result: Selector = Selector::parse("#urls>.dialog-error>p")
-            .map_err(|_| Report::new(EngineError::UnexpectedError))
-            .attach_printable_lazy(|| {
-                format!("invalid CSS selector: {}", "#urls>.dialog-error>p")
-            })?;
-
-        if let Some(no_result_msg) = document.select(&no_result).nth(1) {
+        if let Some(no_result_msg) = document.select(&self.parser.no_result).nth(1) {
             if no_result_msg.inner_html()
                 == "we didn't find any results. Please use another query or search in more categories"
             {
@@ -66,40 +78,26 @@
             }
         }
 
-        let results: Selector = Selector::parse(".result")
-            .map_err(|_| Report::new(EngineError::UnexpectedError))
-            .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result"))?;
-        let result_title: Selector = Selector::parse("h3>a")
-            .map_err(|_| Report::new(EngineError::UnexpectedError))
-            .attach_printable_lazy(|| format!("invalid CSS selector: {}", "h3>a"))?;
-        let result_url: Selector = Selector::parse("h3>a")
-            .map_err(|_| Report::new(EngineError::UnexpectedError))
-            .attach_printable_lazy(|| format!("invalid CSS selector: {}", "h3>a"))?;
-
-        let result_desc: Selector = Selector::parse(".content")
-            .map_err(|_| Report::new(EngineError::UnexpectedError))
-            .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".content"))?;
-
         // scrape all the results from the html
         Ok(document
-            .select(&results)
+            .select(&self.parser.results)
             .map(|result| {
                 SearchResult::new(
                     result
-                        .select(&result_title)
+                        .select(&self.parser.result_title)
                         .next()
                         .unwrap()
                         .inner_html()
                         .trim(),
                     result
-                        .select(&result_url)
+                        .select(&self.parser.result_url)
                         .next()
                         .unwrap()
                         .value()
                         .attr("href")
                         .unwrap(),
                     result
-                        .select(&result_desc)
+                        .select(&self.parser.result_desc)
                         .next()
                         .unwrap()
                         .inner_html()
diff --git a/src/models/aggregation_models.rs b/src/models/aggregation_models.rs
index 72bbf08..6e4bddf 100644
--- a/src/models/aggregation_models.rs
+++ b/src/models/aggregation_models.rs
@@ -85,12 +85,14 @@ impl EngineErrorInfo {
     pub fn new(error: &EngineError, engine: &str) -> Self {
         Self {
             error: match error {
+                EngineError::EngineNotFound => "EngineNotFound".to_owned(),
                 EngineError::RequestError => "RequestError".to_owned(),
                 EngineError::EmptyResultSet => "EmptyResultSet".to_owned(),
                 EngineError::UnexpectedError => "UnexpectedError".to_owned(),
             },
             engine: engine.to_owned(),
             severity_color: match error {
+                EngineError::EngineNotFound => "red".to_owned(),
                 EngineError::RequestError => "green".to_owned(),
                 EngineError::EmptyResultSet => "blue".to_owned(),
                 EngineError::UnexpectedError => "red".to_owned(),
diff --git a/src/models/engine_models.rs b/src/models/engine_models.rs
index d4a4e72..77ec4c4 100644
--- a/src/models/engine_models.rs
+++ b/src/models/engine_models.rs
@@ -2,12 +2,14 @@
 //! the upstream search engines with the search query provided by the user.
 
 use super::aggregation_models::SearchResult;
-use error_stack::{Result, ResultExt};
+use error_stack::{Report, Result, ResultExt};
 use std::{collections::HashMap, fmt, time::Duration};
 
 /// A custom error type used for handle engine associated errors.
 #[derive(Debug)]
 pub enum EngineError {
+    // No matching engine found
+    EngineNotFound,
     /// This variant handles all request related errors like forbidden, not found,
     /// etc.
     EmptyResultSet,
@@ -24,6 +26,9 @@ pub enum EngineError {
 impl fmt::Display for EngineError {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         match self {
+            EngineError::EngineNotFound => {
+                write!(f, "Search engine not found")
+            }
             EngineError::EmptyResultSet => {
                 write!(f, "The upstream search engine returned an empty result set")
             }
@@ -134,18 +139,21 @@ impl EngineHandler {
     /// # Returns
     ///
     /// It returns an option either containing the value or a none if the engine is unknown
-    pub fn new(engine_name: &str) -> Option<Self> {
+    pub fn new(engine_name: &str) -> Result<Self, EngineError> {
         let engine: (&'static str, Box<dyn SearchEngine>) =
             match engine_name.to_lowercase().as_str() {
-                "duckduckgo" => (
-                    "duckduckgo",
-                    Box::new(crate::engines::duckduckgo::DuckDuckGo),
-                ),
-                "searx" => ("searx", Box::new(crate::engines::searx::Searx)),
-                _ => return None,
+                "duckduckgo" => {
+                    let engine = crate::engines::duckduckgo::DuckDuckGo::new()?;
+                    ("duckduckgo", Box::new(engine))
+                }
+                "searx" => {
+                    let engine = crate::engines::searx::Searx::new()?;
+                    ("searx", Box::new(engine))
+                }
+                _ => return Err(Report::from(EngineError::EngineNotFound)),
             };
 
-        Some(Self {
+        Ok(Self {
             engine: engine.1,
             name: engine.0,
         })
diff --git a/src/server/routes/search.rs b/src/server/routes/search.rs
index 80db98f..9dbd1e1 100644
--- a/src/server/routes/search.rs
+++ b/src/server/routes/search.rs
@@ -191,7 +191,7 @@ async fn results(
             let engines: Vec<EngineHandler> = cookie_value
                 .engines
                 .iter()
-                .filter_map(|name| EngineHandler::new(name))
+                .filter_map(|name| EngineHandler::new(name).ok())
                 .collect();
 
             safe_search_level = match config.safe_search {
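
Usage note (outside the patch): SearchResultParser compiles the five CSS selectors once in an engine's constructor, so the per-request scraping code reuses the stored Selector fields instead of re-parsing selectors on every query. A minimal sketch of how a further engine could adopt it, assuming the same module layout as this patch; the engine name Example and its selector strings are hypothetical placeholders, not part of the patch:

    use error_stack::Result;

    use super::search_result_parser::SearchResultParser;
    use crate::models::engine_models::EngineError;

    /// Hypothetical engine showing the SearchResultParser::new argument order:
    /// no-result marker, result container, title, URL, description.
    pub struct Example {
        parser: SearchResultParser,
    }

    impl Example {
        pub fn new() -> Result<Self, EngineError> {
            Ok(Self {
                parser: SearchResultParser::new(
                    ".no-results",   // element shown when the upstream has no hits
                    ".result",       // one node per search hit
                    ".result-title", // placeholder title selector
                    ".result-link",  // placeholder URL selector
                    ".result-text",  // placeholder description selector
                )?,
            })
        }
    }

Invalid selectors surface as EngineError::UnexpectedError reports from new_selector, and EngineHandler::new now propagates them (along with the new EngineNotFound variant for unknown engine names) as a Result instead of silently returning None.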