diff --git a/Cargo.lock b/Cargo.lock
index c3b4f93..568ca5f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -268,6 +268,12 @@ dependencies = [
  "alloc-no-stdlib",
 ]
 
+[[package]]
+name = "anyhow"
+version = "1.0.71"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9c7d0618f0e0b7e8ff11427422b64564d5fb0be1940354bfe2e0529b18a9d9b8"
+
 [[package]]
 name = "askama_escape"
 version = "0.10.3"
@@ -733,6 +739,16 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "error-stack"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5f00447f331c7f726db5b8532ebc9163519eed03c6d7c8b73c90b3ff5646ac85"
+dependencies = [
+ "anyhow",
+ "rustc_version 0.4.0",
+]
+
 [[package]]
 name = "failure"
 version = "0.1.8"
@@ -3373,6 +3389,7 @@ dependencies = [
  "actix-files",
  "actix-web",
  "env_logger",
+ "error-stack",
  "fake-useragent",
  "handlebars",
  "log",
diff --git a/Cargo.toml b/Cargo.toml
index 2e2adfb..9102880 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -23,6 +23,7 @@ redis = {version="*"}
 md5 = {version="*"}
 rand={version="*"}
 once_cell = {version="*"}
+error-stack = "0.3.1"
 
 [dev-dependencies]
 rusty-hook = "^0.11.2"
diff --git a/src/engines/duckduckgo.rs b/src/engines/duckduckgo.rs
index 64c34c3..70c3a87 100644
--- a/src/engines/duckduckgo.rs
+++ b/src/engines/duckduckgo.rs
@@ -9,7 +9,9 @@ use scraper::{Html, Selector};
 
 use crate::search_results_handler::aggregation_models::RawSearchResult;
 
-use super::engine_models::EngineErrorKind;
+use super::engine_models::EngineError;
+
+use error_stack::{IntoReport, Report, Result, ResultExt};
 
 /// This function scrapes results from the upstream engine duckduckgo and puts all the scraped
 /// results like title, visiting_url (href in html),engine (from which engine it was fetched from)
@@ -32,7 +34,7 @@ pub async fn results(
     query: &str,
     page: u32,
     user_agent: &str,
-) -> Result<HashMap<String, RawSearchResult>, EngineErrorKind> {
+) -> Result<HashMap<String, RawSearchResult>, EngineError> {
     // Page number can be missing or empty string and so appropriate handling is required
     // so that upstream server recieves valid page number.
     let url: String = match page {
@@ -51,33 +53,71 @@ pub async fn results(
 
     // initializing HeaderMap and adding appropriate headers.
     let mut header_map = HeaderMap::new();
-    header_map.insert(USER_AGENT, user_agent.parse()?);
-    header_map.insert(REFERER, "https://google.com/".parse()?);
-    header_map.insert(CONTENT_TYPE, "application/x-www-form-urlencoded".parse()?);
-    header_map.insert(COOKIE, "kl=wt-wt".parse()?);
+    header_map.insert(
+        USER_AGENT,
+        user_agent
+            .parse()
+            .into_report()
+            .change_context(EngineError::UnexpectedError)?,
+    );
+    header_map.insert(
+        REFERER,
+        "https://google.com/"
+            .parse()
+            .into_report()
+            .change_context(EngineError::UnexpectedError)?,
+    );
+    header_map.insert(
+        CONTENT_TYPE,
+        "application/x-www-form-urlencoded"
+            .parse()
+            .into_report()
+            .change_context(EngineError::UnexpectedError)?,
+    );
+    header_map.insert(
+        COOKIE,
+        "kl=wt-wt"
+            .parse()
+            .into_report()
+            .change_context(EngineError::UnexpectedError)?,
+    );
 
     // fetch the html from upstream duckduckgo engine
     let results: String = reqwest::Client::new()
         .get(url)
-        .timeout(Duration::from_secs(30))
+        .timeout(Duration::from_secs(5))
        .headers(header_map) // add spoofed headers to emulate human behaviour
         .send()
-        .await?
+        .await
+        .into_report()
+        .change_context(EngineError::RequestError)?
         .text()
-        .await?;
+        .await
+        .into_report()
+        .change_context(EngineError::RequestError)?;
 
     let document: Html = Html::parse_document(&results);
 
-    let no_result: Selector = Selector::parse(".no-results")?;
+    let no_result: Selector = Selector::parse(".no-results")
+        .map_err(|_| Report::new(EngineError::UnexpectedError))
+        .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".no-results"))?;
 
     if document.select(&no_result).next().is_some() {
-        return Err(EngineErrorKind::EmptyResultSet);
+        return Err(Report::new(EngineError::EmptyResultSet));
     }
 
-    let results: Selector = Selector::parse(".result")?;
-    let result_title: Selector = Selector::parse(".result__a")?;
-    let result_url: Selector = Selector::parse(".result__url")?;
-    let result_desc: Selector = Selector::parse(".result__snippet")?;
+    let results: Selector = Selector::parse(".result")
+        .map_err(|_| Report::new(EngineError::UnexpectedError))
+        .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result"))?;
+    let result_title: Selector = Selector::parse(".result__a")
+        .map_err(|_| Report::new(EngineError::UnexpectedError))
+        .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__a"))?;
+    let result_url: Selector = Selector::parse(".result__url")
+        .map_err(|_| Report::new(EngineError::UnexpectedError))
+        .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__url"))?;
+    let result_desc: Selector = Selector::parse(".result__snippet")
+        .map_err(|_| Report::new(EngineError::UnexpectedError))
+        .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__snippet"))?;
 
     // scrape all the results from the html
     Ok(document
diff --git a/src/engines/engine_models.rs b/src/engines/engine_models.rs
index e763852..7a58688 100644
--- a/src/engines/engine_models.rs
+++ b/src/engines/engine_models.rs
@@ -1,8 +1,8 @@
 //! This module provides the error enum to handle different errors associated while requesting data from
 //! the upstream search engines with the search query provided by the user.
 
-use reqwest::header::InvalidHeaderValue;
-use scraper::error::SelectorErrorKind;
+use error_stack::Context;
+use std::fmt;
 
 /// A custom error type used for handle engine associated errors.
 ///
@@ -15,73 +15,29 @@ use scraper::error::SelectorErrorKind;
 /// and are errors mostly related to failure in initialization of HeaderMap, Selector errors and
 /// all other errors occuring within the code handling the `upstream search engines`.
 #[derive(Debug)]
-pub enum EngineErrorKind {
-    RequestError(reqwest::Error),
+pub enum EngineError {
     EmptyResultSet,
-    UnexpectedError {
-        message: String,
-        source: Option<Box<dyn std::error::Error>>,
-    },
+    RequestError,
+    UnexpectedError,
 }
 
-/// Implementing `Display` trait to make errors writable on the stdout and also providing/passing the
-/// appropriate errors that should be written to the stdout when this error is raised/encountered.
-impl std::fmt::Display for EngineErrorKind {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+impl fmt::Display for EngineError {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         match self {
-            EngineErrorKind::RequestError(request_error) => {
-                write!(f, "Request error: {}", request_error)
-            }
-            EngineErrorKind::EmptyResultSet => {
+            EngineError::EmptyResultSet => {
                 write!(f, "The upstream search engine returned an empty result set")
             }
-            EngineErrorKind::UnexpectedError { message, source } => {
-                write!(f, "Unexpected error: {}", message)?;
-                if let Some(source) = source {
-                    write!(f, "\nCaused by: {}", source)?;
-                }
-                Ok(())
+            EngineError::RequestError => {
+                write!(
+                    f,
+                    "Error occurred while requesting data from upstream search engine"
+                )
+            }
+            EngineError::UnexpectedError => {
+                write!(f, "An unexpected error occurred while processing the data")
             }
         }
     }
 }
 
-/// Implementing `Error` trait to make the the `EngineErrorKind` enum an error type and
-/// mapping `ReqwestErrors` to `RequestError` and `UnexpectedError` errors to all other unexpected
-/// errors ocurring within the code handling the upstream search engines.
-impl std::error::Error for EngineErrorKind {
-    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
-        match self {
-            EngineErrorKind::RequestError(request_error) => Some(request_error),
-            EngineErrorKind::UnexpectedError { source, .. } => source.as_deref().map(|s| s),
-            _ => None,
-        }
-    }
-}
-
-/// Implementing `From` trait to map the `SelectorErrorKind` to `UnexpectedError` variant.
-impl From<SelectorErrorKind<'_>> for EngineErrorKind {
-    fn from(err: SelectorErrorKind<'_>) -> Self {
-        Self::UnexpectedError {
-            message: err.to_string(),
-            source: None,
-        }
-    }
-}
-
-/// Implementing `From` trait to map the `InvalidHeaderValue` to `UnexpectedError` variant.
-impl From<InvalidHeaderValue> for EngineErrorKind {
-    fn from(err: InvalidHeaderValue) -> Self {
-        Self::UnexpectedError {
-            message: err.to_string(),
-            source: Some(Box::new(err)),
-        }
-    }
-}
-
-/// Implementing `From` trait to map all `reqwest::Error` to `UnexpectedError` variant.
-impl From<reqwest::Error> for EngineErrorKind {
-    fn from(err: reqwest::Error) -> Self {
-        Self::RequestError(err)
-    }
-}
+impl Context for EngineError {}
diff --git a/src/engines/searx.rs b/src/engines/searx.rs
index feab464..bc68608 100644
--- a/src/engines/searx.rs
+++ b/src/engines/searx.rs
@@ -8,7 +8,8 @@ use std::collections::HashMap;
 
 use crate::search_results_handler::aggregation_models::RawSearchResult;
 
-use super::engine_models::EngineErrorKind;
+use super::engine_models::EngineError;
+use error_stack::{IntoReport, Report, Result, ResultExt};
 
 /// This function scrapes results from the upstream engine duckduckgo and puts all the scraped
 /// results like title, visiting_url (href in html),engine (from which engine it was fetched from)
@@ -31,43 +32,76 @@ pub async fn results(
     query: &str,
     page: u32,
     user_agent: &str,
-) -> Result<HashMap<String, RawSearchResult>, EngineErrorKind> {
+) -> Result<HashMap<String, RawSearchResult>, EngineError> {
     // Page number can be missing or empty string and so appropriate handling is required
     // so that upstream server recieves valid page number.
     let url: String = format!("https://searx.work/search?q={query}&pageno={page}");
 
     // initializing headers and adding appropriate headers.
     let mut header_map = HeaderMap::new();
-    header_map.insert(USER_AGENT, user_agent.parse()?);
-    header_map.insert(REFERER, "https://google.com/".parse()?);
-    header_map.insert(CONTENT_TYPE, "application/x-www-form-urlencoded".parse()?);
-    header_map.insert(COOKIE, "categories=general; language=auto; locale=en; autocomplete=duckduckgo; image_proxy=1; method=POST; safesearch=2; theme=simple; results_on_new_tab=1; doi_resolver=oadoi.org; simple_style=auto; center_alignment=1; query_in_title=1; infinite_scroll=0; disabled_engines=; enabled_engines=\"archive is__general\\054yep__general\\054curlie__general\\054currency__general\\054ddg definitions__general\\054wikidata__general\\054duckduckgo__general\\054tineye__general\\054lingva__general\\054startpage__general\\054yahoo__general\\054wiby__general\\054marginalia__general\\054alexandria__general\\054wikibooks__general\\054wikiquote__general\\054wikisource__general\\054wikiversity__general\\054wikivoyage__general\\054dictzone__general\\054seznam__general\\054mojeek__general\\054naver__general\\054wikimini__general\\054brave__general\\054petalsearch__general\\054goo__general\"; disabled_plugins=; enabled_plugins=\"searx.plugins.hostname_replace\\054searx.plugins.oa_doi_rewrite\\054searx.plugins.vim_hotkeys\"; tokens=; maintab=on; enginetab=on".parse()?);
+    header_map.insert(
+        USER_AGENT,
+        user_agent
+            .parse()
+            .into_report()
+            .change_context(EngineError::UnexpectedError)?,
+    );
+    header_map.insert(
+        REFERER,
+        "https://google.com/"
+            .parse()
+            .into_report()
+            .change_context(EngineError::UnexpectedError)?,
+    );
+    header_map.insert(
+        CONTENT_TYPE,
+        "application/x-www-form-urlencoded"
+            .parse()
+            .into_report()
+            .change_context(EngineError::UnexpectedError)?,
+    );
+    header_map.insert(COOKIE, "categories=general; language=auto; locale=en; autocomplete=duckduckgo; image_proxy=1; method=POST; safesearch=2; theme=simple; results_on_new_tab=1; doi_resolver=oadoi.org; simple_style=auto; center_alignment=1; query_in_title=1; infinite_scroll=0; disabled_engines=; enabled_engines=\"archive is__general\\054yep__general\\054curlie__general\\054currency__general\\054ddg definitions__general\\054wikidata__general\\054duckduckgo__general\\054tineye__general\\054lingva__general\\054startpage__general\\054yahoo__general\\054wiby__general\\054marginalia__general\\054alexandria__general\\054wikibooks__general\\054wikiquote__general\\054wikisource__general\\054wikiversity__general\\054wikivoyage__general\\054dictzone__general\\054seznam__general\\054mojeek__general\\054naver__general\\054wikimini__general\\054brave__general\\054petalsearch__general\\054goo__general\"; disabled_plugins=; enabled_plugins=\"searx.plugins.hostname_replace\\054searx.plugins.oa_doi_rewrite\\054searx.plugins.vim_hotkeys\"; tokens=; maintab=on; enginetab=on".parse().into_report().change_context(EngineError::UnexpectedError)?);
 
     // fetch the html from upstream searx instance engine
     let results: String = reqwest::Client::new()
         .get(url)
         .headers(header_map) // add spoofed headers to emulate human behaviours.
         .send()
-        .await?
+        .await
+        .into_report()
+        .change_context(EngineError::RequestError)?
         .text()
-        .await?;
+        .await
+        .into_report()
+        .change_context(EngineError::RequestError)?;
 
     let document: Html = Html::parse_document(&results);
 
-    let no_result: Selector = Selector::parse("#urls>.dialog-error>p")?;
+    let no_result: Selector = Selector::parse("#urls>.dialog-error>p")
+        .map_err(|_| Report::new(EngineError::UnexpectedError))
+        .attach_printable_lazy(|| format!("invalid CSS selector: {}", "#urls>.dialog-error>p"))?;
 
     if let Some(no_result_msg) = document.select(&no_result).nth(1) {
         if no_result_msg.inner_html()
             == "we didn't find any results. Please use another query or search in more categories"
         {
-            return Err(EngineErrorKind::EmptyResultSet);
+            return Err(Report::new(EngineError::EmptyResultSet));
         }
     }
 
-    let results: Selector = Selector::parse(".result")?;
-    let result_title: Selector = Selector::parse("h3>a")?;
-    let result_url: Selector = Selector::parse("h3>a")?;
-    let result_desc: Selector = Selector::parse(".content")?;
+    let results: Selector = Selector::parse(".result")
+        .map_err(|_| Report::new(EngineError::UnexpectedError))
+        .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result"))?;
+    let result_title: Selector = Selector::parse("h3>a")
+        .map_err(|_| Report::new(EngineError::UnexpectedError))
+        .attach_printable_lazy(|| format!("invalid CSS selector: {}", "h3>a"))?;
+    let result_url: Selector = Selector::parse("h3>a")
+        .map_err(|_| Report::new(EngineError::UnexpectedError))
+        .attach_printable_lazy(|| format!("invalid CSS selector: {}", "h3>a"))?;
+
+    let result_desc: Selector = Selector::parse(".content")
+        .map_err(|_| Report::new(EngineError::UnexpectedError))
+        .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".content"))?;
 
     // scrape all the results from the html
     Ok(document
diff --git a/src/search_results_handler/aggregator.rs b/src/search_results_handler/aggregator.rs
index 8b6bae3..cba266c 100644
--- a/src/search_results_handler/aggregator.rs
+++ b/src/search_results_handler/aggregator.rs
@@ -58,8 +58,19 @@ pub async fn aggregate(
         searx::results(query, page, &user_agent)
     );
 
-    let ddg_map_results: HashMap<String, RawSearchResult> = ddg_map_results?;
-    let searx_map_results: HashMap<String, RawSearchResult> = searx_map_results?;
+    let ddg_map_results = ddg_map_results.unwrap_or_else(|e| {
+        if debug {
+            log::error!("Error fetching results from DuckDuckGo: {:?}", e);
+        }
+        HashMap::new()
+    });
+
+    let searx_map_results = searx_map_results.unwrap_or_else(|e| {
+        if debug {
+            log::error!("Error fetching results from Searx: {:?}", e);
+        }
+        HashMap::new()
+    });
 
     result_map.extend(ddg_map_results);
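
For reference, below is a minimal, self-contained sketch (not part of the diff) of the error-stack pattern this change adopts. The `parse_page` function, its `std::num::ParseIntError` source error, and the single-variant enum are illustrative stand-ins rather than code from this repository, but the `Context` impl and the `into_report()`, `change_context()`, and `attach_printable_lazy()` calls mirror how the engines now build `Report<EngineError>` values with error-stack 0.3.

use std::fmt;

use error_stack::{Context, IntoReport, Result, ResultExt};

/// Illustrative stand-in for the PR's `EngineError` context type.
#[derive(Debug)]
enum EngineError {
    UnexpectedError,
}

impl fmt::Display for EngineError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "An unexpected error occurred while processing the data")
    }
}

// Implementing `Context` is what allows the enum to be carried inside a `Report`.
impl Context for EngineError {}

/// Converts a library error (`std::num::ParseIntError` here) into a
/// `Report<EngineError>` and attaches a lazily built printable message,
/// the same shape as the header-parsing and selector-parsing calls above.
fn parse_page(raw: &str) -> Result<u32, EngineError> {
    raw.parse::<u32>()
        .into_report()
        .change_context(EngineError::UnexpectedError)
        .attach_printable_lazy(|| format!("invalid page number: {raw}"))
}

fn main() {
    // Printing the report with `{:?}` shows the context, the attached
    // message, and the original `ParseIntError` frame.
    if let Err(report) = parse_page("not-a-number") {
        eprintln!("{report:?}");
    }
}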