From f9b9e87a0e82361c3148ee7cb0a0f24b3bcbe254 Mon Sep 17 00:00:00 2001 From: neon_arch Date: Tue, 11 Jul 2023 19:42:17 +0300 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20feat:=20rewrite=20code=20by=20imple?= =?UTF-8?q?menting=20common=20engine=20trait=20`SearchEngine`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/engines/duckduckgo.rs | 225 +++++++++++++++++++------------------- src/engines/searx.rs | 193 ++++++++++++++++---------------- 2 files changed, 205 insertions(+), 213 deletions(-) diff --git a/src/engines/duckduckgo.rs b/src/engines/duckduckgo.rs index 70c3a87..e13ca4c 100644 --- a/src/engines/duckduckgo.rs +++ b/src/engines/duckduckgo.rs @@ -9,7 +9,7 @@ use scraper::{Html, Selector}; use crate::search_results_handler::aggregation_models::RawSearchResult; -use super::engine_models::EngineError; +use super::engine_models::{EngineError, SearchEngine}; use error_stack::{IntoReport, Report, Result, ResultExt}; @@ -30,126 +30,121 @@ use error_stack::{IntoReport, Report, Result, ResultExt}; /// reach the above `upstream search engine` page or if the `upstream search engine` is unable to /// provide results for the requested search query and also returns error if the scraping selector /// or HeaderMap fails to initialize. -pub async fn results( - query: &str, - page: u32, - user_agent: &str, -) -> Result, EngineError> { - // Page number can be missing or empty string and so appropriate handling is required - // so that upstream server recieves valid page number. - let url: String = match page { - 1 => { - format!("https://html.duckduckgo.com/html/?q={query}&s=&dc=&v=1&o=json&api=/d.js") - } - _ => { - format!( - "https://duckduckgo.com/html/?q={}&s={}&dc={}&v=1&o=json&api=/d.js", - query, - (page / 2 + (page % 2)) * 30, - (page / 2 + (page % 2)) * 30 + 1 - ) - } - }; - // initializing HeaderMap and adding appropriate headers. - let mut header_map = HeaderMap::new(); - header_map.insert( - USER_AGENT, - user_agent - .parse() - .into_report() - .change_context(EngineError::UnexpectedError)?, - ); - header_map.insert( - REFERER, - "https://google.com/" - .parse() - .into_report() - .change_context(EngineError::UnexpectedError)?, - ); - header_map.insert( - CONTENT_TYPE, - "application/x-www-form-urlencoded" - .parse() - .into_report() - .change_context(EngineError::UnexpectedError)?, - ); - header_map.insert( - COOKIE, - "kl=wt-wt" - .parse() - .into_report() - .change_context(EngineError::UnexpectedError)?, - ); +pub struct DuckDuckGo; - // fetch the html from upstream duckduckgo engine - let results: String = reqwest::Client::new() - .get(url) - .timeout(Duration::from_secs(5)) - .headers(header_map) // add spoofed headers to emulate human behaviour - .send() - .await - .into_report() - .change_context(EngineError::RequestError)? - .text() - .await - .into_report() - .change_context(EngineError::RequestError)?; - - let document: Html = Html::parse_document(&results); - - let no_result: Selector = Selector::parse(".no-results") - .map_err(|_| Report::new(EngineError::UnexpectedError)) - .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".no-results"))?; - - if document.select(&no_result).next().is_some() { - return Err(Report::new(EngineError::EmptyResultSet)); - } - - let results: Selector = Selector::parse(".result") - .map_err(|_| Report::new(EngineError::UnexpectedError)) - .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result"))?; - let result_title: Selector = Selector::parse(".result__a") - .map_err(|_| Report::new(EngineError::UnexpectedError)) - .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__a"))?; - let result_url: Selector = Selector::parse(".result__url") - .map_err(|_| Report::new(EngineError::UnexpectedError)) - .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__url"))?; - let result_desc: Selector = Selector::parse(".result__snippet") - .map_err(|_| Report::new(EngineError::UnexpectedError)) - .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__snippet"))?; - - // scrape all the results from the html - Ok(document - .select(&results) - .map(|result| { - RawSearchResult::new( - result - .select(&result_title) - .next() - .unwrap() - .inner_html() - .trim() - .to_string(), +#[async_trait::async_trait] +impl SearchEngine for DuckDuckGo { + async fn results( + &self, + query: String, + page: u32, + user_agent: String, + ) -> Result, EngineError> { + // Page number can be missing or empty string and so appropriate handling is required + // so that upstream server recieves valid page number. + let url: String = match page { + 1 => { + format!("https://html.duckduckgo.com/html/?q={query}&s=&dc=&v=1&o=json&api=/d.js") + } + _ => { format!( - "https://{}", + "https://duckduckgo.com/html/?q={}&s={}&dc={}&v=1&o=json&api=/d.js", + query, + (page / 2 + (page % 2)) * 30, + (page / 2 + (page % 2)) * 30 + 1 + ) + } + }; + + // initializing HeaderMap and adding appropriate headers. + let mut header_map = HeaderMap::new(); + header_map.insert( + USER_AGENT, + user_agent + .parse() + .into_report() + .change_context(EngineError::UnexpectedError)?, + ); + header_map.insert( + REFERER, + "https://google.com/" + .parse() + .into_report() + .change_context(EngineError::UnexpectedError)?, + ); + header_map.insert( + CONTENT_TYPE, + "application/x-www-form-urlencoded" + .parse() + .into_report() + .change_context(EngineError::UnexpectedError)?, + ); + header_map.insert( + COOKIE, + "kl=wt-wt" + .parse() + .into_report() + .change_context(EngineError::UnexpectedError)?, + ); + + let document: Html = Html::parse_document( + &DuckDuckGo::fetch_html_from_upstream(&self, url, header_map).await?, + ); + + let no_result: Selector = Selector::parse(".no-results") + .map_err(|_| Report::new(EngineError::UnexpectedError)) + .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".no-results"))?; + + if document.select(&no_result).next().is_some() { + return Err(Report::new(EngineError::EmptyResultSet)); + } + + let results: Selector = Selector::parse(".result") + .map_err(|_| Report::new(EngineError::UnexpectedError)) + .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result"))?; + let result_title: Selector = Selector::parse(".result__a") + .map_err(|_| Report::new(EngineError::UnexpectedError)) + .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__a"))?; + let result_url: Selector = Selector::parse(".result__url") + .map_err(|_| Report::new(EngineError::UnexpectedError)) + .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__url"))?; + let result_desc: Selector = Selector::parse(".result__snippet") + .map_err(|_| Report::new(EngineError::UnexpectedError)) + .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__snippet"))?; + + // scrape all the results from the html + Ok(document + .select(&results) + .map(|result| { + RawSearchResult::new( result - .select(&result_url) + .select(&result_title) .next() .unwrap() .inner_html() .trim() - ), - result - .select(&result_desc) - .next() - .unwrap() - .inner_html() - .trim() - .to_string(), - vec!["duckduckgo".to_string()], - ) - }) - .map(|search_result| (search_result.visiting_url.clone(), search_result)) - .collect()) + .to_string(), + format!( + "https://{}", + result + .select(&result_url) + .next() + .unwrap() + .inner_html() + .trim() + ), + result + .select(&result_desc) + .next() + .unwrap() + .inner_html() + .trim() + .to_string(), + vec!["duckduckgo".to_string()], + ) + }) + .map(|search_result| (search_result.visiting_url.clone(), search_result)) + .collect()) + } } diff --git a/src/engines/searx.rs b/src/engines/searx.rs index bc68608..56e3b23 100644 --- a/src/engines/searx.rs +++ b/src/engines/searx.rs @@ -8,7 +8,7 @@ use std::collections::HashMap; use crate::search_results_handler::aggregation_models::RawSearchResult; -use super::engine_models::EngineError; +use super::engine_models::{EngineError, SearchEngine}; use error_stack::{IntoReport, Report, Result, ResultExt}; /// This function scrapes results from the upstream engine duckduckgo and puts all the scraped @@ -28,111 +28,108 @@ use error_stack::{IntoReport, Report, Result, ResultExt}; /// reach the above `upstream search engine` page or if the `upstream search engine` is unable to /// provide results for the requested search query and also returns error if the scraping selector /// or HeaderMap fails to initialize. -pub async fn results( - query: &str, - page: u32, - user_agent: &str, -) -> Result, EngineError> { - // Page number can be missing or empty string and so appropriate handling is required - // so that upstream server recieves valid page number. - let url: String = format!("https://searx.work/search?q={query}&pageno={page}"); - // initializing headers and adding appropriate headers. - let mut header_map = HeaderMap::new(); - header_map.insert( - USER_AGENT, - user_agent - .parse() - .into_report() - .change_context(EngineError::UnexpectedError)?, - ); - header_map.insert( - REFERER, - "https://google.com/" - .parse() - .into_report() - .change_context(EngineError::UnexpectedError)?, - ); - header_map.insert( - CONTENT_TYPE, - "application/x-www-form-urlencoded" - .parse() - .into_report() - .change_context(EngineError::UnexpectedError)?, - ); - header_map.insert(COOKIE, "categories=general; language=auto; locale=en; autocomplete=duckduckgo; image_proxy=1; method=POST; safesearch=2; theme=simple; results_on_new_tab=1; doi_resolver=oadoi.org; simple_style=auto; center_alignment=1; query_in_title=1; infinite_scroll=0; disabled_engines=; enabled_engines=\"archive is__general\\054yep__general\\054curlie__general\\054currency__general\\054ddg definitions__general\\054wikidata__general\\054duckduckgo__general\\054tineye__general\\054lingva__general\\054startpage__general\\054yahoo__general\\054wiby__general\\054marginalia__general\\054alexandria__general\\054wikibooks__general\\054wikiquote__general\\054wikisource__general\\054wikiversity__general\\054wikivoyage__general\\054dictzone__general\\054seznam__general\\054mojeek__general\\054naver__general\\054wikimini__general\\054brave__general\\054petalsearch__general\\054goo__general\"; disabled_plugins=; enabled_plugins=\"searx.plugins.hostname_replace\\054searx.plugins.oa_doi_rewrite\\054searx.plugins.vim_hotkeys\"; tokens=; maintab=on; enginetab=on".parse().into_report().change_context(EngineError::UnexpectedError)?); +pub struct Searx; - // fetch the html from upstream searx instance engine - let results: String = reqwest::Client::new() - .get(url) - .headers(header_map) // add spoofed headers to emulate human behaviours. - .send() - .await - .into_report() - .change_context(EngineError::RequestError)? - .text() - .await - .into_report() - .change_context(EngineError::RequestError)?; +#[async_trait::async_trait] +impl SearchEngine for Searx { + async fn results( + &self, + query: String, + page: u32, + user_agent: String, + ) -> Result, EngineError> { + // Page number can be missing or empty string and so appropriate handling is required + // so that upstream server recieves valid page number. + let url: String = format!("https://searx.work/search?q={query}&pageno={page}"); - let document: Html = Html::parse_document(&results); + // initializing headers and adding appropriate headers. + let mut header_map = HeaderMap::new(); + header_map.insert( + USER_AGENT, + user_agent + .parse() + .into_report() + .change_context(EngineError::UnexpectedError)?, + ); + header_map.insert( + REFERER, + "https://google.com/" + .parse() + .into_report() + .change_context(EngineError::UnexpectedError)?, + ); + header_map.insert( + CONTENT_TYPE, + "application/x-www-form-urlencoded" + .parse() + .into_report() + .change_context(EngineError::UnexpectedError)?, + ); + header_map.insert(COOKIE, "categories=general; language=auto; locale=en; autocomplete=duckduckgo; image_proxy=1; method=POST; safesearch=2; theme=simple; results_on_new_tab=1; doi_resolver=oadoi.org; simple_style=auto; center_alignment=1; query_in_title=1; infinite_scroll=0; disabled_engines=; enabled_engines=\"archive is__general\\054yep__general\\054curlie__general\\054currency__general\\054ddg definitions__general\\054wikidata__general\\054duckduckgo__general\\054tineye__general\\054lingva__general\\054startpage__general\\054yahoo__general\\054wiby__general\\054marginalia__general\\054alexandria__general\\054wikibooks__general\\054wikiquote__general\\054wikisource__general\\054wikiversity__general\\054wikivoyage__general\\054dictzone__general\\054seznam__general\\054mojeek__general\\054naver__general\\054wikimini__general\\054brave__general\\054petalsearch__general\\054goo__general\"; disabled_plugins=; enabled_plugins=\"searx.plugins.hostname_replace\\054searx.plugins.oa_doi_rewrite\\054searx.plugins.vim_hotkeys\"; tokens=; maintab=on; enginetab=on".parse().into_report().change_context(EngineError::UnexpectedError)?); - let no_result: Selector = Selector::parse("#urls>.dialog-error>p") - .map_err(|_| Report::new(EngineError::UnexpectedError)) - .attach_printable_lazy(|| format!("invalid CSS selector: {}", "#urls>.dialog-error>p"))?; + let document: Html = + Html::parse_document(&Searx::fetch_html_from_upstream(&self, url, header_map).await?); - if let Some(no_result_msg) = document.select(&no_result).nth(1) { - if no_result_msg.inner_html() + let no_result: Selector = Selector::parse("#urls>.dialog-error>p") + .map_err(|_| Report::new(EngineError::UnexpectedError)) + .attach_printable_lazy(|| { + format!("invalid CSS selector: {}", "#urls>.dialog-error>p") + })?; + + if let Some(no_result_msg) = document.select(&no_result).nth(1) { + if no_result_msg.inner_html() == "we didn't find any results. Please use another query or search in more categories" { return Err(Report::new(EngineError::EmptyResultSet)); } + } + + let results: Selector = Selector::parse(".result") + .map_err(|_| Report::new(EngineError::UnexpectedError)) + .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result"))?; + let result_title: Selector = Selector::parse("h3>a") + .map_err(|_| Report::new(EngineError::UnexpectedError)) + .attach_printable_lazy(|| format!("invalid CSS selector: {}", "h3>a"))?; + let result_url: Selector = Selector::parse("h3>a") + .map_err(|_| Report::new(EngineError::UnexpectedError)) + .attach_printable_lazy(|| format!("invalid CSS selector: {}", "h3>a"))?; + + let result_desc: Selector = Selector::parse(".content") + .map_err(|_| Report::new(EngineError::UnexpectedError)) + .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".content"))?; + + // scrape all the results from the html + Ok(document + .select(&results) + .map(|result| { + RawSearchResult::new( + result + .select(&result_title) + .next() + .unwrap() + .inner_html() + .trim() + .to_string(), + result + .select(&result_url) + .next() + .unwrap() + .value() + .attr("href") + .unwrap() + .to_string(), + result + .select(&result_desc) + .next() + .unwrap() + .inner_html() + .trim() + .to_string(), + vec!["searx".to_string()], + ) + }) + .map(|search_result| (search_result.visiting_url.clone(), search_result)) + .collect()) } - - let results: Selector = Selector::parse(".result") - .map_err(|_| Report::new(EngineError::UnexpectedError)) - .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result"))?; - let result_title: Selector = Selector::parse("h3>a") - .map_err(|_| Report::new(EngineError::UnexpectedError)) - .attach_printable_lazy(|| format!("invalid CSS selector: {}", "h3>a"))?; - let result_url: Selector = Selector::parse("h3>a") - .map_err(|_| Report::new(EngineError::UnexpectedError)) - .attach_printable_lazy(|| format!("invalid CSS selector: {}", "h3>a"))?; - - let result_desc: Selector = Selector::parse(".content") - .map_err(|_| Report::new(EngineError::UnexpectedError)) - .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".content"))?; - - // scrape all the results from the html - Ok(document - .select(&results) - .map(|result| { - RawSearchResult::new( - result - .select(&result_title) - .next() - .unwrap() - .inner_html() - .trim() - .to_string(), - result - .select(&result_url) - .next() - .unwrap() - .value() - .attr("href") - .unwrap() - .to_string(), - result - .select(&result_desc) - .next() - .unwrap() - .inner_html() - .trim() - .to_string(), - vec!["searx".to_string()], - ) - }) - .map(|search_result| (search_result.visiting_url.clone(), search_result)) - .collect()) }