diff --git a/Cargo.lock b/Cargo.lock index 2bd43ac..6552adc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -292,6 +292,17 @@ version = "0.10.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "619743e34b5ba4e9703bba34deac3427c72507c7159f5fd030aea8cac0cfe341" +[[package]] +name = "async-trait" +version = "0.1.71" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a564d521dd56509c4c47480d00b80ee55f7e385ae48db5744c67ad50c92d2ebf" +dependencies = [ + "proc-macro2 1.0.64", + "quote 1.0.29", + "syn 2.0.26", +] + [[package]] name = "autocfg" version = "0.1.8" @@ -506,18 +517,18 @@ dependencies = [ [[package]] name = "clap" -version = "4.3.11" +version = "4.3.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1640e5cc7fb47dbb8338fd471b105e7ed6c3cb2aeb00c2e067127ffd3764a05d" +checksum = "3eab9e8ceb9afdade1ab3f0fd8dbce5b1b2f468ad653baf10e771781b2b67b73" dependencies = [ "clap_builder", ] [[package]] name = "clap_builder" -version = "4.3.11" +version = "4.3.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98c59138d527eeaf9b53f35a77fcc1fad9d883116070c63d5de1c7dc7b00c72b" +checksum = "9f2763db829349bf00cfc06251268865ed4363b93a943174f638daf3ecdba2cd" dependencies = [ "anstyle", "clap_lex", @@ -784,7 +795,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331" dependencies = [ "quote 1.0.29", - "syn 2.0.25", + "syn 2.0.26", ] [[package]] @@ -1457,7 +1468,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" dependencies = [ "hermit-abi", - "rustix 0.38.3", + "rustix 0.38.4", "windows-sys", ] @@ -1834,7 +1845,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2 1.0.64", "quote 1.0.29", - "syn 2.0.25", + "syn 2.0.26", ] [[package]] @@ -1952,7 +1963,7 @@ dependencies = [ "pest_meta", "proc-macro2 1.0.64", "quote 1.0.29", - "syn 2.0.25", + "syn 2.0.26", ] [[package]] @@ -2054,7 +2065,7 @@ dependencies = [ "phf_shared 0.11.2", "proc-macro2 1.0.64", "quote 1.0.29", - "syn 2.0.25", + "syn 2.0.26", ] [[package]] @@ -2398,9 +2409,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83d3daa6976cffb758ec878f108ba0e062a45b2d6ca3a2cca965338855476caf" +checksum = "39354c10dd07468c2e73926b23bb9c2caca74c5501e38a35da70406f1d923310" dependencies = [ "aho-corasick", "memchr", @@ -2409,9 +2420,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.7.3" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ab07dc67230e4a4718e70fd5c20055a4334b121f1f9db8fe63ef39ce9b8c846" +checksum = "e5ea92a5b6195c6ef2a0295ea818b312502c6fc94dde986c5553242e18fd4ce2" [[package]] name = "reqwest" @@ -2548,9 +2559,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.3" +version = "0.38.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac5ffa1efe7548069688cd7028f32591853cd7b5b756d41bcffd2353e4fc75b4" +checksum = "0a962918ea88d644592894bc6dc55acc6c0956488adcebbfb6e273506b7fd6e5" dependencies = [ "bitflags 2.3.3", "errno", @@ -2708,14 +2719,14 @@ checksum = "389894603bd18c46fa56231694f8d827779c0951a667087194cf9de94ed24682" dependencies = [ "proc-macro2 1.0.64", "quote 
1.0.29", - "syn 2.0.25", + "syn 2.0.26", ] [[package]] name = "serde_json" -version = "1.0.100" +version = "1.0.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f1e14e89be7aa4c4b78bdbdc9eb5bf8517829a600ae8eaa39a6e1d960b5185c" +checksum = "b5062a995d481b2308b6064e9af76011f2921c35f97b0468811ed9f6cd91dfed" dependencies = [ "itoa 1.0.8", "ryu", @@ -2937,9 +2948,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.25" +version = "2.0.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15e3fc8c0c74267e2df136e5e5fb656a464158aa57624053375eb9c8c6e25ae2" +checksum = "45c3457aacde3c65315de5031ec191ce46604304d2446e803d71ade03308d970" dependencies = [ "proc-macro2 1.0.64", "quote 1.0.29", @@ -3009,7 +3020,7 @@ checksum = "463fe12d7993d3b327787537ce8dd4dfa058de32fc2b195ef3cde03dc4771e8f" dependencies = [ "proc-macro2 1.0.64", "quote 1.0.29", - "syn 2.0.25", + "syn 2.0.26", ] [[package]] @@ -3164,7 +3175,7 @@ checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" dependencies = [ "proc-macro2 1.0.64", "quote 1.0.29", - "syn 2.0.25", + "syn 2.0.26", ] [[package]] @@ -3343,9 +3354,9 @@ checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" [[package]] name = "unicode-ident" -version = "1.0.10" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22049a19f4a68748a168c0fc439f9516686aa045927ff767eca0a85101fb6e73" +checksum = "301abaae475aa91687eb82514b328ab47a211a533026cb25fc3e519b86adfc3c" [[package]] name = "unicode-normalization" @@ -3486,7 +3497,7 @@ dependencies = [ "once_cell", "proc-macro2 1.0.64", "quote 1.0.29", - "syn 2.0.25", + "syn 2.0.26", "wasm-bindgen-shared", ] @@ -3520,7 +3531,7 @@ checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" dependencies = [ "proc-macro2 1.0.64", "quote 1.0.29", - "syn 2.0.25", + "syn 2.0.26", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -3543,10 +3554,11 @@ dependencies = [ [[package]] name = "websurfx" -version = "0.13.17" +version = "0.14.0" dependencies = [ "actix-files", "actix-web", + "async-trait", "criterion", "env_logger", "error-stack", diff --git a/Cargo.toml b/Cargo.toml index 137134a..fe75fb8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "websurfx" -version = "0.13.17" +version = "0.14.0" edition = "2021" description = "An open-source alternative to Searx that provides clean, ad-free, and organic results with incredible speed while keeping privacy and security in mind." 
repository = "https://github.com/neon-mmd/websurfx" @@ -12,7 +12,7 @@ tokio = {version="*",features=["full"]} serde = {version="*",features=["derive"]} handlebars = { version = "4.3.6", features = ["dir_source"] } scraper = {version="*"} -actix-web = {version="4.3.1"} +actix-web = {version="4.3.1", features = ["cookies"]} actix-files = {version="0.6.2"} serde_json = {version="*"} fake-useragent = {version="*"} @@ -24,6 +24,7 @@ md5 = {version="*"} rand={version="*"} once_cell = {version="*"} error-stack = {version="0.3.1"} +async-trait = {version="*"} [dev-dependencies] rusty-hook = "^0.11.2" diff --git a/src/config/parser.rs b/src/config/parser.rs index e411732..971cd48 100644 --- a/src/config/parser.rs +++ b/src/config/parser.rs @@ -3,7 +3,7 @@ use super::parser_models::Style; use rlua::Lua; -use std::{format, fs, path::Path}; +use std::{collections::HashMap, format, fs, path::Path}; // ------- Constants -------- static COMMON_DIRECTORY_NAME: &str = "websurfx"; @@ -18,6 +18,10 @@ static CONFIG_FILE_NAME: &str = "config.lua"; /// * `style` - It stores the theming options for the website. /// * `redis_url` - It stores the redis connection url address on which the redis /// client should connect. +/// * `aggregator` - It stores the option to whether enable or disable production use. +/// * `logging` - It stores the option to whether enable or disable logs. +/// * `debug` - It stores the option to whether enable or disable debug mode. +/// * `upstream_search_engines` - It stores all the engine names that were enabled by the user. #[derive(Clone)] pub struct Config { pub port: u16, @@ -27,12 +31,17 @@ pub struct Config { pub aggregator: AggregatorConfig, pub logging: bool, pub debug: bool, + pub upstream_search_engines: Vec, } /// Configuration options for the aggregator. +/// +/// # Fields +/// +/// * `random_delay` - It stores the option to whether enable or disable random delays between +/// requests. #[derive(Clone)] pub struct AggregatorConfig { - /// Whether to introduce a random delay before sending the request to the search engine. pub random_delay: bool, } @@ -66,6 +75,11 @@ impl Config { }, logging: globals.get::<_, bool>("logging")?, debug: globals.get::<_, bool>("debug")?, + upstream_search_engines: globals + .get::<_, HashMap>("upstream_search_engines")? + .into_iter() + .filter_map(|(key, value)| value.then_some(key)) + .collect(), }) }) } diff --git a/src/engines/duckduckgo.rs b/src/engines/duckduckgo.rs index 21202de..f8ad597 100644 --- a/src/engines/duckduckgo.rs +++ b/src/engines/duckduckgo.rs @@ -2,154 +2,150 @@ //! by querying the upstream duckduckgo search engine with user provided query and with a page //! number if provided. -use std::{collections::HashMap, time::Duration}; +use std::collections::HashMap; use reqwest::header::{HeaderMap, CONTENT_TYPE, COOKIE, REFERER, USER_AGENT}; use scraper::{Html, Selector}; use crate::results::aggregation_models::RawSearchResult; -use super::engine_models::EngineError; +use super::engine_models::{EngineError, SearchEngine}; use error_stack::{IntoReport, Report, Result, ResultExt}; -/// This function scrapes results from the upstream engine duckduckgo and puts all the scraped -/// results like title, visiting_url (href in html),engine (from which engine it was fetched from) -/// and description in a RawSearchResult and then adds that to HashMap whose keys are url and -/// values are RawSearchResult struct and then returns it within a Result enum. 
-/// -/// # Arguments -/// -/// * `query` - Takes the user provided query to query to the upstream search engine with. -/// * `page` - Takes an u32 as an argument. -/// * `user_agent` - Takes a random user agent string as an argument. -/// -/// # Errors -/// -/// Returns an `EngineErrorKind` if the user is not connected to the internet or if their is failure to -/// reach the above `upstream search engine` page or if the `upstream search engine` is unable to -/// provide results for the requested search query and also returns error if the scraping selector -/// or HeaderMap fails to initialize. -pub async fn results( - query: &str, - page: u32, - user_agent: &str, -) -> Result, EngineError> { - // Page number can be missing or empty string and so appropriate handling is required - // so that upstream server receives valid page number. - let url: String = match page { - 1 => { - format!("https://html.duckduckgo.com/html/?q={query}&s=&dc=&v=1&o=json&api=/d.js") - } - _ => { - format!( - "https://duckduckgo.com/html/?q={}&s={}&dc={}&v=1&o=json&api=/d.js", - query, - (page / 2 + (page % 2)) * 30, - (page / 2 + (page % 2)) * 30 + 1 - ) - } - }; +/// A new DuckDuckGo engine type defined in-order to implement the `SearchEngine` trait which allows to +/// reduce code duplication as well as allows to create vector of different search engines easily. +pub struct DuckDuckGo; - // initializing HeaderMap and adding appropriate headers. - let mut header_map = HeaderMap::new(); - header_map.insert( - USER_AGENT, - user_agent - .parse() - .into_report() - .change_context(EngineError::UnexpectedError)?, - ); - header_map.insert( - REFERER, - "https://google.com/" - .parse() - .into_report() - .change_context(EngineError::UnexpectedError)?, - ); - header_map.insert( - CONTENT_TYPE, - "application/x-www-form-urlencoded" - .parse() - .into_report() - .change_context(EngineError::UnexpectedError)?, - ); - header_map.insert( - COOKIE, - "kl=wt-wt" - .parse() - .into_report() - .change_context(EngineError::UnexpectedError)?, - ); - - // fetch the html from upstream duckduckgo engine - let results: String = reqwest::Client::new() - .get(url) - .timeout(Duration::from_secs(5)) - .headers(header_map) // add spoofed headers to emulate human behavior - .send() - .await - .into_report() - .change_context(EngineError::RequestError)? 
- .text() - .await - .into_report() - .change_context(EngineError::RequestError)?; - - let document: Html = Html::parse_document(&results); - - let no_result: Selector = Selector::parse(".no-results") - .map_err(|_| Report::new(EngineError::UnexpectedError)) - .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".no-results"))?; - - if document.select(&no_result).next().is_some() { - return Err(Report::new(EngineError::EmptyResultSet)); - } - - let results: Selector = Selector::parse(".result") - .map_err(|_| Report::new(EngineError::UnexpectedError)) - .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result"))?; - let result_title: Selector = Selector::parse(".result__a") - .map_err(|_| Report::new(EngineError::UnexpectedError)) - .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__a"))?; - let result_url: Selector = Selector::parse(".result__url") - .map_err(|_| Report::new(EngineError::UnexpectedError)) - .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__url"))?; - let result_desc: Selector = Selector::parse(".result__snippet") - .map_err(|_| Report::new(EngineError::UnexpectedError)) - .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__snippet"))?; - - // scrape all the results from the html - Ok(document - .select(&results) - .map(|result| { - RawSearchResult::new( - result - .select(&result_title) - .next() - .unwrap() - .inner_html() - .trim() - .to_string(), +#[async_trait::async_trait] +impl SearchEngine for DuckDuckGo { + /// This function scrapes results from the upstream engine duckduckgo and puts all the scraped + /// results like title, visiting_url (href in html),engine (from which engine it was fetched from) + /// and description in a RawSearchResult and then adds that to HashMap whose keys are url and + /// values are RawSearchResult struct and then returns it within a Result enum. + /// + /// # Arguments + /// + /// * `query` - Takes the user provided query to query to the upstream search engine with. + /// * `page` - Takes an u32 as an argument. + /// * `user_agent` - Takes a random user agent string as an argument. + /// + /// # Errors + /// + /// Returns an `EngineErrorKind` if the user is not connected to the internet or if their is failure to + /// reach the above `upstream search engine` page or if the `upstream search engine` is unable to + /// provide results for the requested search query and also returns error if the scraping selector + /// or HeaderMap fails to initialize. + async fn results( + &self, + query: String, + page: u32, + user_agent: String, + ) -> Result, EngineError> { + // Page number can be missing or empty string and so appropriate handling is required + // so that upstream server recieves valid page number. + let url: String = match page { + 1 => { + format!("https://html.duckduckgo.com/html/?q={query}&s=&dc=&v=1&o=json&api=/d.js") + } + _ => { format!( - "https://{}", + "https://duckduckgo.com/html/?q={}&s={}&dc={}&v=1&o=json&api=/d.js", + query, + (page / 2 + (page % 2)) * 30, + (page / 2 + (page % 2)) * 30 + 1 + ) + } + }; + + // initializing HeaderMap and adding appropriate headers. 
+ let mut header_map = HeaderMap::new(); + header_map.insert( + USER_AGENT, + user_agent + .parse() + .into_report() + .change_context(EngineError::UnexpectedError)?, + ); + header_map.insert( + REFERER, + "https://google.com/" + .parse() + .into_report() + .change_context(EngineError::UnexpectedError)?, + ); + header_map.insert( + CONTENT_TYPE, + "application/x-www-form-urlencoded" + .parse() + .into_report() + .change_context(EngineError::UnexpectedError)?, + ); + header_map.insert( + COOKIE, + "kl=wt-wt" + .parse() + .into_report() + .change_context(EngineError::UnexpectedError)?, + ); + + let document: Html = Html::parse_document( + &DuckDuckGo::fetch_html_from_upstream(self, url, header_map).await?, + ); + + let no_result: Selector = Selector::parse(".no-results") + .map_err(|_| Report::new(EngineError::UnexpectedError)) + .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".no-results"))?; + + if document.select(&no_result).next().is_some() { + return Err(Report::new(EngineError::EmptyResultSet)); + } + + let results: Selector = Selector::parse(".result") + .map_err(|_| Report::new(EngineError::UnexpectedError)) + .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result"))?; + let result_title: Selector = Selector::parse(".result__a") + .map_err(|_| Report::new(EngineError::UnexpectedError)) + .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__a"))?; + let result_url: Selector = Selector::parse(".result__url") + .map_err(|_| Report::new(EngineError::UnexpectedError)) + .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__url"))?; + let result_desc: Selector = Selector::parse(".result__snippet") + .map_err(|_| Report::new(EngineError::UnexpectedError)) + .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__snippet"))?; + + // scrape all the results from the html + Ok(document + .select(&results) + .map(|result| { + RawSearchResult::new( result - .select(&result_url) + .select(&result_title) .next() .unwrap() .inner_html() .trim() - ), - result - .select(&result_desc) - .next() - .unwrap() - .inner_html() - .trim() - .to_string(), - vec!["duckduckgo".to_string()], - ) - }) - .map(|search_result| (search_result.visiting_url.clone(), search_result)) - .collect()) + .to_string(), + format!( + "https://{}", + result + .select(&result_url) + .next() + .unwrap() + .inner_html() + .trim() + ), + result + .select(&result_desc) + .next() + .unwrap() + .inner_html() + .trim() + .to_string(), + vec!["duckduckgo".to_string()], + ) + }) + .map(|search_result| (search_result.visiting_url.clone(), search_result)) + .collect()) + } } diff --git a/src/engines/engine_models.rs b/src/engines/engine_models.rs index 091c62d..f635ca9 100644 --- a/src/engines/engine_models.rs +++ b/src/engines/engine_models.rs @@ -1,8 +1,9 @@ //! This module provides the error enum to handle different errors associated while requesting data from //! the upstream search engines with the search query provided by the user. -use error_stack::Context; -use std::fmt; +use crate::results::aggregation_models::RawSearchResult; +use error_stack::{IntoReport, Result, ResultExt}; +use std::{collections::HashMap, fmt, time::Duration}; /// A custom error type used for handle engine associated errors. /// @@ -40,4 +41,35 @@ impl fmt::Display for EngineError { } } -impl Context for EngineError {} +impl error_stack::Context for EngineError {} + +/// A trait to define common behaviour for all search engines. 
+#[async_trait::async_trait] +pub trait SearchEngine { + async fn fetch_html_from_upstream( + &self, + url: String, + header_map: reqwest::header::HeaderMap, + ) -> Result<String, EngineError> { + // fetch the html from upstream search engine + Ok(reqwest::Client::new() + .get(url) + .timeout(Duration::from_secs(30)) // Add timeout to request to avoid DDOSing the server + .headers(header_map) // add spoofed headers to emulate human behaviour + .send() + .await + .into_report() + .change_context(EngineError::RequestError)? + .text() + .await + .into_report() + .change_context(EngineError::RequestError)?) + } + + async fn results( + &self, + query: String, + page: u32, + user_agent: String, + ) -> Result<HashMap<String, RawSearchResult>, EngineError>; +} diff --git a/src/engines/searx.rs b/src/engines/searx.rs index 2a59901..145abf1 100644 --- a/src/engines/searx.rs +++ b/src/engines/searx.rs @@ -8,131 +8,130 @@ use std::collections::HashMap; use crate::results::aggregation_models::RawSearchResult; -use super::engine_models::EngineError; +use super::engine_models::{EngineError, SearchEngine}; use error_stack::{IntoReport, Report, Result, ResultExt}; -/// This function scrapes results from the upstream engine duckduckgo and puts all the scraped -/// results like title, visiting_url (href in html),engine (from which engine it was fetched from) -/// and description in a RawSearchResult and then adds that to HashMap whose keys are url and -/// values are RawSearchResult struct and then returns it within a Result enum. -/// -/// # Arguments -/// -/// * `query` - Takes the user provided query to query to the upstream search engine with. -/// * `page` - Takes an u32 as an argument. -/// * `user_agent` - Takes a random user agent string as an argument. -/// -/// # Errors -/// -/// Returns an `EngineErrorKind` if the user is not connected to the internet or if their is failure to -/// reach the above `upstream search engine` page or if the `upstream search engine` is unable to -/// provide results for the requested search query and also returns error if the scraping selector -/// or HeaderMap fails to initialize. -pub async fn results( - query: &str, - page: u32, - user_agent: &str, -) -> Result<HashMap<String, RawSearchResult>, EngineError> { - // Page number can be missing or empty string and so appropriate handling is required - // so that upstream server recieves valid page number. - let url: String = format!("https://searx.work/search?q={query}&pageno={page}"); +/// A new Searx engine type defined in-order to implement the `SearchEngine` trait which allows to +/// reduce code duplication as well as allows to create vector of different search engines easily. +pub struct Searx; - // initializing headers and adding appropriate headers.
- let mut header_map = HeaderMap::new(); - header_map.insert( - USER_AGENT, - user_agent - .parse() - .into_report() - .change_context(EngineError::UnexpectedError)?, - ); - header_map.insert( - REFERER, - "https://google.com/" - .parse() - .into_report() - .change_context(EngineError::UnexpectedError)?, - ); - header_map.insert( - CONTENT_TYPE, - "application/x-www-form-urlencoded" - .parse() - .into_report() - .change_context(EngineError::UnexpectedError)?, - ); - header_map.insert(COOKIE, "categories=general; language=auto; locale=en; autocomplete=duckduckgo; image_proxy=1; method=POST; safesearch=2; theme=simple; results_on_new_tab=1; doi_resolver=oadoi.org; simple_style=auto; center_alignment=1; query_in_title=1; infinite_scroll=0; disabled_engines=; enabled_engines=\"archive is__general\\054yep__general\\054curlie__general\\054currency__general\\054ddg definitions__general\\054wikidata__general\\054duckduckgo__general\\054tineye__general\\054lingva__general\\054startpage__general\\054yahoo__general\\054wiby__general\\054marginalia__general\\054alexandria__general\\054wikibooks__general\\054wikiquote__general\\054wikisource__general\\054wikiversity__general\\054wikivoyage__general\\054dictzone__general\\054seznam__general\\054mojeek__general\\054naver__general\\054wikimini__general\\054brave__general\\054petalsearch__general\\054goo__general\"; disabled_plugins=; enabled_plugins=\"searx.plugins.hostname_replace\\054searx.plugins.oa_doi_rewrite\\054searx.plugins.vim_hotkeys\"; tokens=; maintab=on; enginetab=on".parse().into_report().change_context(EngineError::UnexpectedError)?); +#[async_trait::async_trait] +impl SearchEngine for Searx { + /// This function scrapes results from the upstream engine duckduckgo and puts all the scraped + /// results like title, visiting_url (href in html),engine (from which engine it was fetched from) + /// and description in a RawSearchResult and then adds that to HashMap whose keys are url and + /// values are RawSearchResult struct and then returns it within a Result enum. + /// + /// # Arguments + /// + /// * `query` - Takes the user provided query to query to the upstream search engine with. + /// * `page` - Takes an u32 as an argument. + /// * `user_agent` - Takes a random user agent string as an argument. + /// + /// # Errors + /// + /// Returns an `EngineErrorKind` if the user is not connected to the internet or if their is failure to + /// reach the above `upstream search engine` page or if the `upstream search engine` is unable to + /// provide results for the requested search query and also returns error if the scraping selector + /// or HeaderMap fails to initialize. - // fetch the html from upstream searx instance engine - let results: String = reqwest::Client::new() - .get(url) - .headers(header_map) // add spoofed headers to emulate human behaviours. - .send() - .await - .into_report() - .change_context(EngineError::RequestError)? - .text() - .await - .into_report() - .change_context(EngineError::RequestError)?; + async fn results( + &self, + query: String, + page: u32, + user_agent: String, + ) -> Result, EngineError> { + // Page number can be missing or empty string and so appropriate handling is required + // so that upstream server recieves valid page number. + let url: String = format!("https://searx.work/search?q={query}&pageno={page}"); - let document: Html = Html::parse_document(&results); + // initializing headers and adding appropriate headers. 
+ let mut header_map = HeaderMap::new(); + header_map.insert( + USER_AGENT, + user_agent + .parse() + .into_report() + .change_context(EngineError::UnexpectedError)?, + ); + header_map.insert( + REFERER, + "https://google.com/" + .parse() + .into_report() + .change_context(EngineError::UnexpectedError)?, + ); + header_map.insert( + CONTENT_TYPE, + "application/x-www-form-urlencoded" + .parse() + .into_report() + .change_context(EngineError::UnexpectedError)?, + ); + header_map.insert(COOKIE, "categories=general; language=auto; locale=en; autocomplete=duckduckgo; image_proxy=1; method=POST; safesearch=2; theme=simple; results_on_new_tab=1; doi_resolver=oadoi.org; simple_style=auto; center_alignment=1; query_in_title=1; infinite_scroll=0; disabled_engines=; enabled_engines=\"archive is__general\\054yep__general\\054curlie__general\\054currency__general\\054ddg definitions__general\\054wikidata__general\\054duckduckgo__general\\054tineye__general\\054lingva__general\\054startpage__general\\054yahoo__general\\054wiby__general\\054marginalia__general\\054alexandria__general\\054wikibooks__general\\054wikiquote__general\\054wikisource__general\\054wikiversity__general\\054wikivoyage__general\\054dictzone__general\\054seznam__general\\054mojeek__general\\054naver__general\\054wikimini__general\\054brave__general\\054petalsearch__general\\054goo__general\"; disabled_plugins=; enabled_plugins=\"searx.plugins.hostname_replace\\054searx.plugins.oa_doi_rewrite\\054searx.plugins.vim_hotkeys\"; tokens=; maintab=on; enginetab=on".parse().into_report().change_context(EngineError::UnexpectedError)?); - let no_result: Selector = Selector::parse("#urls>.dialog-error>p") - .map_err(|_| Report::new(EngineError::UnexpectedError)) - .attach_printable_lazy(|| format!("invalid CSS selector: {}", "#urls>.dialog-error>p"))?; + let document: Html = + Html::parse_document(&Searx::fetch_html_from_upstream(self, url, header_map).await?); - if let Some(no_result_msg) = document.select(&no_result).nth(1) { - if no_result_msg.inner_html() + let no_result: Selector = Selector::parse("#urls>.dialog-error>p") + .map_err(|_| Report::new(EngineError::UnexpectedError)) + .attach_printable_lazy(|| { + format!("invalid CSS selector: {}", "#urls>.dialog-error>p") + })?; + + if let Some(no_result_msg) = document.select(&no_result).nth(1) { + if no_result_msg.inner_html() == "we didn't find any results. 
Please use another query or search in more categories" { return Err(Report::new(EngineError::EmptyResultSet)); } + } + + let results: Selector = Selector::parse(".result") + .map_err(|_| Report::new(EngineError::UnexpectedError)) + .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result"))?; + let result_title: Selector = Selector::parse("h3>a") + .map_err(|_| Report::new(EngineError::UnexpectedError)) + .attach_printable_lazy(|| format!("invalid CSS selector: {}", "h3>a"))?; + let result_url: Selector = Selector::parse("h3>a") + .map_err(|_| Report::new(EngineError::UnexpectedError)) + .attach_printable_lazy(|| format!("invalid CSS selector: {}", "h3>a"))?; + + let result_desc: Selector = Selector::parse(".content") + .map_err(|_| Report::new(EngineError::UnexpectedError)) + .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".content"))?; + + // scrape all the results from the html + Ok(document + .select(&results) + .map(|result| { + RawSearchResult::new( + result + .select(&result_title) + .next() + .unwrap() + .inner_html() + .trim() + .to_string(), + result + .select(&result_url) + .next() + .unwrap() + .value() + .attr("href") + .unwrap() + .to_string(), + result + .select(&result_desc) + .next() + .unwrap() + .inner_html() + .trim() + .to_string(), + vec!["searx".to_string()], + ) + }) + .map(|search_result| (search_result.visiting_url.clone(), search_result)) + .collect()) } - - let results: Selector = Selector::parse(".result") - .map_err(|_| Report::new(EngineError::UnexpectedError)) - .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result"))?; - let result_title: Selector = Selector::parse("h3>a") - .map_err(|_| Report::new(EngineError::UnexpectedError)) - .attach_printable_lazy(|| format!("invalid CSS selector: {}", "h3>a"))?; - let result_url: Selector = Selector::parse("h3>a") - .map_err(|_| Report::new(EngineError::UnexpectedError)) - .attach_printable_lazy(|| format!("invalid CSS selector: {}", "h3>a"))?; - - let result_desc: Selector = Selector::parse(".content") - .map_err(|_| Report::new(EngineError::UnexpectedError)) - .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".content"))?; - - // scrape all the results from the html - Ok(document - .select(&results) - .map(|result| { - RawSearchResult::new( - result - .select(&result_title) - .next() - .unwrap() - .inner_html() - .trim() - .to_string(), - result - .select(&result_url) - .next() - .unwrap() - .value() - .attr("href") - .unwrap() - .to_string(), - result - .select(&result_desc) - .next() - .unwrap() - .inner_html() - .trim() - .to_string(), - vec!["searx".to_string()], - ) - }) - .map(|search_result| (search_result.visiting_url.clone(), search_result)) - .collect()) } diff --git a/src/results/aggregation_models.rs b/src/results/aggregation_models.rs index 86559a7..6766fae 100644 --- a/src/results/aggregation_models.rs +++ b/src/results/aggregation_models.rs @@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize}; -use crate::config::parser_models::Style; +use crate::{config::parser_models::Style, engines::engine_models::EngineError}; /// A named struct to store, serialize and deserializes the individual search result from all the /// scraped and aggregated search results from the upstream search engines. @@ -16,7 +16,7 @@ use crate::config::parser_models::Style; /// * `url` - The url to be displayed below the search result title in html. /// * `description` - The description of the search result. 
/// * `engine` - The names of the upstream engines from which this results were provided. -#[derive(Debug, Serialize, Deserialize)] +#[derive(Serialize, Deserialize)] #[serde(rename_all = "camelCase")] pub struct SearchResult { pub title: String, @@ -116,6 +116,25 @@ impl RawSearchResult { } } +#[derive(Serialize, Deserialize)] +pub struct EngineErrorInfo { + pub error: String, + pub engine: String, +} + +impl EngineErrorInfo { + pub fn new(error: &EngineError, engine: String) -> Self { + Self { + error: match error { + EngineError::RequestError => String::from("RequestError"), + EngineError::EmptyResultSet => String::from("EmptyResultSet"), + EngineError::UnexpectedError => String::from("UnexpectedError"), + }, + engine, + } + } +} + /// A named struct to store, serialize, deserialize the all the search results scraped and /// aggregated from the upstream search engines. /// /// # Fields /// /// * `results` - Stores the individual serializable `SearchResult` struct into a vector of /// `SearchResult` structs. /// * `page_query` - Stores the current pages search query `q` provided in the search url. +/// * `style` - Stores the theming options for the website. +/// * `engine_errors_info` - Stores the information on which engines failed with their engine name +/// and the type of error that caused it. +/// * `empty_result_set` - Stores a boolean which indicates that no engines gave a result for the +/// given search query. #[derive(Serialize, Deserialize)] #[serde(rename_all = "camelCase")] pub struct SearchResults { pub results: Vec<SearchResult>, pub page_query: String, pub style: Style, + pub engine_errors_info: Vec<EngineErrorInfo>, } impl SearchResults { @@ -141,14 +166,22 @@ impl RawSearchResult { /// and stores it into a vector of `SearchResult` structs. /// * `page_query` - Takes an argument of current page`s search query `q` provided in /// the search url. - pub fn new(results: Vec<SearchResult>, page_query: String) -> Self { + /// * `empty_result_set` - Takes a boolean which indicates that no engines gave a result for the + /// given search query. + pub fn new( + results: Vec<SearchResult>, + page_query: String, + engine_errors_info: Vec<EngineErrorInfo>, + ) -> Self { SearchResults { results, page_query, style: Style::new("".to_string(), "".to_string()), + engine_errors_info, } } + /// A setter function to add website style to the return search results.
pub fn add_style(&mut self, style: Style) { self.style = style; } diff --git a/src/results/aggregator.rs b/src/results/aggregator.rs index f5719b0..501b273 100644 --- a/src/results/aggregator.rs +++ b/src/results/aggregator.rs @@ -3,22 +3,41 @@ use std::{collections::HashMap, time::Duration}; +use error_stack::Report; use rand::Rng; -use tokio::join; +use tokio::task::JoinHandle; use super::{ - aggregation_models::{RawSearchResult, SearchResult, SearchResults}, + aggregation_models::{EngineErrorInfo, RawSearchResult, SearchResult, SearchResults}, user_agent::random_user_agent, }; -use crate::engines::{duckduckgo, searx}; +use crate::engines::{ + duckduckgo, + engine_models::{EngineError, SearchEngine}, + searx, +}; -/// A function that aggregates all the scraped results from the above upstream engines and -/// then removes duplicate results and if two results are found to be from two or more engines -/// then puts their names together to show the results are fetched from these upstream engines -/// and then removes all data from the HashMap and puts into a struct of all results aggregated -/// into a vector and also adds the query used into the struct this is necessary because -/// otherwise the search bar in search remains empty if searched from the query url +/// Aliases for long type annotations +type FutureVec = Vec<JoinHandle<Result<HashMap<String, RawSearchResult>, Report<EngineError>>>>; + +/// The function aggregates the scraped results from the user-selected upstream search engines. +/// These engines can be chosen either from the user interface (UI) or from the configuration file. +/// The code handles this process by matching the selected search engines and adding them to a vector. +/// This vector is then used to create an asynchronous task vector using `tokio::spawn`, which returns +/// a future. This future is awaited in another loop. Once the results are collected, they are filtered +/// to remove any errors and ensure only proper results are included. If an error is encountered, it is +/// sent to the UI along with the name of the engine and the type of error. This information is finally +/// placed in the returned `SearchResults` struct. +/// +/// Additionally, the function eliminates duplicate results. If two results are identified as coming from +/// multiple engines, their names are combined to indicate that the results were fetched from these upstream +/// engines. After this, all the data in the `HashMap` is removed and placed into a struct that contains all +/// the aggregated results in a vector. Furthermore, the query used is also added to the struct. This step is +/// necessary to ensure that the search bar in the search remains populated even when searched from the query URL. +/// +/// Overall, this function serves to aggregate scraped results from user-selected search engines, handling errors, +/// removing duplicates, and organizing the data for display in the UI. /// /// # Example: /// /// * `query` - Accepts a string to query with the above upstream search engines. /// * `page` - Accepts an u32 page number. /// * `random_delay` - Accepts a boolean value to add a random delay before making the request. +/// * `debug` - Accepts a boolean value to enable or disable debug mode option. +/// * `upstream_search_engines` - Accepts a vector of search engine names which was selected by the +/// user through the UI or the config file.
/// /// # Error /// /// Returns an error of type `Box<dyn std::error::Error>` if the function in either `searx` or `duckduckgo` or both otherwise returns a `SearchResults struct` /// containing appropriate values. pub async fn aggregate( - query: &str, + query: String, page: u32, random_delay: bool, debug: bool, + upstream_search_engines: Vec<String>, ) -> Result<SearchResults, Box<dyn std::error::Error>> { let user_agent: String = random_user_agent(); let mut result_map: HashMap = HashMap::new(); @@ -53,41 +76,106 @@ pub async fn aggregate( } // fetch results from upstream search engines simultaneously/concurrently. - let (ddg_map_results, searx_map_results) = join!( - duckduckgo::results(query, page, &user_agent), - searx::results(query, page, &user_agent) - ); + let search_engines: Vec<Box<dyn SearchEngine + Send + Sync>> = upstream_search_engines + .iter() + .map(|engine| match engine.to_lowercase().as_str() { + "duckduckgo" => Box::new(duckduckgo::DuckDuckGo) as Box<dyn SearchEngine + Send + Sync>, + "searx" => Box::new(searx::Searx) as Box<dyn SearchEngine + Send + Sync>, + &_ => panic!("Config Error: Incorrect config file option provided"), + }) + .collect(); - let ddg_map_results = ddg_map_results.unwrap_or_else(|e| { - if debug { - log::error!("Error fetching results from DuckDuckGo: {:?}", e); + let task_capacity: usize = search_engines.len(); + + let tasks: FutureVec = search_engines + .into_iter() + .map(|search_engine| { + let query: String = query.clone(); + let user_agent: String = user_agent.clone(); + tokio::spawn( + async move { search_engine.results(query, page, user_agent.clone()).await }, + ) + }) + .collect(); + + let mut outputs = Vec::with_capacity(task_capacity); + + for task in tasks { + if let Ok(result) = task.await { + outputs.push(result) } - HashMap::new() - }); + } - let searx_map_results = searx_map_results.unwrap_or_else(|e| { - if debug { - log::error!("Error fetching results from Searx: {:?}", e); + let mut engine_errors_info: Vec<EngineErrorInfo> = Vec::new(); + + // The code block `outputs.iter()` determines whether it is the first time the code is being run. + // It does this by checking the initial flag. If it is the first time, the code selects the first + // engine from which results are fetched and adds or extends them into the `result_map`. If the + // initially selected engine fails, the code automatically selects another engine to map or extend + // into the `result_map`. On the other hand, if an engine selected for the first time successfully + // fetches results and maps them into the `result_map`, the initial flag is set to false. Subsequently, + // the code iterates through the remaining engines one by one. It compares the fetched results from each + // engine with the results already present in the `result_map` to identify any duplicates. If duplicate + // results are found, the code groups them together with the name of the engine from which they were + // fetched, and automatically removes the duplicate results from the newly fetched data. + // + // Additionally, the code handles errors returned by the engines. It keeps track of which engines + // encountered errors and stores this information in a vector of structures called `EngineErrorInfo`. + // Each structure in this vector contains the name of the engine and the type of error it returned. + // These structures will later be added to the final `SearchResults` structure. The `SearchResults` + // structure is used to display an error box in the UI containing the relevant information from + // the `EngineErrorInfo` structure.
+ // + // In summary, this code block manages the selection of engines, handling of duplicate results, and tracking + // of errors in order to populate the `result_map` and provide informative feedback to the user through the + // `SearchResults` structure. + let mut initial: bool = true; + let mut counter: usize = 0; + outputs.iter().for_each(|results| { + if initial { + match results { + Ok(result) => { + result_map.extend(result.clone()); + counter += 1; + initial = false + } + Err(error_type) => { + engine_errors_info.push(EngineErrorInfo::new( + error_type.downcast_ref::<EngineError>().unwrap(), + upstream_search_engines[counter].clone(), + )); + counter += 1 + } + } + } else { + match results { + Ok(result) => { + result.clone().into_iter().for_each(|(key, value)| { + result_map + .entry(key) + .and_modify(|result| { + result.add_engines(value.clone().engine()); + }) + .or_insert_with(|| -> RawSearchResult { + RawSearchResult::new( + value.title.clone(), + value.visiting_url.clone(), + value.description.clone(), + value.engine.clone(), + ) + }); + }); + counter += 1 + } + Err(error_type) => { + engine_errors_info.push(EngineErrorInfo::new( + error_type.downcast_ref::<EngineError>().unwrap(), + upstream_search_engines[counter].clone(), + )); + counter += 1 + } + } } - HashMap::new() - }); - - result_map.extend(ddg_map_results); - - searx_map_results.into_iter().for_each(|(key, value)| { - result_map - .entry(key) - .and_modify(|result| { - result.add_engines(value.clone().engine()); - }) - .or_insert_with(|| -> RawSearchResult { - RawSearchResult::new( - value.title.clone(), - value.visiting_url.clone(), - value.description.clone(), - value.engine.clone(), - ) - }); }); Ok(SearchResults::new( @@ -104,5 +192,6 @@ pub async fn aggregate( }) .collect(), query.to_string(), + engine_errors_info, )) } diff --git a/src/server/routes.rs b/src/server/routes.rs index e33848d..cb6999d 100644 --- a/src/server/routes.rs +++ b/src/server/routes.rs @@ -22,7 +22,7 @@ use serde::Deserialize; /// of the search url. /// * `page` - It stores the search parameter `page` (or pageno in simple words) /// of the search url. -#[derive(Debug, Deserialize)] +#[derive(Deserialize)] struct SearchParams { q: Option<String>, page: Option<u32>, @@ -51,6 +51,21 @@ pub async fn not_found( .body(page_content)) } +/// A named struct which is used to deserialize the cookies fetched from the client side. +/// +/// # Fields +/// +/// * `theme` - It stores the theme name used in the website. +/// * `colorscheme` - It stores the colorscheme name used for the website theme. +/// * `engines` - It stores the user selected upstream search engines selected from the UI. +#[allow(dead_code)] +#[derive(Deserialize)] +struct Cookie { + theme: String, + colorscheme: String, + engines: Vec<String>, +} + /// Handles the route of search page of the `websurfx` meta search engine website and it takes /// two search url parameters `q` and `page` where `page` parameter is optional.
/// @@ -72,7 +87,6 @@ pub async fn search( config: web::Data<Config>, ) -> Result<HttpResponse, Box<dyn std::error::Error>> { let params = web::Query::<SearchParams>::from_query(req.query_string())?; - match &params.q { Some(query) => { if query.trim().is_empty() { @@ -89,7 +103,7 @@ pub async fn search( "http://{}:{}/search?q={}&page={}", config.binding_ip, config.port, query, page ); - let results_json = results(url, &config, query, page).await?; + let results_json = results(url, &config, query.to_string(), page, req).await?; let page_content: String = hbs.render("search", &results_json)?; Ok(HttpResponse::Ok().body(page_content)) } @@ -104,23 +118,51 @@ async fn results( url: String, config: &Config, - query: &str, + query: String, page: u32, + req: HttpRequest, ) -> Result<SearchResults, Box<dyn std::error::Error>> { //Initialize redis cache connection struct let mut redis_cache = RedisCache::new(config.redis_url.clone())?; // fetch the cached results json. let cached_results_json = redis_cache.cached_json(&url); - // check if fetched results was indeed fetched or it was an error and if so + // check if fetched cache results was indeed fetched or it was an error and if so // handle the data accordingly. match cached_results_json { - Ok(results_json) => Ok(serde_json::from_str::<SearchResults>(&results_json).unwrap()), + Ok(results) => Ok(serde_json::from_str::<SearchResults>(&results).unwrap()), Err(_) => { - let mut results_json: crate::results::aggregation_models::SearchResults = - aggregate(query, page, config.aggregator.random_delay, config.debug).await?; - results_json.add_style(config.style.clone()); - redis_cache.cache_results(serde_json::to_string(&results_json)?, &url)?; - Ok(results_json) + // check if the cookie value is empty or not if it is empty then use the + // default selected upstream search engines from the config file otherwise + // parse the non-empty cookie and grab the user selected engines from the + // UI and use that. + let mut results: crate::results::aggregation_models::SearchResults = match req + .cookie("appCookie") + { + Some(cookie_value) => { + let cookie_value: Cookie = serde_json::from_str(cookie_value.name_value().1)?; + aggregate( + query, + page, + config.aggregator.random_delay, + config.debug, + cookie_value.engines, + ) + .await? + } + None => { + aggregate( + query, + page, + config.aggregator.random_delay, + config.debug, + config.upstream_search_engines.clone(), + ) + .await? + } + }; + results.add_style(config.style.clone()); + redis_cache.cache_results(serde_json::to_string(&results)?, &url)?; + Ok(results) } } } diff --git a/websurfx/config.lua b/websurfx/config.lua index 3e2167a..5d64ee1 100644 --- a/websurfx/config.lua +++ b/websurfx/config.lua @@ -5,7 +5,7 @@ debug = false -- an option to enable or disable debug mode. -- ### Server ### port = "8080" -- port on which server should be launched binding_ip = "127.0.0.1" --ip address on the which server should be launched. -production_use = false -- whether to use production mode or not (in other words this option should be used if it is to be used to host it on the server to provide a service to a large number of users) +production_use = false -- whether to use production mode or not (in other words this option should be used if it is to be used to host it on the server to provide a service to a large number of users (more than one)) -- if production_use is set to true -- There will be a random delay before sending the request to the search engines, this is to prevent DDoSing the upstream search engines from a large number of simultaneous requests.
@@ -26,3 +26,6 @@ theme = "simple" -- the theme name which should be used for the website -- ### Caching ### redis_url = "redis://127.0.0.1:8082" -- redis connection url address on which the client should connect on. + +-- ### Search Engines ### +upstream_search_engines = { DuckDuckGo = true, Searx = false } -- select the upstream search engines from which the results should be fetched.
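Note (not part of the patch above): a self-contained sketch of how the new upstream_search_engines table in config.lua ends up selecting engine implementations in the aggregator. The hard-coded HashMap below stands in for what rlua hands back from globals.get::<_, HashMap<String, bool>>(...) in Config::parse; the engine names mirror the config.lua line added by this change.

use std::collections::HashMap;

fn main() {
    // What the Lua table `{ DuckDuckGo = true, Searx = false }` looks like after parsing.
    let from_lua: HashMap<String, bool> =
        HashMap::from([("DuckDuckGo".to_string(), true), ("Searx".to_string(), false)]);

    // Same filter_map as in Config::parse: keep only the engine names that are set to true.
    let upstream_search_engines: Vec<String> = from_lua
        .into_iter()
        .filter_map(|(key, value)| value.then_some(key))
        .collect();

    // The aggregator lowercases each name before matching, so "DuckDuckGo" in config.lua
    // selects the duckduckgo::DuckDuckGo implementation and "Searx" selects searx::Searx.
    for engine in &upstream_search_engines {
        match engine.to_lowercase().as_str() {
            "duckduckgo" => println!("would spawn the DuckDuckGo task"),
            "searx" => println!("would spawn the Searx task"),
            _ => panic!("Config Error: Incorrect config file option provided"),
        }
    }
}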
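Note (not part of the patch above): a minimal sketch of how a hypothetical additional engine could plug into the SearchEngine trait introduced in engine_models.rs, reusing the provided fetch_html_from_upstream method. The engine name, URL and selector are placeholders (not a real upstream engine), and the sketch assumes it lives next to the other engine modules under src/engines; it also reuses the link text as the description just to stay short.

use std::collections::HashMap;

use reqwest::header::{HeaderMap, USER_AGENT};
use scraper::{Html, Selector};

use crate::results::aggregation_models::RawSearchResult;

use super::engine_models::{EngineError, SearchEngine};
use error_stack::{IntoReport, Report, Result, ResultExt};

/// Hypothetical engine used only to illustrate the trait; not part of this change.
pub struct ExampleEngine;

#[async_trait::async_trait]
impl SearchEngine for ExampleEngine {
    async fn results(
        &self,
        query: String,
        page: u32,
        user_agent: String,
    ) -> Result<HashMap<String, RawSearchResult>, EngineError> {
        // Placeholder URL; a real engine would build its own query/pagination format here.
        let url: String = format!("https://example-search.invalid/search?q={query}&page={page}");

        let mut header_map = HeaderMap::new();
        header_map.insert(
            USER_AGENT,
            user_agent
                .parse()
                .into_report()
                .change_context(EngineError::UnexpectedError)?,
        );

        // The default trait method handles the request, the 30 second timeout and error mapping.
        let document: Html = Html::parse_document(
            &ExampleEngine::fetch_html_from_upstream(self, url, header_map).await?,
        );

        // Placeholder selector; real engines use their own result markup (see duckduckgo.rs/searx.rs).
        let link: Selector = Selector::parse("a.result")
            .map_err(|_| Report::new(EngineError::UnexpectedError))
            .attach_printable_lazy(|| format!("invalid CSS selector: {}", "a.result"))?;

        // Key each RawSearchResult by its URL, mirroring the DuckDuckGo and Searx implementations.
        Ok(document
            .select(&link)
            .filter_map(|result| {
                let title = result.inner_html().trim().to_string();
                result.value().attr("href").map(|href| {
                    (
                        href.to_string(),
                        RawSearchResult::new(
                            title.clone(),
                            href.to_string(),
                            title,
                            vec!["example".to_string()],
                        ),
                    )
                })
            })
            .collect())
    }
}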
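Note (not part of the patch above): the appCookie handling added in routes.rs deserializes the cookie's value with serde_json into the new Cookie struct, so the frontend is expected to store a JSON object shaped like the string below. The concrete theme/colorscheme values are illustrative placeholders, and the standalone struct/main here (using serde and serde_json as the project already does) only mirrors the shape for demonstration.

use serde::Deserialize;

/// Mirrors the `Cookie` struct added in src/server/routes.rs.
#[derive(Deserialize)]
struct Cookie {
    theme: String,
    colorscheme: String,
    engines: Vec<String>,
}

fn main() -> Result<(), serde_json::Error> {
    // Example value of the `appCookie` cookie; the actual values are whatever the UI stores.
    let raw = r#"{"theme":"simple","colorscheme":"catppuccin-mocha","engines":["DuckDuckGo","Searx"]}"#;
    let cookie: Cookie = serde_json::from_str(raw)?;
    // The engine names are lowercased later by the aggregator, so their casing does not matter here.
    println!(
        "selected engines: {:?} (theme: {}, colorscheme: {})",
        cookie.engines, cookie.theme, cookie.colorscheme
    );
    Ok(())
}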