diff --git a/Cargo.lock b/Cargo.lock index b5dcc6e..ce683cb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -57,6 +57,18 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "actix-governor" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46ff2d40f2bc627b8054c5e20fa6b0b0cf9428699b54bd41634e9ae3098ad555" +dependencies = [ + "actix-http", + "actix-web", + "futures 0.3.28", + "governor", +] + [[package]] name = "actix-http" version = "3.4.0" @@ -590,7 +602,7 @@ version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "888604f00b3db336d2af898ec3c1d5d0ddf5e6d462220f2ededc33a87ac4bbd5" dependencies = [ - "time 0.1.45", + "time 0.1.43", "url 1.7.2", ] @@ -618,7 +630,7 @@ dependencies = [ "publicsuffix", "serde", "serde_json", - "time 0.1.45", + "time 0.1.43", "try_from", "url 1.7.2", ] @@ -817,6 +829,19 @@ dependencies = [ "syn 2.0.32", ] +[[package]] +name = "dashmap" +version = "5.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "978747c1d849a7d2ee5e8adc0159961c48fb7e5db2f06af6723b80123bb53856" +dependencies = [ + "cfg-if 1.0.0", + "hashbrown 0.14.0", + "lock_api 0.4.10", + "once_cell", + "parking_lot_core 0.9.8", +] + [[package]] name = "deranged" version = "0.3.8" @@ -1162,6 +1187,12 @@ version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65" +[[package]] +name = "futures-timer" +version = "3.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c" + [[package]] name = "futures-util" version = "0.3.28" @@ -1225,6 +1256,24 @@ version = "0.28.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6fb8d784f27acf97159b40fc4db5ecd8aa23b9ad5ef69cdd136d3bc80665f0c0" +[[package]] +name = "governor" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c390a940a5d157878dd057c78680a33ce3415bcd05b4799509ea44210914b4d5" +dependencies = [ + "cfg-if 1.0.0", + "dashmap", + "futures 0.3.28", + "futures-timer", + "no-std-compat", + "nonzero_ext", + "parking_lot 0.12.1", + "quanta", + "rand 0.8.5", + "smallvec 1.11.0", +] + [[package]] name = "h2" version = "0.1.26" @@ -1289,6 +1338,12 @@ version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" +[[package]] +name = "hashbrown" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a" + [[package]] name = "hermit-abi" version = "0.3.2" @@ -1410,7 +1465,7 @@ dependencies = [ "log", "net2", "rustc_version 0.2.3", - "time 0.1.45", + "time 0.1.43", "tokio 0.1.22", "tokio-buf", "tokio-executor", @@ -1511,7 +1566,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" dependencies = [ "autocfg 1.1.0", - "hashbrown", + "hashbrown 0.12.3", ] [[package]] @@ -1672,6 +1727,15 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" +[[package]] +name = "mach" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b823e83b2affd8f40a9ee8c29dbc56404c1e34cd2710921f2801e2cf29527afa" +dependencies = [ + "libc", +] + [[package]] name = "markup5ever" version = "0.8.1" @@ -1887,6 +1951,18 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab250442c86f1850815b5d268639dff018c0627022bc1940eb2d642ca1ce12f0" +[[package]] +name = "no-std-compat" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b93853da6d84c2e3c7d730d6473e8817692dd89be387eb01b94d7f108ecb5b8c" + +[[package]] +name = "nonzero_ext" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38bf9645c8b145698bb0b18a4637dcacbc421ea49bef2317e4fd8065a387cf21" + [[package]] name = "num-traits" version = "0.2.16" @@ -2307,6 +2383,22 @@ dependencies = [ "url 2.4.1", ] +[[package]] +name = "quanta" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20afe714292d5e879d8b12740aa223c6a88f118af41870e8b6196e39a02238a8" +dependencies = [ + "crossbeam-utils 0.8.16", + "libc", + "mach", + "once_cell", + "raw-cpuid", + "wasi 0.10.2+wasi-snapshot-preview1", + "web-sys", + "winapi 0.3.9", +] + [[package]] name = "quote" version = "0.6.13" @@ -2461,6 +2553,15 @@ dependencies = [ "rand_core 0.3.1", ] +[[package]] +name = "raw-cpuid" +version = "10.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c297679cb867470fa8c9f67dbba74a78d78e3e98d7cf2b08d6d71540f797332" +dependencies = [ + "bitflags 1.3.2", +] + [[package]] name = "rayon" version = "1.7.0" @@ -2583,7 +2684,7 @@ dependencies = [ "serde", "serde_json", "serde_urlencoded 0.5.5", - "time 0.1.45", + "time 0.1.43", "tokio 0.1.22", "tokio-executor", "tokio-io", @@ -3157,12 +3258,11 @@ checksum = "3bf63baf9f5039dadc247375c29eb13706706cfde997d0330d05aa63a77d8820" [[package]] name = "time" -version = "0.1.45" +version = "0.1.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a" +checksum = "ca8a50ef2360fbd1eeb0ecd46795a87a19024eb4b53c5dc916ca1fd95fe62438" dependencies = [ "libc", - "wasi 0.10.0+wasi-snapshot-preview1", "winapi 0.3.9", ] @@ -3609,9 +3709,9 @@ dependencies = [ [[package]] name = "wasi" -version = "0.10.0+wasi-snapshot-preview1" +version = "0.10.2+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" +checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" [[package]] name = "wasi" @@ -3701,6 +3801,7 @@ version = "0.20.7" dependencies = [ "actix-cors", "actix-files", + "actix-governor", "actix-web", "async-once-cell", "async-trait", diff --git a/Cargo.toml b/Cargo.toml index 5e94aa2..d686dd5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,6 +32,7 @@ futures = {version="0.3.28"} dhat = {version="0.3.2", optional = true} mimalloc = { version = "0.1.38", default-features = false } async-once-cell = {version="0.5.3"} +actix-governor = {version="0.4.1"} [dev-dependencies] rusty-hook = "^0.11.2" diff --git a/src/cache/cacher.rs b/src/cache/cacher.rs index b2508b5..57351cd 100644 --- a/src/cache/cacher.rs +++ b/src/cache/cacher.rs @@ -10,17 +10,14 @@ use super::error::PoolError; /// A named struct which stores the redis Connection url address to which the client will /// connect to. 
-///
-/// # Fields
-///
-/// * `connection_pool` - It stores a pool of connections ready to be used.
-/// * `pool_size` - It stores the size of the connection pool (in other words the number of
-/// connections that should be stored in the pool).
-/// * `current_connection` - It stores the index of which connection is being used at the moment.
 #[derive(Clone)]
 pub struct RedisCache {
+    /// It stores a pool of connections ready to be used.
     connection_pool: Vec<ConnectionManager>,
+    /// It stores the size of the connection pool (in other words the number of
+    /// connections that should be stored in the pool).
     pool_size: u8,
+    /// It stores the index of which connection is being used at the moment.
     current_connection: u8,
 }
diff --git a/src/cache/error.rs b/src/cache/error.rs
index efd87c9..8bdb977 100644
--- a/src/cache/error.rs
+++ b/src/cache/error.rs
@@ -5,15 +5,12 @@
 use std::fmt;
 use redis::RedisError;
 
 /// A custom error type used for handling redis async pool associated errors.
-///
-/// This enum provides variants three different categories of errors:
-/// * `RedisError` - This variant handles all errors related to `RedisError`,
-/// * `PoolExhaustionWithConnectionDropError` - This variant handles the error
-/// which occurs when all the connections in the connection pool return a connection
-/// dropped redis error.
 #[derive(Debug)]
 pub enum PoolError {
+    /// This variant handles all errors related to `RedisError`.
     RedisError(RedisError),
+    /// This variant handles the errors which occur when all the connections
+    /// in the connection pool return a connection dropped redis error.
     PoolExhaustionWithConnectionDropError,
 }
diff --git a/src/cache/mod.rs b/src/cache/mod.rs
index 03c4155..f40369f 100644
--- a/src/cache/mod.rs
+++ b/src/cache/mod.rs
@@ -1,2 +1,5 @@
+//! This module provides the modules which provide the functionality to cache the aggregated
+//! results fetched from the upstream search engines in a json format.
+
 pub mod cacher;
 pub mod error;
diff --git a/src/config/mod.rs b/src/config/mod.rs
index 11ce559..babc54f 100644
--- a/src/config/mod.rs
+++ b/src/config/mod.rs
@@ -1,2 +1,4 @@
+//! This module provides the modules which handle the functionality to parse the lua config
+//! and convert the config options into rust readable form.
+
 pub mod parser;
-pub mod parser_models;
diff --git a/src/config/parser.rs b/src/config/parser.rs
index f5e6d48..782b026 100644
--- a/src/config/parser.rs
+++ b/src/config/parser.rs
@@ -3,52 +3,42 @@
 use crate::handler::paths::{file_path, FileType};
 
-use super::parser_models::Style;
+use crate::models::parser_models::{AggregatorConfig, RateLimiter, Style};
 use log::LevelFilter;
 use mlua::Lua;
 use std::{collections::HashMap, fs, thread::available_parallelism};
 
 /// A named struct which stores the parsed config file options.
-///
-/// # Fields
-//
-/// * `port` - It stores the parsed port number option on which the server should launch.
-/// * `binding_ip` - It stores the parsed ip address option on which the server should launch
-/// * `style` - It stores the theming options for the website.
-/// * `redis_url` - It stores the redis connection url address on which the redis
-/// client should connect.
-/// * `aggregator` - It stores the option to whether enable or disable production use.
-/// * `logging` - It stores the option to whether enable or disable logs.
-/// * `debug` - It stores the option to whether enable or disable debug mode.
-/// * `upstream_search_engines` - It stores all the engine names that were enabled by the user.
-/// * `request_timeout` - It stores the time (secs) which controls the server request timeout.
-/// * `threads` - It stores the number of threads which controls the app will use to run.
 #[derive(Clone)]
 pub struct Config {
+    /// It stores the parsed port number option on which the server should launch.
     pub port: u16,
+    /// It stores the parsed ip address option on which the server should launch.
     pub binding_ip: String,
+    /// It stores the theming options for the website.
     pub style: Style,
+    /// It stores the redis connection url address on which the redis
+    /// client should connect.
     pub redis_url: String,
+    /// It stores the option to enable or disable production use.
     pub aggregator: AggregatorConfig,
+    /// It stores the option to enable or disable logs.
     pub logging: bool,
+    /// It stores the option to enable or disable debug mode.
     pub debug: bool,
-    pub upstream_search_engines: Vec<EngineHandler>,
+    /// It stores all the engine names that were enabled by the user.
+    pub upstream_search_engines: Vec<crate::models::engine_models::EngineHandler>,
+    /// It stores the time (secs) which controls the server request timeout.
     pub request_timeout: u8,
+    /// It stores the number of threads which the app will use to run.
     pub threads: u8,
+    /// It stores configuration options for the rate limiting middleware.
+    pub rate_limiter: RateLimiter,
+    /// It stores the level of safe search to be used for restricting content in the
+    /// search results.
     pub safe_search: u8,
 }
 
-/// Configuration options for the aggregator.
-///
-/// # Fields
-///
-/// * `random_delay` - It stores the option to whether enable or disable random delays between
-/// requests.
-#[derive(Clone)]
-pub struct AggregatorConfig {
-    pub random_delay: bool,
-}
-
 impl Config {
     /// A function which parses the config.lua file and puts all the parsed options in the newly
     /// constructed Config struct and returns it.
@@ -90,6 +80,8 @@ impl Config {
             parsed_threads
         };
 
+        let rate_limiter = globals.get::<_, HashMap<String, u8>>("rate_limiter")?;
+
         let parsed_safe_search: u8 = globals.get::<_, u8>("safe_search")?;
         let safe_search: u8 = match parsed_safe_search {
             0..=4 => parsed_safe_search,
@@ -117,16 +109,25 @@ impl Config {
                 .get::<_, HashMap<String, bool>>("upstream_search_engines")?
                 .into_iter()
                 .filter_map(|(key, value)| value.then_some(key))
-                .filter_map(|engine| crate::engines::engine_models::EngineHandler::new(&engine))
+                .filter_map(|engine| crate::models::engine_models::EngineHandler::new(&engine))
                 .collect(),
             request_timeout: globals.get::<_, u8>("request_timeout")?,
             threads,
+            rate_limiter: RateLimiter {
+                number_of_requests: rate_limiter["number_of_requests"],
+                time_limit: rate_limiter["time_limit"],
+            },
             safe_search,
         })
     }
 }
 
 /// A helper function that sets the proper logging level
+///
+/// # Arguments
+///
+/// * `debug` - It takes the option to enable or disable debug mode.
+/// * `logging` - It takes the option to enable or disable logs.
 fn set_logging_level(debug: bool, logging: bool) {
     if let Ok(pkg_env_var) = std::env::var("PKG_ENV") {
         if pkg_env_var.to_lowercase() == "dev" {
diff --git a/src/engines/duckduckgo.rs b/src/engines/duckduckgo.rs
index 7b9f7d6..0f06ea4 100644
--- a/src/engines/duckduckgo.rs
+++ b/src/engines/duckduckgo.rs
@@ -7,9 +7,9 @@
 use std::collections::HashMap;
 
 use reqwest::header::HeaderMap;
 use scraper::{Html, Selector};
 
-use crate::results::aggregation_models::SearchResult;
+use crate::models::aggregation_models::SearchResult;
-use super::engine_models::{EngineError, SearchEngine};
+use crate::models::engine_models::{EngineError, SearchEngine};
 
 use error_stack::{Report, Result, ResultExt};
@@ -19,24 +19,6 @@ pub struct DuckDuckGo;
 
 #[async_trait::async_trait]
 impl SearchEngine for DuckDuckGo {
-    /// This function scrapes results from the upstream engine duckduckgo and puts all the scraped
-    /// results like title, visiting_url (href in html),engine (from which engine it was fetched from)
-    /// and description in a RawSearchResult and then adds that to HashMap whose keys are url and
-    /// values are RawSearchResult struct and then returns it within a Result enum.
-    ///
-    /// # Arguments
-    ///
-    /// * `query` - Takes the user provided query to query to the upstream search engine with.
-    /// * `page` - Takes an u32 as an argument.
-    /// * `user_agent` - Takes a random user agent string as an argument.
-    /// * `request_timeout` - Takes a time (secs) as a value which controls the server request timeout.
-    ///
-    /// # Errors
-    ///
-    /// Returns an `EngineErrorKind` if the user is not connected to the internet or if their is failure to
-    /// reach the above `upstream search engine` page or if the `upstream search engine` is unable to
-    /// provide results for the requested search query and also returns error if the scraping selector
-    /// or HeaderMap fails to initialize.
     async fn results(
         &self,
         query: &str,
diff --git a/src/engines/engine_models.rs b/src/engines/engine_models.rs
deleted file mode 100644
index f4e7e5a..0000000
--- a/src/engines/engine_models.rs
+++ /dev/null
@@ -1,108 +0,0 @@
-//! This module provides the error enum to handle different errors associated while requesting data from
-//! the upstream search engines with the search query provided by the user.
-
-use crate::results::aggregation_models::SearchResult;
-use error_stack::{Result, ResultExt};
-use std::{collections::HashMap, fmt, time::Duration};
-
-/// A custom error type used for handle engine associated errors.
-///
-/// This enum provides variants three different categories of errors:
-/// * `RequestError` - This variant handles all request related errors like forbidden, not found,
-/// etc.
-/// * `EmptyResultSet` - This variant handles the not results found error provide by the upstream
-/// search engines.
-/// * `UnexpectedError` - This variant handles all the errors which are unexpected or occur rarely
-/// and are errors mostly related to failure in initialization of HeaderMap, Selector errors and
-/// all other errors occurring within the code handling the `upstream search engines`.
-#[derive(Debug)]
-pub enum EngineError {
-    EmptyResultSet,
-    RequestError,
-    UnexpectedError,
-}
-
-impl fmt::Display for EngineError {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        match self {
-            EngineError::EmptyResultSet => {
-                write!(f, "The upstream search engine returned an empty result set")
-            }
-            EngineError::RequestError => {
-                write!(
-                    f,
-                    "Error occurred while requesting data from upstream search engine"
-                )
-            }
-            EngineError::UnexpectedError => {
-                write!(f, "An unexpected error occurred while processing the data")
-            }
-        }
-    }
-}
-
-impl error_stack::Context for EngineError {}
-
-/// A trait to define common behavior for all search engines.
-#[async_trait::async_trait]
-pub trait SearchEngine: Sync + Send {
-    async fn fetch_html_from_upstream(
-        &self,
-        url: &str,
-        header_map: reqwest::header::HeaderMap,
-        request_timeout: u8,
-    ) -> Result<String, EngineError> {
-        // fetch the html from upstream search engine
-        Ok(reqwest::Client::new()
-            .get(url)
-            .timeout(Duration::from_secs(request_timeout as u64)) // Add timeout to request to avoid DDOSing the server
-            .headers(header_map) // add spoofed headers to emulate human behavior
-            .send()
-            .await
-            .change_context(EngineError::RequestError)?
-            .text()
-            .await
-            .change_context(EngineError::RequestError)?)
-    }
-
-    async fn results(
-        &self,
-        query: &str,
-        page: u32,
-        user_agent: &str,
-        request_timeout: u8,
-        safe_search: u8,
-    ) -> Result<HashMap<String, SearchResult>, EngineError>;
-}
-
-pub struct EngineHandler {
-    engine: Box<dyn SearchEngine>,
-    name: &'static str,
-}
-
-impl Clone for EngineHandler {
-    fn clone(&self) -> Self {
-        Self::new(self.name).unwrap()
-    }
-}
-
-impl EngineHandler {
-    /// parses an engine name into an engine handler, returns none if the engine is unknown
-    pub fn new(engine_name: &str) -> Option<Self> {
-        let engine: (&'static str, Box<dyn SearchEngine>) =
-            match engine_name.to_lowercase().as_str() {
-                "duckduckgo" => ("duckduckgo", Box::new(super::duckduckgo::DuckDuckGo)),
-                "searx" => ("searx", Box::new(super::searx::Searx)),
-                _ => return None,
-            };
-
-        Some(Self {
-            engine: engine.1,
-            name: engine.0,
-        })
-    }
-
-    pub fn into_name_engine(self) -> (&'static str, Box<dyn SearchEngine>) {
-        (self.name, self.engine)
-    }
-}
diff --git a/src/engines/mod.rs b/src/engines/mod.rs
index f9bb8ad..0016728 100644
--- a/src/engines/mod.rs
+++ b/src/engines/mod.rs
@@ -1,3 +1,7 @@
+//! This module provides different modules which handle the functionality to fetch results from the
+//! upstream search engines based on user requested queries. Also provides different models to
+//! provide standard functions to be implemented for all the upstream search engine handling
+//! code. Moreover, it also provides a custom error for the upstream search engine handling code.
+
 pub mod duckduckgo;
-pub mod engine_models;
 pub mod searx;
diff --git a/src/engines/searx.rs b/src/engines/searx.rs
index 4eb22c5..6ab0469 100644
--- a/src/engines/searx.rs
+++ b/src/engines/searx.rs
@@ -6,9 +6,8 @@
 use reqwest::header::HeaderMap;
 use scraper::{Html, Selector};
 use std::collections::HashMap;
 
-use crate::results::aggregation_models::SearchResult;
-
-use super::engine_models::{EngineError, SearchEngine};
+use crate::models::aggregation_models::SearchResult;
+use crate::models::engine_models::{EngineError, SearchEngine};
 use error_stack::{Report, Result, ResultExt};
 
 /// A new Searx engine type defined in-order to implement the `SearchEngine` trait which allows to
@@ -17,25 +16,6 @@ pub struct Searx;
 
 #[async_trait::async_trait]
 impl SearchEngine for Searx {
-    /// This function scrapes results from the upstream engine duckduckgo and puts all the scraped
-    /// results like title, visiting_url (href in html),engine (from which engine it was fetched from)
-    /// and description in a RawSearchResult and then adds that to HashMap whose keys are url and
-    /// values are RawSearchResult struct and then returns it within a Result enum.
-    ///
-    /// # Arguments
-    ///
-    /// * `query` - Takes the user provided query to query to the upstream search engine with.
-    /// * `page` - Takes an u32 as an argument.
-    /// * `user_agent` - Takes a random user agent string as an argument.
-    /// * `request_timeout` - Takes a time (secs) as a value which controls the server request timeout.
-    ///
-    /// # Errors
-    ///
-    /// Returns an `EngineErrorKind` if the user is not connected to the internet or if their is failure to
-    /// reach the above `upstream search engine` page or if the `upstream search engine` is unable to
-    /// provide results for the requested search query and also returns error if the scraping selector
-    /// or HeaderMap fails to initialize.
-
     async fn results(
         &self,
         query: &str,
diff --git a/src/handler/mod.rs b/src/handler/mod.rs
index 8118b29..188767d 100644
--- a/src/handler/mod.rs
+++ b/src/handler/mod.rs
@@ -1 +1,5 @@
+//! This module provides modules which provide the functionality to handle paths for the
+//! different files which may be present on several paths, providing the one appropriate path
+//! at which the file is present so it can be used.
+
 pub mod paths;
diff --git a/src/handler/paths.rs b/src/handler/paths.rs
index 91f7f94..9ea5fff 100644
--- a/src/handler/paths.rs
+++ b/src/handler/paths.rs
@@ -7,42 +7,46 @@
 use std::path::Path;
 use std::sync::OnceLock;
 
 // ------- Constants --------
-static PUBLIC_DIRECTORY_NAME: &str = "public";
-static COMMON_DIRECTORY_NAME: &str = "websurfx";
-static CONFIG_FILE_NAME: &str = "config.lua";
-static ALLOWLIST_FILE_NAME: &str = "allowlist.txt";
-static BLOCKLIST_FILE_NAME: &str = "blocklist.txt";
+/// The constant holding the name of the theme folder.
+const PUBLIC_DIRECTORY_NAME: &str = "public";
+/// The constant holding the name of the common folder.
+const COMMON_DIRECTORY_NAME: &str = "websurfx";
+/// The constant holding the name of the config file.
+const CONFIG_FILE_NAME: &str = "config.lua";
+/// The constant holding the name of the AllowList text file.
+const ALLOWLIST_FILE_NAME: &str = "allowlist.txt";
+/// The constant holding the name of the BlockList text file.
+const BLOCKLIST_FILE_NAME: &str = "blocklist.txt";
 
+/// An enum type which provides different variants to handle paths for various files/folders.
 #[derive(Hash, PartialEq, Eq, Debug)]
 pub enum FileType {
+    /// This variant handles all the paths associated with the config file.
     Config,
+    /// This variant handles all the paths associated with the Allowlist text file.
     AllowList,
+    /// This variant handles all the paths associated with the BlockList text file.
     BlockList,
+    /// This variant handles all the paths associated with the public folder (Theme folder).
     Theme,
 }
 
+/// A static variable which stores the different filesystem paths for various file/folder types.
 static FILE_PATHS_FOR_DIFF_FILE_TYPES: OnceLock<HashMap<FileType, Vec<String>>> = OnceLock::new();
 
-/// A helper function which returns an appropriate config file path checking if the config
-/// file exists on that path.
+/// A function which returns an appropriate path for the provided file type by checking if the path
+/// for the given file type exists on that path.
 ///
 /// # Error
 ///
-/// Returns a `config file not found!!` error if the config file is not present under following
-/// paths which are:
-/// 1. `~/.config/websurfx/` if it not present here then it fallbacks to the next one (2)
-/// 2. `/etc/xdg/websurfx/config.lua` if it is not present here then it fallbacks to the next
-/// one (3).
-/// 3. `websurfx/` (under project folder ( or codebase in other words)) if it is not present
-/// here then it returns an error as mentioned above.
-
-/// A function which returns an appropriate theme directory path checking if the theme
-/// directory exists on that path.
+/// Returns a `<file_name> folder/file not found!!` error if the given file_type folder/file is not
+/// present on the path on which it is being tested.
 ///
-/// # Error
+/// # Example
+///
+/// If this function is given the file_type of the Theme variant then the theme folder is checked by the
+/// following steps:
 ///
-/// Returns a `Theme (public) folder not found!!` error if the theme folder is not present under following
-/// paths which are:
 /// 1. `/opt/websurfx` if it is not present here then it falls back to the next one (2)
 /// 2. Under project folder ( or codebase in other words) if it is not present
 /// here then it returns an error as mentioned above.
@@ -110,6 +114,6 @@ pub fn file_path(file_type: FileType) -> Result<&'static str, Error> {
     // if none of the configs above exist, return error
     Err(Error::new(
         std::io::ErrorKind::NotFound,
-        format!("{:?} file not found!!", file_type),
+        format!("{:?} file/folder not found!!", file_type),
     ))
 }
diff --git a/src/lib.rs b/src/lib.rs
index 97bff01..8c74e6a 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,25 +1,26 @@
 //! This main library module provides the functionality to provide and handle the Tcp server
 //! and register all the routes for the `websurfx` meta search engine website.
+#![forbid(unsafe_code, clippy::panic)]
+#![deny(missing_docs, clippy::missing_docs_in_private_items, clippy::perf)]
+#![warn(clippy::cognitive_complexity, rust_2018_idioms)]
+
 pub mod cache;
 pub mod config;
 pub mod engines;
 pub mod handler;
+pub mod models;
 pub mod results;
 pub mod server;
 
 use std::net::TcpListener;
 
-use crate::server::routes;
+use crate::server::router;
 
 use actix_cors::Cors;
 use actix_files as fs;
-use actix_web::{
-    dev::Server,
-    http::header,
-    middleware::{Compress, Logger},
-    web, App, HttpServer,
-};
+use actix_governor::{Governor, GovernorConfigBuilder};
+use actix_web::{dev::Server, http::header, middleware::Logger, web, App, HttpServer};
 use config::parser::Config;
 use handlebars::Handlebars;
 use handler::paths::{file_path, FileType};
@@ -45,7 +46,7 @@
 /// let server = run(listener,config).expect("Failed to start server");
 /// ```
 pub fn run(listener: TcpListener, config: Config) -> std::io::Result<Server> {
-    let mut handlebars: Handlebars = Handlebars::new();
+    let mut handlebars: Handlebars<'_> = Handlebars::new();
 
     let public_folder_path: &str = file_path(FileType::Theme)?;
 
@@ -53,7 +54,7 @@
         .register_templates_directory(".html", format!("{}/templates", public_folder_path))
         .unwrap();
 
-    let handlebars_ref: web::Data<Handlebars> = web::Data::new(handlebars);
+    let handlebars_ref: web::Data<Handlebars<'_>> = web::Data::new(handlebars);
 
     let cloned_config_threads_opt: u8 = config.threads;
 
@@ -69,11 +70,17 @@
         ]);
 
         App::new()
+            .wrap(Logger::default()) // added logging middleware for logging.
            .app_data(handlebars_ref.clone())
            .app_data(web::Data::new(config.clone()))
            .wrap(cors)
-            .wrap(Logger::default()) // added logging middleware for logging.
-            .wrap(Compress::default()) // compress request headers to reduce memory usage.
+            .wrap(Governor::new(
+                &GovernorConfigBuilder::default()
+                    .per_second(config.rate_limiter.time_limit as u64)
+                    .burst_size(config.rate_limiter.number_of_requests as u32)
+                    .finish()
+                    .unwrap(),
+            ))
            // Serve images and static files (css and js files).
            .service(
                fs::Files::new("/static", format!("{}/static", public_folder_path))
                    .show_files_listing(),
            )
            .service(
                fs::Files::new("/images", format!("{}/images", public_folder_path))
                    .show_files_listing(),
            )
-            .service(routes::robots_data) // robots.txt
-            .service(routes::index) // index page
-            .service(routes::search) // search page
-            .service(routes::about) // about page
-            .service(routes::settings) // settings page
-            .default_service(web::route().to(routes::not_found)) // error page
+            .service(router::robots_data) // robots.txt
+            .service(router::index) // index page
+            .service(server::routes::search::search) // search page
+            .service(router::about) // about page
+            .service(router::settings) // settings page
+            .default_service(web::route().to(router::not_found)) // error page
    })
    .workers(cloned_config_threads_opt as usize)
    // Start server on 127.0.0.1 with the user provided port number. for example 127.0.0.1:8080.
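Editor's note on the middleware wired above — a minimal, self-contained sketch (assuming actix-governor 0.4, the version this PR pins in Cargo.toml) of what the two `rate_limiter` values mean to the builder: `per_second(n)` sets the replenish interval (a client earns one request back every `n` seconds), while `burst_size(m)` caps how many requests may be spent at once; requests beyond that budget are rejected, with HTTP 429 by default.

```rust
use actix_governor::{Governor, GovernorConfigBuilder};

fn main() {
    // Mirrors the values added to websurfx/config.lua at the end of this diff:
    // a client may burst up to 20 requests and then regains one request
    // every 3 seconds.
    let governor_conf = GovernorConfigBuilder::default()
        .per_second(3) // `time_limit` - replenish interval in seconds
        .burst_size(20) // `number_of_requests` - maximum burst size
        .finish() // yields None if either value is zero
        .unwrap();
    // In `run` above this is attached per worker via `.wrap(Governor::new(&governor_conf))`.
    let _middleware = Governor::new(&governor_conf);
}
```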
diff --git a/src/results/aggregation_models.rs b/src/models/aggregation_models.rs
similarity index 71%
rename from src/results/aggregation_models.rs
rename to src/models/aggregation_models.rs
index 280767c..ea4a914 100644
--- a/src/results/aggregation_models.rs
+++ b/src/models/aggregation_models.rs
@@ -4,25 +4,22 @@
 use serde::{Deserialize, Serialize};
 use smallvec::SmallVec;
 
-use crate::{config::parser_models::Style, engines::engine_models::EngineError};
+use super::{engine_models::EngineError, parser_models::Style};
 
 /// A named struct to store the raw scraped search results from the
 /// upstream search engines before aggregating it. It derives the Clone trait which is needed
 /// to write idiomatic rust using `Iterators`.
-///
-/// # Fields
-///
-/// * `title` - The title of the search result.
-/// * `url` - The url which is accessed when clicked on it
-/// (href url in html in simple words).
-/// * `description` - The description of the search result.
-/// * `engine` - The names of the upstream engines from which this results were provided.
-#[derive(Clone, Serialize, Deserialize, Debug)]
+#[derive(Clone, Serialize, Deserialize)]
 #[serde(rename_all = "camelCase")]
 pub struct SearchResult {
+    /// The title of the search result.
     pub title: String,
+    /// The url which is accessed when clicked on it
+    /// (href url in html in simple words).
     pub url: String,
+    /// The description of the search result.
     pub description: String,
+    /// The names of the upstream engines from which this result was provided.
     pub engine: SmallVec<[String; 0]>,
 }
@@ -64,14 +61,27 @@
    }
 }
 
+/// A named struct that stores the error info related to the upstream search engines.
 #[derive(Serialize, Deserialize, Clone)]
 pub struct EngineErrorInfo {
+    /// It stores the error type which occurred while fetching the result from a particular search
+    /// engine.
     pub error: String,
+    /// It stores the name of the engine that failed to provide the requested search results.
     pub engine: String,
+    /// It stores the name of the color to indicate how severe the particular error is (In
+    /// other words it indicates the severity of the error/issue).
     pub severity_color: String,
 }
 
 impl EngineErrorInfo {
+    /// Constructs a new `EngineErrorInfo` with the given arguments needed for the struct.
+    ///
+    /// # Arguments
+    ///
+    /// * `error` - It takes the error type which occurred while fetching the result from a particular
+    /// search engine.
+    /// * `engine` - It takes the name of the engine that failed to provide the requested search results.
     pub fn new(error: &EngineError, engine: &str) -> Self {
        Self {
            error: match error {
@@ -91,25 +101,26 @@
 
 /// A named struct to store, serialize, deserialize all the search results scraped and
 /// aggregated from the upstream search engines.
-///
-/// # Fields
-///
-/// * `results` - Stores the individual serializable `SearchResult` struct into a vector of
-/// `SearchResult` structs.
-/// * `page_query` - Stores the current pages search query `q` provided in the search url.
-/// * `style` - Stores the theming options for the website.
-/// * `engine_errors_info` - Stores the information on which engines failed with their engine name
-/// and the type of error that caused it.
-/// * `empty_result_set` - Stores a boolean which indicates that no engines gave a result for the
-/// given search query.
 #[derive(Serialize, Deserialize, Default)]
 #[serde(rename_all = "camelCase")]
 pub struct SearchResults {
+    /// Stores the individual serializable `SearchResult` struct into a vector of
+    /// `SearchResult` structs.
     pub results: Vec<SearchResult>,
+    /// Stores the current page's search query `q` provided in the search url.
     pub page_query: String,
+    /// Stores the theming options for the website.
     pub style: Style,
+    /// Stores the information on which engines failed with their engine name
+    /// and the type of error that caused it.
     pub engine_errors_info: Vec<EngineErrorInfo>,
+    /// Stores the flag option which holds the check value that the following
+    /// search query was disallowed when the safe search level was set to 4 and it
+    /// was present in the `Blocklist` file.
     pub disallowed: bool,
+    /// Stores the flag option which holds the check value that the following
+    /// search query was filtered when the safe search level was set to 3 and it
+    /// was present in the `Blocklist` file.
     pub filtered: bool,
 }
@@ -122,9 +133,8 @@
     /// and stores it into a vector of `SearchResult` structs.
     /// * `page_query` - Takes an argument of current page's search query `q` provided in
     /// the search url.
-    /// * `empty_result_set` - Takes a boolean which indicates that no engines gave a result for the
-    /// given search query.
-    /// * ``
+    /// * `engine_errors_info` - Takes an array of structs which contains information regarding
+    /// which engines failed with their names, reason and their severity color name.
     pub fn new(
         results: Vec<SearchResult>,
         page_query: &str,
diff --git a/src/models/engine_models.rs b/src/models/engine_models.rs
new file mode 100644
index 0000000..d4a4e72
--- /dev/null
+++ b/src/models/engine_models.rs
@@ -0,0 +1,159 @@
+//! This module provides the error enum to handle different errors associated while requesting data from
+//! the upstream search engines with the search query provided by the user.
+
+use super::aggregation_models::SearchResult;
+use error_stack::{Result, ResultExt};
+use std::{collections::HashMap, fmt, time::Duration};
+
+/// A custom error type used to handle engine associated errors.
+#[derive(Debug)]
+pub enum EngineError {
+    /// This variant handles the no results found error provided by the upstream
+    /// search engines.
+    EmptyResultSet,
+    /// This variant handles all request related errors like forbidden, not found,
+    /// etc.
+    RequestError,
+    /// This variant handles all the errors which are unexpected or occur rarely
+    /// and are errors mostly related to failure in initialization of HeaderMap,
+    /// Selector errors and all other errors occurring within the code handling
+    /// the `upstream search engines`.
+    UnexpectedError,
+}
+
+impl fmt::Display for EngineError {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            EngineError::EmptyResultSet => {
+                write!(f, "The upstream search engine returned an empty result set")
+            }
+            EngineError::RequestError => {
+                write!(
+                    f,
+                    "Error occurred while requesting data from upstream search engine"
+                )
+            }
+            EngineError::UnexpectedError => {
+                write!(f, "An unexpected error occurred while processing the data")
+            }
+        }
+    }
+}
+
+impl error_stack::Context for EngineError {}
+
+/// A trait to define common behavior for all search engines.
+#[async_trait::async_trait]
+pub trait SearchEngine: Sync + Send {
+    /// This helper function fetches/requests the search results from the upstream search engine in
+    /// an html form.
+    ///
+    /// # Arguments
+    ///
+    /// * `url` - It takes the url of the upstream search engine with the user requested search
+    /// query appended in the search parameters.
+    /// * `header_map` - It takes the http request headers to be sent to the upstream engine in
+    /// order to prevent being detected as a bot. It takes the header as a HeaderMap type.
+    /// * `request_timeout` - It takes the request timeout value as seconds which is used to limit
+    /// the amount of time for each request to remain connected until the results can be provided
+    /// by the upstream engine.
+    ///
+    /// # Error
+    ///
+    /// It returns the html data as a string if the upstream engine provides the data as expected
+    /// otherwise it returns a custom `EngineError`.
+    async fn fetch_html_from_upstream(
+        &self,
+        url: &str,
+        header_map: reqwest::header::HeaderMap,
+        request_timeout: u8,
+    ) -> Result<String, EngineError> {
+        // fetch the html from upstream search engine
+        Ok(reqwest::Client::new()
+            .get(url)
+            .timeout(Duration::from_secs(request_timeout as u64)) // Add timeout to request to avoid DDOSing the server
+            .headers(header_map) // add spoofed headers to emulate human behavior
+            .send()
+            .await
+            .change_context(EngineError::RequestError)?
+            .text()
+            .await
+            .change_context(EngineError::RequestError)?)
+    }
+
+    /// This function scrapes results from the upstream engine and puts all the scraped results like
+    /// title, visiting_url (href in html), engine (from which engine it was fetched from) and description
+    /// in a RawSearchResult and then adds that to a HashMap whose keys are url and values are RawSearchResult
+    /// struct and then returns it within a Result enum.
+    ///
+    /// # Arguments
+    ///
+    /// * `query` - Takes the user provided query to query the upstream search engine with.
+    /// * `page` - Takes an u32 as an argument.
+    /// * `user_agent` - Takes a random user agent string as an argument.
+    /// * `request_timeout` - Takes a time (secs) as a value which controls the server request timeout.
+    ///
+    /// # Errors
+    ///
+    /// Returns an `EngineError` if the user is not connected to the internet or if there is a failure to
+    /// reach the above `upstream search engine` page or if the `upstream search engine` is unable to
+    /// provide results for the requested search query and also returns an error if the scraping selector
+    /// or HeaderMap fails to initialize.
+    async fn results(
+        &self,
+        query: &str,
+        page: u32,
+        user_agent: &str,
+        request_timeout: u8,
+        safe_search: u8,
+    ) -> Result<HashMap<String, SearchResult>, EngineError>;
+}
+
+/// A named struct which stores the engine struct with the name of the associated engine.
+pub struct EngineHandler {
+    /// It stores the engine struct wrapped in a box smart pointer as the engine struct implements
+    /// the `SearchEngine` trait.
+    engine: Box<dyn SearchEngine>,
+    /// It stores the name of the engine to which the struct is associated.
+    name: &'static str,
+}
+
+impl Clone for EngineHandler {
+    fn clone(&self) -> Self {
+        Self::new(self.name).unwrap()
+    }
+}
+
+impl EngineHandler {
+    /// Parses an engine name into an engine handler.
+    ///
+    /// # Arguments
+    ///
+    /// * `engine_name` - It takes the name of the engine to which the struct was associated.
+    ///
+    /// # Returns
+    ///
+    /// It returns an option either containing the value or a none if the engine is unknown.
+    pub fn new(engine_name: &str) -> Option<Self> {
+        let engine: (&'static str, Box<dyn SearchEngine>) =
+            match engine_name.to_lowercase().as_str() {
+                "duckduckgo" => (
+                    "duckduckgo",
+                    Box::new(crate::engines::duckduckgo::DuckDuckGo),
+                ),
+                "searx" => ("searx", Box::new(crate::engines::searx::Searx)),
+                _ => return None,
+            };
+
+        Some(Self {
+            engine: engine.1,
+            name: engine.0,
+        })
+    }
+
+    /// This function converts the EngineHandler type into a tuple containing the engine name and
+    /// the associated engine struct.
+    pub fn into_name_engine(self) -> (&'static str, Box<dyn SearchEngine>) {
+        (self.name, self.engine)
+    }
+}
diff --git a/src/models/mod.rs b/src/models/mod.rs
new file mode 100644
index 0000000..6a7d235
--- /dev/null
+++ b/src/models/mod.rs
@@ -0,0 +1,8 @@
+//! This module provides modules which in turn provide various models for aggregating search
+//! results, parsing the config file, providing a trait to standardize search engine handling code,
+//! a custom engine error for the search engine, etc.
+
+pub mod aggregation_models;
+pub mod engine_models;
+pub mod parser_models;
+pub mod server_models;
diff --git a/src/config/parser_models.rs b/src/models/parser_models.rs
similarity index 68%
rename from src/config/parser_models.rs
rename to src/models/parser_models.rs
index 7528715..9dad348 100644
--- a/src/config/parser_models.rs
+++ b/src/models/parser_models.rs
@@ -12,15 +12,12 @@ use serde::{Deserialize, Serialize};
 /// order to allow the deserializing the json back to struct in aggregate function in
 /// aggregator.rs and create a new struct out of it and then serialize it back to json and pass
 /// it to the template files.
-///
-/// # Fields
-//
-/// * `theme` - It stores the parsed theme option used to set a theme for the website.
-/// * `colorscheme` - It stores the parsed colorscheme option used to set a colorscheme for the
-/// theme being used.
 #[derive(Serialize, Deserialize, Clone, Default)]
 pub struct Style {
+    /// It stores the parsed theme option used to set a theme for the website.
     pub theme: String,
+    /// It stores the parsed colorscheme option used to set a colorscheme for the
+    /// theme being used.
     pub colorscheme: String,
 }
 
@@ -36,3 +33,20 @@ impl Style {
         Style { theme, colorscheme }
     }
 }
+
+/// Configuration options for the aggregator.
+#[derive(Clone)]
+pub struct AggregatorConfig {
+    /// It stores the option to enable or disable random delays between
+    /// requests.
+    pub random_delay: bool,
+}
+
+/// Configuration options for the rate limiter middleware.
+#[derive(Clone)]
+pub struct RateLimiter {
+    /// The number of requests that are allowed within a provided time limit.
+    pub number_of_requests: u8,
+    /// The time limit (in seconds) within which the given number of requests should be accepted.
+    pub time_limit: u8,
+}
diff --git a/src/models/server_models.rs b/src/models/server_models.rs
new file mode 100644
index 0000000..3da6717
--- /dev/null
+++ b/src/models/server_models.rs
@@ -0,0 +1,26 @@
+//! This module provides the models to parse cookies and search parameters from the search
+//! engine website.
+use serde::Deserialize;
+
+/// A named struct which deserializes all the user provided search parameters and stores them.
+#[derive(Deserialize)]
+pub struct SearchParams {
+    /// It stores the search parameter option `q` (or query in simple words)
+    /// of the search url.
+    pub q: Option<String>,
+    /// It stores the search parameter `page` (or pageno in simple words)
+    /// of the search url.
+    pub page: Option<u32>,
+}
+
+/// A named struct which is used to deserialize the cookies fetched from the client side.
+#[allow(dead_code)]
+#[derive(Deserialize)]
+pub struct Cookie {
+    /// It stores the theme name used in the website.
+    pub theme: String,
+    /// It stores the colorscheme name used for the website theme.
+    pub colorscheme: String,
+    /// It stores the user selected upstream search engines selected from the UI.
+    pub engines: Vec<String>,
+}
diff --git a/src/results/aggregator.rs b/src/results/aggregator.rs
index 734a65f..8c9be2c 100644
--- a/src/results/aggregator.rs
+++ b/src/results/aggregator.rs
@@ -1,27 +1,23 @@
 //! This module provides the functionality to scrape and gather all the results from the upstream
 //! search engines and then removes duplicate results.
 
+use super::user_agent::random_user_agent;
+use crate::handler::paths::{file_path, FileType};
+use crate::models::{
+    aggregation_models::{EngineErrorInfo, SearchResult, SearchResults},
+    engine_models::{EngineError, EngineHandler},
+};
+use error_stack::Report;
+use rand::Rng;
+use regex::Regex;
 use std::{
     collections::HashMap,
     io::{BufReader, Read},
     time::Duration,
 };
-
-use super::{
-    aggregation_models::{EngineErrorInfo, SearchResult, SearchResults},
-    user_agent::random_user_agent,
-};
-use error_stack::Report;
-use rand::Rng;
-use regex::Regex;
 use std::{fs::File, io::BufRead};
 use tokio::task::JoinHandle;
 
-use crate::{
-    engines::engine_models::{EngineError, EngineHandler},
-    handler::paths::{file_path, FileType},
-};
-
 /// Aliases for long type annotations
 type FutureVec = Vec<JoinHandle<Result<HashMap<String, SearchResult>, Report<EngineError>>>>;
diff --git a/src/results/mod.rs b/src/results/mod.rs
index 0c13442..9ec3229 100644
--- a/src/results/mod.rs
+++ b/src/results/mod.rs
@@ -1,3 +1,6 @@
-pub mod aggregation_models;
+//! This module provides modules that handle the functionality to aggregate the fetched search
+//! results from the upstream search engines and filter them if safe search is set to 3 or 4. Also,
+//! provides various models to aggregate search results into a standardized form.
+
 pub mod aggregator;
 pub mod user_agent;
diff --git a/src/results/user_agent.rs b/src/results/user_agent.rs
index 3bfa05b..ab2811b 100644
--- a/src/results/user_agent.rs
+++ b/src/results/user_agent.rs
@@ -4,6 +4,8 @@
 use std::sync::OnceLock;
 
 use fake_useragent::{Browsers, UserAgents, UserAgentsBuilder};
 
+/// A static variable which stores the initially built `UserAgents` struct, so that it can be reused
+/// again and again without the need of reinitializing the `UserAgents` struct.
 static USER_AGENTS: OnceLock<UserAgents> = OnceLock::new();
 
 /// A function to generate random user agent to improve privacy of the user.
diff --git a/src/server/mod.rs b/src/server/mod.rs
index 6a664ab..7f4274f 100644
--- a/src/server/mod.rs
+++ b/src/server/mod.rs
@@ -1 +1,7 @@
+//! This module provides modules that handle the different routes/paths
+//! for the `websurfx` search engine website. It also handles the parsing of search parameters in
+//! the search route and caches the next, current and previous search results in the search
+//! routes with the help of the redis server.
+
+pub mod router;
 pub mod routes;
diff --git a/src/server/router.rs b/src/server/router.rs
new file mode 100644
index 0000000..69a3ede
--- /dev/null
+++ b/src/server/router.rs
@@ -0,0 +1,64 @@
+//! This module provides the functionality to handle different routes of the `websurfx`
+//! meta search engine website and provide appropriate response to each route/page
+//! when requested.
+
+use crate::{
+    config::parser::Config,
+    handler::paths::{file_path, FileType},
+};
+use actix_web::{get, web, HttpRequest, HttpResponse};
+use handlebars::Handlebars;
+use std::fs::read_to_string;
+
+/// Handles the route of index page or main page of the `websurfx` meta search engine website.
+#[get("/")]
+pub async fn index(
+    hbs: web::Data<Handlebars<'_>>,
+    config: web::Data<Config>,
+) -> Result<HttpResponse, Box<dyn std::error::Error>> {
+    let page_content: String = hbs.render("index", &config.style).unwrap();
+    Ok(HttpResponse::Ok().body(page_content))
+}
+
+/// Handles the route of any other accessed route/page which is not provided by the
+/// website essentially the 404 error page.
+pub async fn not_found(
+    hbs: web::Data<Handlebars<'_>>,
+    config: web::Data<Config>,
+) -> Result<HttpResponse, Box<dyn std::error::Error>> {
+    let page_content: String = hbs.render("404", &config.style)?;
+
+    Ok(HttpResponse::Ok()
+        .content_type("text/html; charset=utf-8")
+        .body(page_content))
+}
+
+/// Handles the route of robots.txt page of the `websurfx` meta search engine website.
+#[get("/robots.txt")]
+pub async fn robots_data(_req: HttpRequest) -> Result<HttpResponse, Box<dyn std::error::Error>> {
+    let page_content: String =
+        read_to_string(format!("{}/robots.txt", file_path(FileType::Theme)?))?;
+    Ok(HttpResponse::Ok()
+        .content_type("text/plain; charset=ascii")
+        .body(page_content))
+}
+
+/// Handles the route of about page of the `websurfx` meta search engine website.
+#[get("/about")]
+pub async fn about(
+    hbs: web::Data<Handlebars<'_>>,
+    config: web::Data<Config>,
+) -> Result<HttpResponse, Box<dyn std::error::Error>> {
+    let page_content: String = hbs.render("about", &config.style)?;
+    Ok(HttpResponse::Ok().body(page_content))
+}
+
+/// Handles the route of settings page of the `websurfx` meta search engine website.
+#[get("/settings")]
+pub async fn settings(
+    hbs: web::Data<Handlebars<'_>>,
+    config: web::Data<Config>,
+) -> Result<HttpResponse, Box<dyn std::error::Error>> {
+    let page_content: String = hbs.render("settings", &config.style)?;
+    Ok(HttpResponse::Ok().body(page_content))
+}
diff --git a/src/server/routes/mod.rs b/src/server/routes/mod.rs
new file mode 100644
index 0000000..6bc5750
--- /dev/null
+++ b/src/server/routes/mod.rs
@@ -0,0 +1,3 @@
+//! This module provides modules to handle various routes in the search engine website.
+
+pub mod search;
diff --git a/src/server/routes.rs b/src/server/routes/search.rs
similarity index 84%
rename from src/server/routes.rs
rename to src/server/routes/search.rs
index 3d69e78..254c038 100644
--- a/src/server/routes.rs
+++ b/src/server/routes/search.rs
@@ -1,23 +1,20 @@
-//! This module provides the functionality to handle different routes of the `websurfx`
-//! meta search engine website and provide appropriate response to each route/page
-//! when requested.
-
-use std::{
-    fs::{read_to_string, File},
-    io::{BufRead, BufReader, Read},
-};
+//! This module handles the search route of the search engine website.
 
 use crate::{
     cache::cacher::RedisCache,
     config::parser::Config,
-    engines::engine_models::EngineHandler,
     handler::paths::{file_path, FileType},
-    results::{aggregation_models::SearchResults, aggregator::aggregate},
+    models::{aggregation_models::SearchResults, engine_models::EngineHandler},
+    results::aggregator::aggregate,
 };
 use actix_web::{get, web, HttpRequest, HttpResponse};
 use handlebars::Handlebars;
 use regex::Regex;
 use serde::Deserialize;
+use std::{
+    fs::{read_to_string, File},
+    io::{BufRead, BufReader, Read},
+};
 use tokio::join;
 
 // ---- Constants ----
@@ -25,17 +22,16 @@
 static REDIS_CACHE: async_once_cell::OnceCell<RedisCache> = async_once_cell::OnceCell::new();
 
 /// A named struct which deserializes all the user provided search parameters and stores them.
-///
-/// # Fields
-///
-/// * `q` - It stores the search parameter option `q` (or query in simple words)
-/// of the search url.
-/// * `page` - It stores the search parameter `page` (or pageno in simple words)
-/// of the search url.
 #[derive(Deserialize)]
-struct SearchParams {
+pub struct SearchParams {
+    /// It stores the search parameter option `q` (or query in simple words)
+    /// of the search url.
     q: Option<String>,
+    /// It stores the search parameter `page` (or pageno in simple words)
+    /// of the search url.
     page: Option<u32>,
+    /// It stores the search parameter `safesearch` (or safe search level in simple words) of the
+    /// search url.
     safesearch: Option<u8>,
 }
 
@@ -63,17 +59,14 @@ pub async fn not_found(
 }
 
 /// A named struct which is used to deserialize the cookies fetched from the client side.
-///
-/// # Fields
-///
-/// * `theme` - It stores the theme name used in the website.
-/// * `colorscheme` - It stores the colorscheme name used for the website theme.
-/// * `engines` - It stores the user selected upstream search engines selected from the UI.
 #[allow(dead_code)]
 #[derive(Deserialize)]
 struct Cookie<'a> {
+    /// It stores the theme name used in the website.
     theme: &'a str,
+    /// It stores the colorscheme name used for the website theme.
     colorscheme: &'a str,
+    /// It stores the user selected upstream search engines selected from the UI.
     engines: Vec<&'a str>,
 }
 
@@ -174,8 +167,21 @@
     }
 }
 
-/// Fetches the results for a query and page.
-/// First checks the redis cache, if that fails it gets proper results
+/// Fetches the results for a query and page. It first checks the redis cache, if that
+/// fails it gets proper results by requesting from the upstream search engines.
+///
+/// # Arguments
+///
+/// * `url` - It takes the url of the current page that requested the search results for a
+/// particular search query.
+/// * `config` - It takes a parsed config struct.
+/// * `query` - It takes the user provided search query.
+/// * `page` - It takes the page number as u32 value.
+/// * `req` - It takes the `HttpRequest` struct as a value.
+///
+/// # Error
+///
+/// It returns the `SearchResults` struct if the search results could be successfully fetched from
+/// the cache or from the upstream search engines otherwise it returns an appropriate error.
 async fn results(
     url: String,
     config: &Config,
@@ -184,6 +190,7 @@
     query: &str,
     page: u32,
     req: HttpRequest,
     safe_search: u8,
 ) -> Result<SearchResults, Box<dyn std::error::Error>> {
+    // Initialize redis cache connection struct
     let mut redis_cache: RedisCache = REDIS_CACHE
         .get_or_init(async {
             // Initialize redis cache connection pool only once and store it in the heap.
        })
        .await
        .clone();
-    // fetch the cached results json.
     let cached_results_json: Result<String, Report<PoolError>> = redis_cache.clone().cached_json(&url).await;
@@ -223,7 +229,8 @@
             // UI and use that.
             let mut results: SearchResults = match req.cookie("appCookie") {
                 Some(cookie_value) => {
-                    let cookie_value: Cookie = serde_json::from_str(cookie_value.name_value().1)?;
+                    let cookie_value: Cookie<'_> =
+                        serde_json::from_str(cookie_value.name_value().1)?;
 
                     let engines: Vec<EngineHandler> = cookie_value
                         .engines
@@ -267,6 +274,8 @@
     }
 }
 
+/// A helper function which checks whether the search query contains any keywords which should be
+/// disallowed/allowed based on the regex based rules present in the blocklist and allowlist files.
 fn is_match_from_filter_list(
     file_path: &str,
     query: &str,
diff --git a/websurfx/config.lua b/websurfx/config.lua
index fb6c4fe..09b418d 100644
--- a/websurfx/config.lua
+++ b/websurfx/config.lua
@@ -10,6 +10,10 @@ production_use = false -- whether to use production mode or not (in other words
 -- if production_use is set to true
 -- There will be a random delay before sending the request to the search engines, this is to prevent DDoSing the upstream search engines from a large number of simultaneous requests.
 request_timeout = 30 -- timeout for the search requests sent to the upstream search engines to be fetched (value in seconds).
+rate_limiter = {
+    number_of_requests = 20, -- The number of requests that are allowed within a provided time limit.
+    time_limit = 3, -- The time limit (in seconds) within which the given number of requests should be accepted.
+}
 
 -- ### Search ###
 -- Filter results based on different levels. The levels provided are:
@@ -45,4 +49,7 @@ theme = "simple" -- the theme name which should be used for the website
 
 redis_url = "redis://127.0.0.1:8082" -- redis connection url address on which the client should connect on.
 
 -- ### Search Engines ###
-upstream_search_engines = { DuckDuckGo = true, Searx = false } -- select the upstream search engines from which the results should be fetched.
+upstream_search_engines = {
+    DuckDuckGo = true,
+    Searx = false,
+} -- select the upstream search engines from which the results should be fetched.
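On the parsing side, here is a self-contained sketch (not part of the diff; it uses only the `mlua` API that `Config::parse` already relies on) of how the new `rate_limiter` table travels from Lua into Rust: the table is read as a `HashMap<String, u8>`, so both keys must be present and fit in a `u8`, otherwise the `?` in `Config::parse` propagates an `mlua` error.

```rust
use std::collections::HashMap;

fn main() -> mlua::Result<()> {
    let lua = mlua::Lua::new();
    // The same table shape that the new config.lua block above defines.
    lua.load("rate_limiter = { number_of_requests = 20, time_limit = 3 }")
        .exec()?;
    // Mirrors `globals.get::<_, HashMap<String, u8>>("rate_limiter")?` in
    // src/config/parser.rs: a missing key or out-of-range value is an error.
    let rate_limiter: HashMap<String, u8> = lua.globals().get("rate_limiter")?;
    assert_eq!(rate_limiter["number_of_requests"], 20);
    assert_eq!(rate_limiter["time_limit"], 3);
    Ok(())
}
```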