+ Your search - {{{this.pageQuery}}} -
+ has been disallowed.
+
+
Dear user,
+
+ The query - {{{this.pageQuery}}} - has
+ been blacklisted via the server configuration and is hence disallowed by the
+ server, so no results can be displayed for your query.
+
+
+
+
+ {{else}} {{#if filtered}}
+
+
+
+ Your search - {{{this.pageQuery}}} -
+ has been filtered.
+
+
Dear user,
+
+ All the search results for this query matched content that has been configured to be
+ filtered out via the server configuration and have therefore been
+ completely filtered out.
+
+
+
+
+ {{else}}
+
+
Your search - {{{this.pageQuery}}} - did not match any documents.
+
Suggestions:
+
+
Make sure that all words are spelled correctly.
+
Try different keywords.
+
Try more general keywords.
+
+
+
+ {{/if}} {{/if}} {{/if}}
- {{/each}} {{else}}
-
-
Your search - {{{this.pageQuery}}} - did not match any documents.
-
Suggestions:
-
-
Make sure that all words are spelled correctly.
-
Try different keywords.
-
Try more general keywords.
-
-
+
+
+
- {{/if}}
-
-
-
-
-
diff --git a/src/bin/websurfx.rs b/src/bin/websurfx.rs
index 75d0b8d..bc8e7ce 100644
--- a/src/bin/websurfx.rs
+++ b/src/bin/websurfx.rs
@@ -3,9 +3,19 @@
//! This module contains the main function which handles the logging of the application to the
//! stdout and handles the command line arguments provided and launches the `websurfx` server.
+use mimalloc::MiMalloc;
use std::net::TcpListener;
use websurfx::{config::parser::Config, run};
+/// A dhat heap memory profiler
+#[cfg(feature = "dhat-heap")]
+#[global_allocator]
+static ALLOC: dhat::Alloc = dhat::Alloc;
+
+#[cfg(not(feature = "dhat-heap"))]
+#[global_allocator]
+static GLOBAL: MiMalloc = MiMalloc;
+
/// The function that launches the main server and registers all the routes of the website.
///
/// # Error
@@ -14,6 +24,10 @@ use websurfx::{config::parser::Config, run};
/// available for being used for other applications.
#[actix_web::main]
async fn main() -> std::io::Result<()> {
+ // A dhat heap profiler initialization.
+ #[cfg(feature = "dhat-heap")]
+ let _profiler = dhat::Profiler::new_heap();
+
// Initialize the parsed config file.
let config = Config::parse(false).unwrap();
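
As a rough, standalone sketch of how the two allocator attributes above cooperate (assuming the `mimalloc` and `dhat` crates plus a Cargo feature named `dhat-heap`, mirroring the diff): exactly one `#[global_allocator]` is compiled in, and the dhat profiler only has to stay alive for the duration of `main`.

```rust
use mimalloc::MiMalloc;

// With the `dhat-heap` feature enabled, allocations are routed through dhat for profiling.
#[cfg(feature = "dhat-heap")]
#[global_allocator]
static ALLOC: dhat::Alloc = dhat::Alloc;

// Otherwise MiMalloc serves as the global allocator, exactly as in the diff above.
#[cfg(not(feature = "dhat-heap"))]
#[global_allocator]
static GLOBAL: MiMalloc = MiMalloc;

fn main() {
    // The profiler writes its heap report when it is dropped at the end of main.
    #[cfg(feature = "dhat-heap")]
    let _profiler = dhat::Profiler::new_heap();

    // This allocation goes through whichever allocator was selected at compile time.
    let buffer = vec![0u8; 4096];
    println!("allocated {} bytes", buffer.len());
}
```
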
diff --git a/src/cache/cacher.rs b/src/cache/cacher.rs
index e268ac2..57351cd 100644
--- a/src/cache/cacher.rs
+++ b/src/cache/cacher.rs
@@ -1,14 +1,24 @@
//! This module provides the functionality to cache the search results fetched and aggregated
//! from the upstream search engines in a JSON format.
+use error_stack::Report;
+use futures::future::try_join_all;
use md5::compute;
-use redis::{Client, Commands, Connection};
+use redis::{aio::ConnectionManager, AsyncCommands, Client, RedisError};
+
+use super::error::PoolError;
/// A named struct which stores the redis Connection url address to which the client will
/// connect.
+#[derive(Clone)]
pub struct RedisCache {
- /// It stores the redis Connection url address.
- connection: Connection,
+ /// It stores a pool of connections ready to be used.
+ connection_pool: Vec<ConnectionManager>,
+ /// It stores the size of the connection pool (in other words the number of
+ /// connections that should be stored in the pool).
+ pool_size: u8,
+ /// It stores the index of which connection is being used at the moment.
+ current_connection: u8,
}
impl RedisCache {
@@ -16,11 +26,25 @@ impl RedisCache {
///
/// # Arguments
///
- /// * `redis_connection_url` - It stores the redis Connection url address.
- pub fn new(redis_connection_url: String) -> Result<Self, Box<dyn std::error::Error>> {
+ /// * `redis_connection_url` - It takes the redis Connection url address.
+ /// * `pool_size` - It takes the size of the connection pool (in other words the number of
+ /// connections that should be stored in the pool).
+ pub async fn new(
+ redis_connection_url: &str,
+ pool_size: u8,
+ ) -> Result<Self, Box<dyn std::error::Error>> {
let client = Client::open(redis_connection_url)?;
- let connection = client.get_connection()?;
- let redis_cache = RedisCache { connection };
+ let mut tasks: Vec<_> = Vec::new();
+
+ for _ in 0..pool_size {
+ tasks.push(client.get_tokio_connection_manager());
+ }
+
+ let redis_cache = RedisCache {
+ connection_pool: try_join_all(tasks).await?,
+ pool_size,
+ current_connection: Default::default(),
+ };
Ok(redis_cache)
}
@@ -29,7 +53,7 @@ impl RedisCache {
/// # Arguments
///
/// * `url` - It takes an url as string.
- fn hash_url(url: &str) -> String {
+ fn hash_url(&self, url: &str) -> String {
format!("{:?}", compute(url))
}
@@ -38,9 +62,42 @@ impl RedisCache {
/// # Arguments
///
/// * `url` - It takes an url as a string.
- pub fn cached_json(&mut self, url: &str) -> Result<String, Box<dyn std::error::Error>> {
- let hashed_url_string = Self::hash_url(url);
- Ok(self.connection.get(hashed_url_string)?)
+ pub async fn cached_json(&mut self, url: &str) -> Result<String, Report<PoolError>> {
+ self.current_connection = Default::default();
+ let hashed_url_string: &str = &self.hash_url(url);
+
+ let mut result: Result<String, RedisError> = self.connection_pool
+ [self.current_connection as usize]
+ .get(hashed_url_string)
+ .await;
+
+ // Check whether the current connection failed with a connection-dropped error. If it
+ // did, the next connection from the pool is used to retry the redis command and is
+ // checked in the same way. If a connection succeeds, or fails with any other kind of
+ // error, that result is returned; if every connection in the pool fails with a
+ // connection-dropped error, a custom pool-exhaustion error is returned instead.
+ loop {
+ match result {
+ Err(error) => match error.is_connection_dropped() {
+ true => {
+ self.current_connection += 1;
+ if self.current_connection == self.pool_size {
+ return Err(Report::new(
+ PoolError::PoolExhaustionWithConnectionDropError,
+ ));
+ }
+ result = self.connection_pool[self.current_connection as usize]
+ .get(hashed_url_string)
+ .await;
+ continue;
+ }
+ false => return Err(Report::new(PoolError::RedisError(error))),
+ },
+ Ok(res) => return Ok(res),
+ }
+ }
}
/// A function which caches the results by using the hashed `url` as the key and
@@ -51,21 +108,45 @@ impl RedisCache {
///
/// * `json_results` - It takes the json results string as an argument.
/// * `url` - It takes the url as a String.
- pub fn cache_results(
+ pub async fn cache_results(
&mut self,
- json_results: String,
+ json_results: &str,
url: &str,
- ) -> Result<(), Box<dyn std::error::Error>> {
- let hashed_url_string = Self::hash_url(url);
+ ) -> Result<(), Report<PoolError>> {
+ self.current_connection = Default::default();
+ let hashed_url_string: &str = &self.hash_url(url);
- // put results_json into cache
- self.connection.set(&hashed_url_string, json_results)?;
+ let mut result: Result<(), RedisError> = self.connection_pool
+ [self.current_connection as usize]
+ .set_ex(hashed_url_string, json_results, 60)
+ .await;
- // Set the TTL for the key to 60 seconds
- self.connection
- .expire::<String, u64>(hashed_url_string, 60)
- .unwrap();
-
- Ok(())
+ // Check whether the current connection failed with a connection-dropped error. If it
+ // did, the next connection from the pool is used to retry the redis command and is
+ // checked in the same way. If a connection succeeds, or fails with any other kind of
+ // error, that result is returned; if every connection in the pool fails with a
+ // connection-dropped error, a custom pool-exhaustion error is returned instead.
+ loop {
+ match result {
+ Err(error) => match error.is_connection_dropped() {
+ true => {
+ self.current_connection += 1;
+ if self.current_connection == self.pool_size {
+ return Err(Report::new(
+ PoolError::PoolExhaustionWithConnectionDropError,
+ ));
+ }
+ result = self.connection_pool[self.current_connection as usize]
+ .set_ex(hashed_url_string, json_results, 60)
+ .await;
+ continue;
+ }
+ false => return Err(Report::new(PoolError::RedisError(error))),
+ },
+ Ok(_) => return Ok(()),
+ }
+ }
}
}
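
The retry loop added to `cached_json` and `cache_results` is easier to see in isolation. Below is a simplified, synchronous sketch of the same failover idea with a hypothetical `Conn` type standing in for `redis::aio::ConnectionManager`; unlike the real code it treats every error as a dropped connection.

```rust
/// Hypothetical stand-in for a pooled connection (not the real redis ConnectionManager).
struct Conn {
    alive: bool,
}

impl Conn {
    fn get(&self, _key: &str) -> Result<String, &'static str> {
        if self.alive {
            Ok("cached json".to_owned())
        } else {
            Err("connection dropped")
        }
    }
}

/// Walk the pool until one connection answers; report exhaustion once every one has failed.
fn cached_json(pool: &[Conn], key: &str) -> Result<String, String> {
    let mut last_error = "empty pool";
    for conn in pool {
        match conn.get(key) {
            Ok(value) => return Ok(value),
            // Treat the failure as a dropped connection and move on to the next pool entry.
            Err(error) => last_error = error,
        }
    }
    Err(format!("pool exhausted: {last_error}"))
}

fn main() {
    let pool = [Conn { alive: false }, Conn { alive: true }];
    println!("{:?}", cached_json(&pool, "https://www.example.com"));
}
```
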
diff --git a/src/cache/error.rs b/src/cache/error.rs
new file mode 100644
index 0000000..efd87c9
--- /dev/null
+++ b/src/cache/error.rs
@@ -0,0 +1,40 @@
+//! This module provides the error enum to handle different errors associated while requesting data from
+//! the redis server using an async connection pool.
+use std::fmt;
+
+use redis::RedisError;
+
+/// A custom error type used for handling redis async pool associated errors.
+///
+/// This enum provides variants for two different categories of errors:
+/// * `RedisError` - This variant handles all errors related to `RedisError`,
+/// * `PoolExhaustionWithConnectionDropError` - This variant handles the error
+/// which occurs when all the connections in the connection pool return a connection
+/// dropped redis error.
+#[derive(Debug)]
+pub enum PoolError {
+ RedisError(RedisError),
+ PoolExhaustionWithConnectionDropError,
+}
+
+impl fmt::Display for PoolError {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ match self {
+ PoolError::RedisError(redis_error) => {
+ if let Some(detail) = redis_error.detail() {
+ write!(f, "{}", detail)
+ } else {
+ write!(f, "")
+ }
+ }
+ PoolError::PoolExhaustionWithConnectionDropError => {
+ write!(
+ f,
+ "Error all connections from the pool dropped with connection error"
+ )
+ }
+ }
+ }
+}
+
+impl error_stack::Context for PoolError {}
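
For context, this is roughly how a `Report<PoolError>` produced by the cacher can be built and printed with `error_stack`; the enum is re-declared locally without the `RedisError` variant so the sketch stays self-contained.

```rust
use std::fmt;

use error_stack::{Report, Result};

#[derive(Debug)]
enum PoolError {
    PoolExhaustionWithConnectionDropError,
}

impl fmt::Display for PoolError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(
            f,
            "all the connections from the pool dropped with a connection error"
        )
    }
}

impl error_stack::Context for PoolError {}

fn fetch_from_cache() -> Result<String, PoolError> {
    // Attach a printable hint; the underlying context is preserved in the report.
    Err(Report::new(PoolError::PoolExhaustionWithConnectionDropError)
        .attach_printable("tried every connection in the pool"))
}

fn main() {
    if let Err(report) = fetch_from_cache() {
        eprintln!("{report:?}");
    }
}
```
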
diff --git a/src/cache/mod.rs b/src/cache/mod.rs
index 82e3377..f40369f 100644
--- a/src/cache/mod.rs
+++ b/src/cache/mod.rs
@@ -2,3 +2,4 @@
//! results fetched and aggregated from the upstream search engines in a json format.
pub mod cacher;
+pub mod error;
diff --git a/src/config/parser.rs b/src/config/parser.rs
index ca53f1b..e3b3588 100644
--- a/src/config/parser.rs
+++ b/src/config/parser.rs
@@ -3,9 +3,9 @@
use crate::handler::paths::{file_path, FileType};
-use super::parser_models::Style;
+use super::parser_models::{AggregatorConfig, RateLimiter, Style};
use log::LevelFilter;
-use rlua::Lua;
+use mlua::Lua;
use std::{collections::HashMap, fs, thread::available_parallelism};
/// A named struct which stores the parsed config file options.
@@ -32,14 +32,11 @@ pub struct Config {
pub request_timeout: u8,
/// It stores the number of threads that the app will use to run.
pub threads: u8,
-}
-
-/// Configuration options for the aggregator.
-#[derive(Clone)]
-pub struct AggregatorConfig {
- /// It stores the option to whether enable or disable random delays between
- /// requests.
- pub random_delay: bool,
+ /// It stores configuration options for the ratelimiting middleware.
+ pub rate_limiter: RateLimiter,
+ /// It stores the level of safe search to be used for restricting content in the
+ /// search results.
+ pub safe_search: u8,
}
impl Config {
@@ -57,53 +54,70 @@ impl Config {
/// or an io error if the config.lua file doesn't exist, otherwise it returns a newly constructed
/// Config struct with all the parsed config options from the parsed config file.
pub fn parse(logging_initialized: bool) -> Result<Self, Box<dyn std::error::Error>> {
- Lua::new().context(|context| -> Result<Self, Box<dyn std::error::Error>> {
- let globals = context.globals();
+ let lua = Lua::new();
+ let globals = lua.globals();
- context
- .load(&fs::read_to_string(file_path(FileType::Config)?)?)
- .exec()?;
+ lua.load(&fs::read_to_string(file_path(FileType::Config)?)?)
+ .exec()?;
- let parsed_threads: u8 = globals.get::<_, u8>("threads")?;
+ let parsed_threads: u8 = globals.get::<_, u8>("threads")?;
- let debug: bool = globals.get::<_, bool>("debug")?;
- let logging:bool= globals.get::<_, bool>("logging")?;
+ let debug: bool = globals.get::<_, bool>("debug")?;
+ let logging: bool = globals.get::<_, bool>("logging")?;
- if !logging_initialized {
- set_logging_level(debug, logging);
+ if !logging_initialized {
+ set_logging_level(debug, logging);
+ }
+
+ let threads: u8 = if parsed_threads == 0 {
+ let total_num_of_threads: usize = available_parallelism()?.get() / 2;
+ log::error!(
+ "Config Error: The value of `threads` option should be a non zero positive integer"
+ );
+ log::error!("Falling back to using {} threads", total_num_of_threads);
+ total_num_of_threads as u8
+ } else {
+ parsed_threads
+ };
+
+ let rate_limiter = globals.get::<_, HashMap<String, u8>>("rate_limiter")?;
+
+ let parsed_safe_search: u8 = globals.get::<_, u8>("safe_search")?;
+ let safe_search: u8 = match parsed_safe_search {
+ 0..=4 => parsed_safe_search,
+ _ => {
+ log::error!("Config Error: The value of `safe_search` option should be a non zero positive integer from 0 to 4.");
+ log::error!("Falling back to using the value `1` for the option");
+ 1
}
+ };
- let threads: u8 = if parsed_threads == 0 {
- let total_num_of_threads: usize = available_parallelism()?.get() / 2;
- log::error!("Config Error: The value of `threads` option should be a non zero positive integer");
- log::error!("Falling back to using {} threads", total_num_of_threads);
- total_num_of_threads as u8
- } else {
- parsed_threads
- };
-
- Ok(Config {
- port: globals.get::<_, u16>("port")?,
- binding_ip: globals.get::<_, String>("binding_ip")?,
- style: Style::new(
- globals.get::<_, String>("theme")?,
- globals.get::<_, String>("colorscheme")?,
- ),
- redis_url: globals.get::<_, String>("redis_url")?,
- aggregator: AggregatorConfig {
- random_delay: globals.get::<_, bool>("production_use")?,
- },
- logging,
- debug,
- upstream_search_engines: globals
- .get::<_, HashMap>("upstream_search_engines")?
- .into_iter()
- .filter_map(|(key, value)| value.then_some(key))
- .filter_map(|engine| crate::engines::engine_models::EngineHandler::new(&engine))
- .collect(),
- request_timeout: globals.get::<_, u8>("request_timeout")?,
- threads,
- })
+ Ok(Config {
+ port: globals.get::<_, u16>("port")?,
+ binding_ip: globals.get::<_, String>("binding_ip")?,
+ style: Style::new(
+ globals.get::<_, String>("theme")?,
+ globals.get::<_, String>("colorscheme")?,
+ ),
+ redis_url: globals.get::<_, String>("redis_url")?,
+ aggregator: AggregatorConfig {
+ random_delay: globals.get::<_, bool>("production_use")?,
+ },
+ logging,
+ debug,
+ upstream_search_engines: globals
+ .get::<_, HashMap>("upstream_search_engines")?
+ .into_iter()
+ .filter_map(|(key, value)| value.then_some(key))
+ .filter_map(|engine| crate::engines::engine_models::EngineHandler::new(&engine))
+ .collect(),
+ request_timeout: globals.get::<_, u8>("request_timeout")?,
+ threads,
+ rate_limiter: RateLimiter {
+ number_of_requests: rate_limiter["number_of_requests"],
+ time_limit: rate_limiter["time_limit"],
+ },
+ safe_search,
})
}
}
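
A minimal sketch of the `mlua` pattern that replaces the old `rlua` context closure (assuming an `mlua` dependency with a Lua 5.x feature enabled): the config source is loaded into the interpreter, and typed values, including a whole table read back as a `HashMap`, are pulled out of the globals.

```rust
use std::collections::HashMap;

use mlua::Lua;

fn main() -> mlua::Result<()> {
    let lua = Lua::new();
    // Load a small in-memory config instead of reading config.lua from disk.
    lua.load(
        r#"
        threads = 10
        safe_search = 2
        rate_limiter = { number_of_requests = 20, time_limit = 3 }
        "#,
    )
    .exec()?;

    let globals = lua.globals();
    let threads: u8 = globals.get("threads")?;
    let safe_search: u8 = globals.get("safe_search")?;
    let rate_limiter: HashMap<String, u8> = globals.get("rate_limiter")?;

    println!(
        "threads={threads} safe_search={safe_search} requests={} window={}s",
        rate_limiter["number_of_requests"], rate_limiter["time_limit"]
    );
    Ok(())
}
```
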
diff --git a/src/config/parser_models.rs b/src/config/parser_models.rs
index aa0b86f..9dad348 100644
--- a/src/config/parser_models.rs
+++ b/src/config/parser_models.rs
@@ -12,7 +12,7 @@ use serde::{Deserialize, Serialize};
/// order to allow the deserializing the json back to struct in aggregate function in
/// aggregator.rs and create a new struct out of it and then serialize it back to json and pass
/// it to the template files.
-#[derive(Serialize, Deserialize, Clone)]
+#[derive(Serialize, Deserialize, Clone, Default)]
pub struct Style {
/// It stores the parsed theme option used to set a theme for the website.
pub theme: String,
@@ -33,3 +33,20 @@ impl Style {
Style { theme, colorscheme }
}
}
+
+/// Configuration options for the aggregator.
+#[derive(Clone)]
+pub struct AggregatorConfig {
+ /// It stores the option to whether enable or disable random delays between
+ /// requests.
+ pub random_delay: bool,
+}
+
+/// Configuration options for the rate limiter middleware.
+#[derive(Clone)]
+pub struct RateLimiter {
+ /// The number of requests that are allowed within the provided time limit.
+ pub number_of_requests: u8,
+ /// The time limit (in seconds) within which the allowed number of requests should be accepted.
+ pub time_limit: u8,
+}
diff --git a/src/engines/duckduckgo.rs b/src/engines/duckduckgo.rs
index 5b7a452..edca35a 100644
--- a/src/engines/duckduckgo.rs
+++ b/src/engines/duckduckgo.rs
@@ -4,14 +4,14 @@
use std::collections::HashMap;
-use reqwest::header::{HeaderMap, CONTENT_TYPE, COOKIE, REFERER, USER_AGENT};
+use reqwest::header::HeaderMap;
use scraper::{Html, Selector};
use crate::results::aggregation_models::SearchResult;
use super::engine_models::{EngineError, SearchEngine};
-use error_stack::{IntoReport, Report, Result, ResultExt};
+use error_stack::{Report, Result, ResultExt};
/// A new DuckDuckGo engine type defined in-order to implement the `SearchEngine` trait which allows to
/// reduce code duplication as well as allows to create vector of different search engines easily.
@@ -21,10 +21,11 @@ pub struct DuckDuckGo;
impl SearchEngine for DuckDuckGo {
async fn results(
&self,
- query: String,
+ query: &str,
page: u32,
- user_agent: String,
+ user_agent: &str,
request_timeout: u8,
+ _safe_search: u8,
) -> Result<HashMap<String, SearchResult>, EngineError> {
// Page number can be missing or empty string and so appropriate handling is required
// so that the upstream server receives a valid page number.
@@ -43,38 +44,19 @@ impl SearchEngine for DuckDuckGo {
};
// initializing HeaderMap and adding appropriate headers.
- let mut header_map = HeaderMap::new();
- header_map.insert(
- USER_AGENT,
- user_agent
- .parse()
- .into_report()
- .change_context(EngineError::UnexpectedError)?,
- );
- header_map.insert(
- REFERER,
- "https://google.com/"
- .parse()
- .into_report()
- .change_context(EngineError::UnexpectedError)?,
- );
- header_map.insert(
- CONTENT_TYPE,
- "application/x-www-form-urlencoded"
- .parse()
- .into_report()
- .change_context(EngineError::UnexpectedError)?,
- );
- header_map.insert(
- COOKIE,
- "kl=wt-wt"
- .parse()
- .into_report()
- .change_context(EngineError::UnexpectedError)?,
- );
+ let header_map = HeaderMap::try_from(&HashMap::from([
+ ("USER_AGENT".to_string(), user_agent.to_string()),
+ ("REFERER".to_string(), "https://google.com/".to_string()),
+ (
+ "CONTENT_TYPE".to_string(),
+ "application/x-www-form-urlencoded".to_string(),
+ ),
+ ("COOKIE".to_string(), "kl=wt-wt".to_string()),
+ ]))
+ .change_context(EngineError::UnexpectedError)?;
let document: Html = Html::parse_document(
- &DuckDuckGo::fetch_html_from_upstream(self, url, header_map, request_timeout).await?,
+ &DuckDuckGo::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
);
let no_result: Selector = Selector::parse(".no-results")
@@ -108,8 +90,7 @@ impl SearchEngine for DuckDuckGo {
.next()
.unwrap()
.inner_html()
- .trim()
- .to_string(),
+ .trim(),
format!(
"https://{}",
result
@@ -118,15 +99,15 @@ impl SearchEngine for DuckDuckGo {
.unwrap()
.inner_html()
.trim()
- ),
+ )
+ .as_str(),
result
.select(&result_desc)
.next()
.unwrap()
.inner_html()
- .trim()
- .to_string(),
- vec!["duckduckgo".to_string()],
+ .trim(),
+ &["duckduckgo"],
)
})
.map(|search_result| (search_result.url.clone(), search_result))
diff --git a/src/engines/engine_models.rs b/src/engines/engine_models.rs
index 2f28ee5..2bd50c6 100644
--- a/src/engines/engine_models.rs
+++ b/src/engines/engine_models.rs
@@ -2,7 +2,7 @@
//! the upstream search engines with the search query provided by the user.
use crate::results::aggregation_models::SearchResult;
-use error_stack::{IntoReport, Result, ResultExt};
+use error_stack::{Result, ResultExt};
use std::{collections::HashMap, fmt, time::Duration};
/// A custom error type used for handling engine-associated errors.
@@ -64,7 +64,7 @@ pub trait SearchEngine: Sync + Send {
/// otherwise it returns a custom `EngineError`.
async fn fetch_html_from_upstream(
&self,
- url: String,
+ url: &str,
header_map: reqwest::header::HeaderMap,
request_timeout: u8,
) -> Result<String, EngineError> {
@@ -75,11 +75,9 @@ pub trait SearchEngine: Sync + Send {
.headers(header_map) // add spoofed headers to emulate human behavior
.send()
.await
- .into_report()
.change_context(EngineError::RequestError)?
.text()
.await
- .into_report()
.change_context(EngineError::RequestError)?)
}
@@ -103,10 +101,11 @@ pub trait SearchEngine: Sync + Send {
/// or HeaderMap fails to initialize.
async fn results(
&self,
- query: String,
+ query: &str,
page: u32,
- user_agent: String,
+ user_agent: &str,
request_timeout: u8,
+ safe_search: u8,
) -> Result<HashMap<String, SearchResult>, EngineError>;
}
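
To make the signature change concrete, here is a hedged, dependency-free stub of an engine implementing a simplified, synchronous version of the trait: owned `String` parameters become `&str` borrows and the new `safe_search` level is threaded through to the upstream URL. `DummyEngine` and the trimmed-down `SearchResult` are stand-ins, not the crate's real types.

```rust
use std::collections::HashMap;

/// Trimmed-down stand-in for the crate's SearchResult.
#[derive(Debug, Clone)]
struct SearchResult {
    title: String,
    url: String,
}

/// Simplified, synchronous version of the trait's `results` method after the change.
trait SearchEngine {
    fn results(
        &self,
        query: &str,
        page: u32,
        user_agent: &str,
        request_timeout: u8,
        safe_search: u8,
    ) -> HashMap<String, SearchResult>;
}

struct DummyEngine;

impl SearchEngine for DummyEngine {
    fn results(
        &self,
        query: &str,
        page: u32,
        _user_agent: &str,
        _request_timeout: u8,
        safe_search: u8,
    ) -> HashMap<String, SearchResult> {
        // A real engine would fetch and scrape the upstream page here; the stub echoes its inputs.
        let url =
            format!("https://example.com/search?q={query}&page={page}&safesearch={safe_search}");
        HashMap::from([(
            url.clone(),
            SearchResult {
                title: query.to_owned(),
                url,
            },
        )])
    }
}

fn main() {
    let engine = DummyEngine;
    println!("{:#?}", engine.results("rust", 1, "some-user-agent", 30, 2));
}
```
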
diff --git a/src/engines/searx.rs b/src/engines/searx.rs
index 3f261ad..170364c 100644
--- a/src/engines/searx.rs
+++ b/src/engines/searx.rs
@@ -2,14 +2,14 @@
//! by querying the upstream searx search engine instance with user provided query and with a page
//! number if provided.
-use reqwest::header::{HeaderMap, CONTENT_TYPE, COOKIE, REFERER, USER_AGENT};
+use reqwest::header::HeaderMap;
use scraper::{Html, Selector};
use std::collections::HashMap;
use crate::results::aggregation_models::SearchResult;
use super::engine_models::{EngineError, SearchEngine};
-use error_stack::{IntoReport, Report, Result, ResultExt};
+use error_stack::{Report, Result, ResultExt};
/// A new Searx engine type defined in-order to implement the `SearchEngine` trait which allows to
/// reduce code duplication as well as allows to create vector of different search engines easily.
@@ -19,45 +19,38 @@ pub struct Searx;
impl SearchEngine for Searx {
async fn results(
&self,
- query: String,
+ query: &str,
page: u32,
- user_agent: String,
+ user_agent: &str,
request_timeout: u8,
+ mut safe_search: u8,
) -> Result<HashMap<String, SearchResult>, EngineError> {
// Page number can be missing or empty string and so appropriate handling is required
// so that the upstream server receives a valid page number.
+ if safe_search == 3 {
+ safe_search = 2;
+ };
+
let url: String = match page {
- 0 | 1 => format!("https://searx.work/search?q={query}&pageno=1"),
- _ => format!("https://searx.work/search?q={query}&pageno={page}"),
+ 0 | 1 => {
+ format!("https://searx.work/search?q={query}&pageno=1&safesearch={safe_search}")
+ }
+ _ => format!(
+ "https://searx.work/search?q={query}&pageno={page}&safesearch={safe_search}"
+ ),
};
// initializing headers and adding appropriate headers.
- let mut header_map = HeaderMap::new();
- header_map.insert(
- USER_AGENT,
- user_agent
- .parse()
- .into_report()
- .change_context(EngineError::UnexpectedError)?,
- );
- header_map.insert(
- REFERER,
- "https://google.com/"
- .parse()
- .into_report()
- .change_context(EngineError::UnexpectedError)?,
- );
- header_map.insert(
- CONTENT_TYPE,
- "application/x-www-form-urlencoded"
- .parse()
- .into_report()
- .change_context(EngineError::UnexpectedError)?,
- );
- header_map.insert(COOKIE, "categories=general; language=auto; locale=en; autocomplete=duckduckgo; image_proxy=1; method=POST; safesearch=2; theme=simple; results_on_new_tab=1; doi_resolver=oadoi.org; simple_style=auto; center_alignment=1; query_in_title=1; infinite_scroll=0; disabled_engines=; enabled_engines=\"archive is__general\\054yep__general\\054curlie__general\\054currency__general\\054ddg definitions__general\\054wikidata__general\\054duckduckgo__general\\054tineye__general\\054lingva__general\\054startpage__general\\054yahoo__general\\054wiby__general\\054marginalia__general\\054alexandria__general\\054wikibooks__general\\054wikiquote__general\\054wikisource__general\\054wikiversity__general\\054wikivoyage__general\\054dictzone__general\\054seznam__general\\054mojeek__general\\054naver__general\\054wikimini__general\\054brave__general\\054petalsearch__general\\054goo__general\"; disabled_plugins=; enabled_plugins=\"searx.plugins.hostname_replace\\054searx.plugins.oa_doi_rewrite\\054searx.plugins.vim_hotkeys\"; tokens=; maintab=on; enginetab=on".parse().into_report().change_context(EngineError::UnexpectedError)?);
+ let header_map = HeaderMap::try_from(&HashMap::from([
+ ("USER_AGENT".to_string(), user_agent.to_string()),
+ ("REFERER".to_string(), "https://google.com/".to_string()),
+ ("CONTENT_TYPE".to_string(), "application/x-www-form-urlencoded".to_string()),
+ ("COOKIE".to_string(), "categories=general; language=auto; locale=en; autocomplete=duckduckgo; image_proxy=1; method=POST; safesearch=2; theme=simple; results_on_new_tab=1; doi_resolver=oadoi.org; simple_style=auto; center_alignment=1; query_in_title=1; infinite_scroll=0; disabled_engines=; enabled_engines=\"archive is__general\\054yep__general\\054curlie__general\\054currency__general\\054ddg definitions__general\\054wikidata__general\\054duckduckgo__general\\054tineye__general\\054lingva__general\\054startpage__general\\054yahoo__general\\054wiby__general\\054marginalia__general\\054alexandria__general\\054wikibooks__general\\054wikiquote__general\\054wikisource__general\\054wikiversity__general\\054wikivoyage__general\\054dictzone__general\\054seznam__general\\054mojeek__general\\054naver__general\\054wikimini__general\\054brave__general\\054petalsearch__general\\054goo__general\"; disabled_plugins=; enabled_plugins=\"searx.plugins.hostname_replace\\054searx.plugins.oa_doi_rewrite\\054searx.plugins.vim_hotkeys\"; tokens=; maintab=on; enginetab=on".to_string())
+ ]))
+ .change_context(EngineError::UnexpectedError)?;
let document: Html = Html::parse_document(
- &Searx::fetch_html_from_upstream(self, url, header_map, request_timeout).await?,
+ &Searx::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
);
let no_result: Selector = Selector::parse("#urls>.dialog-error>p")
@@ -98,24 +91,21 @@ impl SearchEngine for Searx {
.next()
.unwrap()
.inner_html()
- .trim()
- .to_string(),
+ .trim(),
result
.select(&result_url)
.next()
.unwrap()
.value()
.attr("href")
- .unwrap()
- .to_string(),
+ .unwrap(),
result
.select(&result_desc)
.next()
.unwrap()
.inner_html()
- .trim()
- .to_string(),
- vec!["searx".to_string()],
+ .trim(),
+ &["searx"],
)
})
.map(|search_result| (search_result.url.clone(), search_result))
diff --git a/src/handler/paths.rs b/src/handler/paths.rs
index 44228d8..9ea5fff 100644
--- a/src/handler/paths.rs
+++ b/src/handler/paths.rs
@@ -4,6 +4,7 @@
use std::collections::HashMap;
use std::io::Error;
use std::path::Path;
+use std::sync::OnceLock;
// ------- Constants --------
/// The constant holding the name of the theme folder.
@@ -31,57 +32,7 @@ pub enum FileType {
}
/// A static variable which stores the different filesystem paths for various file/folder types.
-static FILE_PATHS_FOR_DIFF_FILE_TYPES: once_cell::sync::Lazy<HashMap<FileType, Vec<String>>> =
- once_cell::sync::Lazy::new(|| {
- HashMap::from([
- (
- FileType::Config,
- vec![
- format!(
- "{}/.config/{}/{}",
- std::env::var("HOME").unwrap(),
- COMMON_DIRECTORY_NAME,
- CONFIG_FILE_NAME
- ),
- format!("/etc/xdg/{}/{}", COMMON_DIRECTORY_NAME, CONFIG_FILE_NAME),
- format!("./{}/{}", COMMON_DIRECTORY_NAME, CONFIG_FILE_NAME),
- ],
- ),
- (
- FileType::Theme,
- vec![
- format!("/opt/websurfx/{}/", PUBLIC_DIRECTORY_NAME),
- format!("./{}/", PUBLIC_DIRECTORY_NAME),
- ],
- ),
- (
- FileType::AllowList,
- vec![
- format!(
- "{}/.config/{}/{}",
- std::env::var("HOME").unwrap(),
- COMMON_DIRECTORY_NAME,
- ALLOWLIST_FILE_NAME
- ),
- format!("/etc/xdg/{}/{}", COMMON_DIRECTORY_NAME, ALLOWLIST_FILE_NAME),
- format!("./{}/{}", COMMON_DIRECTORY_NAME, ALLOWLIST_FILE_NAME),
- ],
- ),
- (
- FileType::BlockList,
- vec![
- format!(
- "{}/.config/{}/{}",
- std::env::var("HOME").unwrap(),
- COMMON_DIRECTORY_NAME,
- BLOCKLIST_FILE_NAME
- ),
- format!("/etc/xdg/{}/{}", COMMON_DIRECTORY_NAME, BLOCKLIST_FILE_NAME),
- format!("./{}/{}", COMMON_DIRECTORY_NAME, BLOCKLIST_FILE_NAME),
- ],
- ),
- ])
- });
+static FILE_PATHS_FOR_DIFF_FILE_TYPES: OnceLock<HashMap<FileType, Vec<String>>> = OnceLock::new();
/// A function which returns an appropriate path for the provided file type by checking whether
/// a path for the given file type exists.
@@ -99,11 +50,64 @@ static FILE_PATHS_FOR_DIFF_FILE_TYPES: once_cell::sync::Lazy<HashMap<FileType, Vec<String>>> =
-pub fn file_path(file_type: FileType) -> Result<String, Error> {
- let file_path = FILE_PATHS_FOR_DIFF_FILE_TYPES.get(&file_type).unwrap();
+pub fn file_path(file_type: FileType) -> Result<&'static str, Error> {
+ let file_path: &Vec<String> = FILE_PATHS_FOR_DIFF_FILE_TYPES
+ .get_or_init(|| {
+ HashMap::from([
+ (
+ FileType::Config,
+ vec![
+ format!(
+ "{}/.config/{}/{}",
+ std::env::var("HOME").unwrap(),
+ COMMON_DIRECTORY_NAME,
+ CONFIG_FILE_NAME
+ ),
+ format!("/etc/xdg/{}/{}", COMMON_DIRECTORY_NAME, CONFIG_FILE_NAME),
+ format!("./{}/{}", COMMON_DIRECTORY_NAME, CONFIG_FILE_NAME),
+ ],
+ ),
+ (
+ FileType::Theme,
+ vec![
+ format!("/opt/websurfx/{}/", PUBLIC_DIRECTORY_NAME),
+ format!("./{}/", PUBLIC_DIRECTORY_NAME),
+ ],
+ ),
+ (
+ FileType::AllowList,
+ vec![
+ format!(
+ "{}/.config/{}/{}",
+ std::env::var("HOME").unwrap(),
+ COMMON_DIRECTORY_NAME,
+ ALLOWLIST_FILE_NAME
+ ),
+ format!("/etc/xdg/{}/{}", COMMON_DIRECTORY_NAME, ALLOWLIST_FILE_NAME),
+ format!("./{}/{}", COMMON_DIRECTORY_NAME, ALLOWLIST_FILE_NAME),
+ ],
+ ),
+ (
+ FileType::BlockList,
+ vec![
+ format!(
+ "{}/.config/{}/{}",
+ std::env::var("HOME").unwrap(),
+ COMMON_DIRECTORY_NAME,
+ BLOCKLIST_FILE_NAME
+ ),
+ format!("/etc/xdg/{}/{}", COMMON_DIRECTORY_NAME, BLOCKLIST_FILE_NAME),
+ format!("./{}/{}", COMMON_DIRECTORY_NAME, BLOCKLIST_FILE_NAME),
+ ],
+ ),
+ ])
+ })
+ .get(&file_type)
+ .unwrap();
+
for (idx, _) in file_path.iter().enumerate() {
if Path::new(file_path[idx].as_str()).exists() {
- return Ok(file_path[idx].clone());
+ return Ok(std::mem::take(&mut &*file_path[idx]));
}
}
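
The `OnceLock` rewrite above is the standard lazily-initialized-static pattern from `std`. A small self-contained sketch of the same idea, using a hypothetical two-entry map rather than the real path table:

```rust
use std::collections::HashMap;
use std::sync::OnceLock;

// Built on first access; every later call reuses the same map.
static FILE_PATHS: OnceLock<HashMap<&'static str, Vec<String>>> = OnceLock::new();

fn candidate_paths(kind: &str) -> &'static [String] {
    FILE_PATHS
        .get_or_init(|| {
            HashMap::from([
                (
                    "config",
                    vec![
                        "/etc/xdg/websurfx/config.lua".to_owned(),
                        "./websurfx/config.lua".to_owned(),
                    ],
                ),
                ("theme", vec!["./public/".to_owned()]),
            ])
        })
        .get(kind)
        .map(Vec::as_slice)
        .unwrap_or(&[])
}

fn main() {
    // The initialization closure runs exactly once, even across these two calls.
    println!("{:?}", candidate_paths("config"));
    println!("{:?}", candidate_paths("theme"));
}
```
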
diff --git a/src/lib.rs b/src/lib.rs
index 52fb56d..b33ace3 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -18,6 +18,7 @@ use crate::server::routes;
use actix_cors::Cors;
use actix_files as fs;
+use actix_governor::{Governor, GovernorConfigBuilder};
use actix_web::{dev::Server, http::header, middleware::Logger, web, App, HttpServer};
use config::parser::Config;
use handlebars::Handlebars;
@@ -46,7 +47,7 @@ use handler::paths::{file_path, FileType};
pub fn run(listener: TcpListener, config: Config) -> std::io::Result<Server> {
let mut handlebars: Handlebars<'_> = Handlebars::new();
- let public_folder_path: String = file_path(FileType::Theme)?;
+ let public_folder_path: &str = file_path(FileType::Theme)?;
handlebars
.register_templates_directory(".html", format!("{}/templates", public_folder_path))
@@ -68,10 +69,17 @@ pub fn run(listener: TcpListener, config: Config) -> std::io::Result {
]);
App::new()
+ .wrap(Logger::default()) // added logging middleware for logging.
.app_data(handlebars_ref.clone())
.app_data(web::Data::new(config.clone()))
.wrap(cors)
- .wrap(Logger::default()) // added logging middleware for logging.
+ .wrap(Governor::new(
+ &GovernorConfigBuilder::default()
+ .per_second(config.rate_limiter.time_limit as u64)
+ .burst_size(config.rate_limiter.number_of_requests as u32)
+ .finish()
+ .unwrap(),
+ ))
// Serve images and static files (css and js files).
.service(
fs::Files::new("/static", format!("{}/static", public_folder_path))
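
The governor layer registered above maps directly onto the new `rate_limiter` block in config.lua. A hedged, standalone sketch of the same wiring (assuming the `actix-web` and `actix-governor` crates), with the config values hard-coded:

```rust
use actix_governor::{Governor, GovernorConfigBuilder};
use actix_web::{web, App, HttpServer, Responder};

async fn index() -> impl Responder {
    "hello"
}

#[actix_web::main]
async fn main() -> std::io::Result<()> {
    let governor_conf = GovernorConfigBuilder::default()
        .per_second(3) // replenish one request every 3 seconds (config.lua `time_limit`)...
        .burst_size(20) // ...and allow bursts of up to 20 requests (`number_of_requests`).
        .finish()
        .unwrap();

    HttpServer::new(move || {
        App::new()
            .wrap(Governor::new(&governor_conf))
            .route("/", web::get().to(index))
    })
    .bind(("127.0.0.1", 8080))?
    .run()
    .await
}
```
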
diff --git a/src/results/aggregation_models.rs b/src/results/aggregation_models.rs
index 76d896d..495572a 100644
--- a/src/results/aggregation_models.rs
+++ b/src/results/aggregation_models.rs
@@ -2,6 +2,7 @@
//! data scraped from the upstream search engines.
use serde::{Deserialize, Serialize};
+use smallvec::SmallVec;
use crate::{config::parser_models::Style, engines::engine_models::EngineError};
@@ -19,7 +20,7 @@ pub struct SearchResult {
/// The description of the search result.
pub description: String,
/// The names of the upstream engines from which these results were provided.
- pub engine: Vec<String>,
+ pub engine: SmallVec<[String; 0]>,
}
impl SearchResult {
@@ -32,12 +33,12 @@ impl SearchResult {
/// (href url in html in simple words).
/// * `description` - The description of the search result.
/// * `engine` - The names of the upstream engines from which these results were provided.
- pub fn new(title: String, url: String, description: String, engine: Vec<String>) -> Self {
+ pub fn new(title: &str, url: &str, description: &str, engine: &[&str]) -> Self {
SearchResult {
- title,
- url,
- description,
- engine,
+ title: title.to_owned(),
+ url: url.to_owned(),
+ description: description.to_owned(),
+ engine: engine.iter().map(|name| name.to_string()).collect(),
}
}
@@ -46,8 +47,8 @@ impl SearchResult {
/// # Arguments
///
/// * `engine` - Takes an engine name provided as a String.
- pub fn add_engines(&mut self, engine: String) {
- self.engine.push(engine)
+ pub fn add_engines(&mut self, engine: &str) {
+ self.engine.push(engine.to_owned())
}
/// A function which returns the engine name stored from the struct as a string.
@@ -55,13 +56,13 @@ impl SearchResult {
/// # Returns
///
/// An engine name stored as a string from the struct.
- pub fn engine(self) -> String {
- self.engine.get(0).unwrap().to_string()
+ pub fn engine(&mut self) -> String {
+ std::mem::take(&mut self.engine[0])
}
}
/// A named struct that stores the error info related to the upstream search engines.
-#[derive(Serialize, Deserialize)]
+#[derive(Serialize, Deserialize, Clone)]
pub struct EngineErrorInfo {
/// It stores the error type which occured while fetching the result from a particular search
/// engine.
@@ -81,18 +82,18 @@ impl EngineErrorInfo {
/// * `error` - It takes the error type which occured while fetching the result from a particular
/// search engine.
/// * `engine` - It takes the name of the engine that failed to provide the requested search results.
- pub fn new(error: &EngineError, engine: String) -> Self {
+ pub fn new(error: &EngineError, engine: &str) -> Self {
Self {
error: match error {
- EngineError::RequestError => String::from("RequestError"),
- EngineError::EmptyResultSet => String::from("EmptyResultSet"),
- EngineError::UnexpectedError => String::from("UnexpectedError"),
+ EngineError::RequestError => "RequestError".to_owned(),
+ EngineError::EmptyResultSet => "EmptyResultSet".to_owned(),
+ EngineError::UnexpectedError => "UnexpectedError".to_owned(),
},
- engine,
+ engine: engine.to_owned(),
severity_color: match error {
- EngineError::RequestError => String::from("green"),
- EngineError::EmptyResultSet => String::from("blue"),
- EngineError::UnexpectedError => String::from("red"),
+ EngineError::RequestError => "green".to_owned(),
+ EngineError::EmptyResultSet => "blue".to_owned(),
+ EngineError::UnexpectedError => "red".to_owned(),
},
}
}
@@ -101,7 +102,7 @@ impl EngineErrorInfo {
/// A named struct to store, serialize, and deserialize all the search results scraped and
/// aggregated from the upstream search engines into a vector of
/// `SearchResult` structs.
-#[derive(Serialize, Deserialize)]
+#[derive(Serialize, Deserialize, Default)]
#[serde(rename_all = "camelCase")]
pub struct SearchResults {
/// Stores the individual serializable `SearchResult` struct into a vector of
@@ -113,6 +114,14 @@ pub struct SearchResults {
/// Stores the information on which engines failed with their engine name
/// and the type of error that caused it.
pub engine_errors_info: Vec<EngineErrorInfo>,
+ /// Stores the flag which indicates that the search query was disallowed because the
+ /// safe search level was set to 4 and the query was present in the `Blocklist` file.
+ pub disallowed: bool,
+ /// Stores the flag which indicates that the search query was filtered because the
+ /// safe search level was set to 3 and the query was present in the `Blocklist` file.
+ pub filtered: bool,
}
impl SearchResults {
@@ -126,21 +135,48 @@ impl SearchResults {
/// the search url.
/// * `empty_result_set` - Takes a boolean which indicates that no engines gave a result for the
/// given search query.
pub fn new(
results: Vec<SearchResult>,
- page_query: String,
- engine_errors_info: Vec<EngineErrorInfo>,
+ page_query: &str,
+ engine_errors_info: &[EngineErrorInfo],
) -> Self {
- SearchResults {
+ Self {
results,
- page_query,
- style: Style::new("".to_string(), "".to_string()),
- engine_errors_info,
+ page_query: page_query.to_owned(),
+ style: Style::default(),
+ engine_errors_info: engine_errors_info.to_owned(),
+ disallowed: Default::default(),
+ filtered: Default::default(),
}
}
/// A setter function to add website style to the return search results.
- pub fn add_style(&mut self, style: Style) {
- self.style = style;
+ pub fn add_style(&mut self, style: &Style) {
+ self.style = style.clone();
+ }
+
+ /// A setter function that sets disallowed to true.
+ pub fn set_disallowed(&mut self) {
+ self.disallowed = true;
+ }
+
+ /// A setter function to set the current page search query.
+ pub fn set_page_query(&mut self, page: &str) {
+ self.page_query = page.to_owned();
+ }
+
+ /// A setter function that sets the filtered to true.
+ pub fn set_filtered(&mut self) {
+ self.filtered = true;
+ }
+
+ /// A getter function that gets the value of `engine_errors_info`.
+ pub fn engine_errors_info(&mut self) -> Vec<EngineErrorInfo> {
+ std::mem::take(&mut self.engine_errors_info)
+ }
+ /// A getter function that gets the value of `results`.
+ pub fn results(&mut self) -> Vec<SearchResult> {
+ self.results.clone()
}
}
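
The new `engine_errors_info()` getter leans on `std::mem::take`, which hands the vector to the caller and leaves an empty default in its place, avoiding a clone. A tiny illustration with a simplified stand-in struct:

```rust
#[derive(Default, Debug)]
struct Results {
    engine_errors_info: Vec<String>,
}

impl Results {
    /// Move the accumulated errors out, leaving an empty Vec behind.
    fn engine_errors_info(&mut self) -> Vec<String> {
        std::mem::take(&mut self.engine_errors_info)
    }
}

fn main() {
    let mut results = Results {
        engine_errors_info: vec!["RequestError".to_owned()],
    };
    let taken = results.engine_errors_info();
    // `taken` now owns the data and the struct field is empty.
    println!(
        "taken = {taken:?}, remaining = {:?}",
        results.engine_errors_info
    );
}
```
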
diff --git a/src/results/aggregator.rs b/src/results/aggregator.rs
index 3f06ecb..734a65f 100644
--- a/src/results/aggregator.rs
+++ b/src/results/aggregator.rs
@@ -64,14 +64,15 @@ type FutureVec = Vec<JoinHandle<Result<HashMap<String, SearchResult>, Report<EngineError>>>>;
- upstream_search_engines: Vec<EngineHandler>,
+ upstream_search_engines: &[EngineHandler],
request_timeout: u8,
+ safe_search: u8,
+ ) -> Result<SearchResults, Box<dyn std::error::Error>> {
- let user_agent: String = random_user_agent();
+ let user_agent: &str = random_user_agent();
// Add a random delay before making the request.
if random_delay || !debug {
@@ -80,19 +81,24 @@ pub async fn aggregate(
tokio::time::sleep(Duration::from_secs(delay_secs)).await;
}
- let mut names: Vec<&str> = vec![];
+ let mut names: Vec<&str> = Vec::with_capacity(0);
// create tasks for upstream result fetching
let mut tasks: FutureVec = FutureVec::new();
for engine_handler in upstream_search_engines {
- let (name, search_engine) = engine_handler.into_name_engine();
+ let (name, search_engine) = engine_handler.to_owned().into_name_engine();
names.push(name);
- let query: String = query.clone();
- let user_agent: String = user_agent.clone();
+ let query: String = query.to_owned();
tasks.push(tokio::spawn(async move {
search_engine
- .results(query, page, user_agent.clone(), request_timeout)
+ .results(
+ &query,
+ page,
+ user_agent.clone(),
+ request_timeout,
+ safe_search,
+ )
.await
}));
}
@@ -110,7 +116,7 @@ pub async fn aggregate(
let mut result_map: HashMap<String, SearchResult> = HashMap::new();
let mut engine_errors_info: Vec<EngineErrorInfo> = Vec::new();
- let mut handle_error = |error: Report<EngineError>, engine_name: String| {
+ let mut handle_error = |error: &Report, engine_name: &'static str| {
log::error!("Engine Error: {:?}", error);
engine_errors_info.push(EngineErrorInfo::new(
error.downcast_ref::<EngineError>().unwrap(),
@@ -120,7 +126,7 @@ pub async fn aggregate(
for _ in 0..responses.len() {
let response = responses.pop().unwrap();
- let engine = names.pop().unwrap().to_string();
+ let engine = names.pop().unwrap();
if result_map.is_empty() {
match response {
@@ -128,7 +134,7 @@ pub async fn aggregate(
result_map = results.clone();
}
Err(error) => {
- handle_error(error, engine);
+ handle_error(&error, engine);
}
}
continue;
@@ -140,39 +146,37 @@ pub async fn aggregate(
result_map
.entry(key)
.and_modify(|result| {
- result.add_engines(engine.clone());
+ result.add_engines(engine);
})
.or_insert_with(|| -> SearchResult { value });
});
}
Err(error) => {
- handle_error(error, engine);
+ handle_error(&error, engine);
}
}
}
- let mut blacklist_map: HashMap<String, SearchResult> = HashMap::new();
- filter_with_lists(
- &mut result_map,
- &mut blacklist_map,
- &file_path(FileType::BlockList)?,
- )?;
+ if safe_search >= 3 {
+ let mut blacklist_map: HashMap<String, SearchResult> = HashMap::new();
+ filter_with_lists(
+ &mut result_map,
+ &mut blacklist_map,
+ file_path(FileType::BlockList)?,
+ )?;
- filter_with_lists(
- &mut blacklist_map,
- &mut result_map,
- &file_path(FileType::AllowList)?,
- )?;
+ filter_with_lists(
+ &mut blacklist_map,
+ &mut result_map,
+ file_path(FileType::AllowList)?,
+ )?;
- drop(blacklist_map);
+ drop(blacklist_map);
+ }
let results: Vec<SearchResult> = result_map.into_values().collect();
- Ok(SearchResults::new(
- results,
- query.to_string(),
- engine_errors_info,
- ))
+ Ok(SearchResults::new(results, query, &engine_errors_info))
}
/// Filters a map of search results using a list of regex patterns.
@@ -194,7 +198,7 @@ pub fn filter_with_lists(
let mut reader = BufReader::new(File::open(file_path)?);
for line in reader.by_ref().lines() {
- let re = Regex::new(&line?)?;
+ let re = Regex::new(line?.trim())?;
// Iterate over each search result in the map and check if it matches the regex pattern
for (url, search_result) in map_to_be_filtered.clone().into_iter() {
@@ -203,7 +207,10 @@ pub fn filter_with_lists(
|| re.is_match(&search_result.description.to_lowercase())
{
// If the search result matches the regex pattern, move it from the original map to the resultant map
- resultant_map.insert(url.clone(), map_to_be_filtered.remove(&url).unwrap());
+ resultant_map.insert(
+ url.to_owned(),
+ map_to_be_filtered.remove(&url.to_owned()).unwrap(),
+ );
}
}
}
@@ -214,6 +221,7 @@ pub fn filter_with_lists(
#[cfg(test)]
mod tests {
use super::*;
+ use smallvec::smallvec;
use std::collections::HashMap;
use std::io::Write;
use tempfile::NamedTempFile;
@@ -223,22 +231,22 @@ mod tests {
// Create a map of search results to filter
let mut map_to_be_filtered = HashMap::new();
map_to_be_filtered.insert(
- "https://www.example.com".to_string(),
+ "https://www.example.com".to_owned(),
SearchResult {
- title: "Example Domain".to_string(),
- url: "https://www.example.com".to_string(),
+ title: "Example Domain".to_owned(),
+ url: "https://www.example.com".to_owned(),
description: "This domain is for use in illustrative examples in documents."
- .to_string(),
- engine: vec!["Google".to_string(), "Bing".to_string()],
+ .to_owned(),
+ engine: smallvec!["Google".to_owned(), "Bing".to_owned()],
},
);
map_to_be_filtered.insert(
- "https://www.rust-lang.org/".to_string(),
+ "https://www.rust-lang.org/".to_owned(),
SearchResult {
- title: "Rust Programming Language".to_string(),
- url: "https://www.rust-lang.org/".to_string(),
- description: "A systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.".to_string(),
- engine: vec!["Google".to_string(), "DuckDuckGo".to_string()],
+ title: "Rust Programming Language".to_owned(),
+ url: "https://www.rust-lang.org/".to_owned(),
+ description: "A systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.".to_owned(),
+ engine: smallvec!["Google".to_owned(), "DuckDuckGo".to_owned()],
},
);
@@ -267,22 +275,22 @@ mod tests {
fn test_filter_with_lists_wildcard() -> Result<(), Box<dyn std::error::Error>> {
let mut map_to_be_filtered = HashMap::new();
map_to_be_filtered.insert(
- "https://www.example.com".to_string(),
+ "https://www.example.com".to_owned(),
SearchResult {
- title: "Example Domain".to_string(),
- url: "https://www.example.com".to_string(),
+ title: "Example Domain".to_owned(),
+ url: "https://www.example.com".to_owned(),
description: "This domain is for use in illustrative examples in documents."
- .to_string(),
- engine: vec!["Google".to_string(), "Bing".to_string()],
+ .to_owned(),
+ engine: smallvec!["Google".to_owned(), "Bing".to_owned()],
},
);
map_to_be_filtered.insert(
- "https://www.rust-lang.org/".to_string(),
+ "https://www.rust-lang.org/".to_owned(),
SearchResult {
- title: "Rust Programming Language".to_string(),
- url: "https://www.rust-lang.org/".to_string(),
- description: "A systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.".to_string(),
- engine: vec!["Google".to_string(), "DuckDuckGo".to_string()],
+ title: "Rust Programming Language".to_owned(),
+ url: "https://www.rust-lang.org/".to_owned(),
+ description: "A systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.".to_owned(),
+ engine: smallvec!["Google".to_owned(), "DuckDuckGo".to_owned()],
},
);
@@ -327,13 +335,13 @@ mod tests {
fn test_filter_with_lists_invalid_regex() {
let mut map_to_be_filtered = HashMap::new();
map_to_be_filtered.insert(
- "https://www.example.com".to_string(),
+ "https://www.example.com".to_owned(),
SearchResult {
- title: "Example Domain".to_string(),
- url: "https://www.example.com".to_string(),
+ title: "Example Domain".to_owned(),
+ url: "https://www.example.com".to_owned(),
description: "This domain is for use in illustrative examples in documents."
- .to_string(),
- engine: vec!["Google".to_string(), "Bing".to_string()],
+ .to_owned(),
+ engine: smallvec!["Google".to_owned(), "Bing".to_owned()],
},
);
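
For reference, the blocklist/allowlist step performed by `filter_with_lists` boils down to moving map entries whose url, title, or description match a regex from one map into another. A simplified, self-contained sketch (assuming only the `regex` crate, with plain strings standing in for `SearchResult`):

```rust
use std::collections::HashMap;

use regex::Regex;

fn filter_with_patterns(
    source: &mut HashMap<String, String>,
    sink: &mut HashMap<String, String>,
    patterns: &[&str],
) -> Result<(), regex::Error> {
    for pattern in patterns {
        let re = Regex::new(pattern.trim())?;
        // Collect the matching keys first so the map can be mutated afterwards.
        let matching: Vec<String> = source
            .iter()
            .filter(|(url, text)| {
                re.is_match(&url.to_lowercase()) || re.is_match(&text.to_lowercase())
            })
            .map(|(url, _)| url.clone())
            .collect();
        for url in matching {
            if let Some(entry) = source.remove(&url) {
                sink.insert(url, entry);
            }
        }
    }
    Ok(())
}

fn main() -> Result<(), regex::Error> {
    let mut results = HashMap::from([
        ("https://www.example.com".to_owned(), "Example Domain".to_owned()),
        ("https://www.rust-lang.org/".to_owned(), "Rust Programming Language".to_owned()),
    ]);
    let mut blocked = HashMap::new();
    filter_with_patterns(&mut results, &mut blocked, &["example"])?;
    println!("kept={} blocked={}", results.len(), blocked.len());
    Ok(())
}
```
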
diff --git a/src/results/user_agent.rs b/src/results/user_agent.rs
index 8946e84..ab2811b 100644
--- a/src/results/user_agent.rs
+++ b/src/results/user_agent.rs
@@ -1,30 +1,34 @@
//! This module provides the functionality to generate a random user agent string.
+use std::sync::OnceLock;
+
use fake_useragent::{Browsers, UserAgents, UserAgentsBuilder};
/// A static variable which stores the initially built `UserAgents` struct, so that it can be reused
/// again and again without the need to reinitialize the `UserAgents` struct.
-static USER_AGENTS: once_cell::sync::Lazy<UserAgents> = once_cell::sync::Lazy::new(|| {
- UserAgentsBuilder::new()
- .cache(false)
- .dir("/tmp")
- .thread(1)
- .set_browsers(
- Browsers::new()
- .set_chrome()
- .set_safari()
- .set_edge()
- .set_firefox()
- .set_mozilla(),
- )
- .build()
-});
+static USER_AGENTS: OnceLock<UserAgents> = OnceLock::new();
/// A function to generate random user agent to improve privacy of the user.
///
/// # Returns
///
/// A randomly generated user agent string.
-pub fn random_user_agent() -> String {
- USER_AGENTS.random().to_string()
+pub fn random_user_agent() -> &'static str {
+ USER_AGENTS
+ .get_or_init(|| {
+ UserAgentsBuilder::new()
+ .cache(false)
+ .dir("/tmp")
+ .thread(1)
+ .set_browsers(
+ Browsers::new()
+ .set_chrome()
+ .set_safari()
+ .set_edge()
+ .set_firefox()
+ .set_mozilla(),
+ )
+ .build()
+ })
+ .random()
}
diff --git a/src/server/routes.rs b/src/server/routes.rs
index 818fac5..96ad737 100644
--- a/src/server/routes.rs
+++ b/src/server/routes.rs
@@ -2,7 +2,10 @@
//! meta search engine website and provide appropriate response to each route/page
//! when requested.
-use std::fs::read_to_string;
+use std::{
+ fs::{read_to_string, File},
+ io::{BufRead, BufReader, Read},
+};
use crate::{
cache::cacher::RedisCache,
@@ -13,9 +16,14 @@ use crate::{
};
use actix_web::{get, web, HttpRequest, HttpResponse};
use handlebars::Handlebars;
+use regex::Regex;
use serde::Deserialize;
use tokio::join;
+// ---- Constants ----
+/// Initialize redis cache connection once and store it on the heap.
+static REDIS_CACHE: async_once_cell::OnceCell<RedisCache> = async_once_cell::OnceCell::new();
+
/// A named struct which deserializes all the user provided search parameters and stores them.
#[derive(Deserialize)]
struct SearchParams {
@@ -25,6 +33,7 @@ struct SearchParams {
/// It stores the search parameter `page` (or pageno in simple words)
/// of the search url.
page: Option<u32>,
+ safesearch: Option<u8>,
}
/// Handles the route of index page or main page of the `websurfx` meta search engine website.
@@ -53,13 +62,13 @@ pub async fn not_found(
/// A named struct which is used to deserialize the cookies fetched from the client side.
#[allow(dead_code)]
#[derive(Deserialize)]
-struct Cookie {
+struct Cookie<'a> {
/// It stores the theme name used in the website.
- theme: String,
+ theme: &'a str,
/// It stores the colorscheme name used for the website theme.
- colorscheme: String,
+ colorscheme: &'a str,
/// It stores the user selected upstream search engines selected from the UI.
- engines: Vec<String>,
+ engines: Vec<&'a str>,
}
/// Handles the route of search page of the `websurfx` meta search engine website and it takes
@@ -95,42 +104,58 @@ pub async fn search(
None => 1,
};
+ let safe_search: u8 = match config.safe_search {
+ 3..=4 => config.safe_search,
+ _ => match ¶ms.safesearch {
+ Some(safesearch) => match safesearch {
+ 0..=2 => *safesearch,
+ _ => 1,
+ },
+ None => config.safe_search,
+ },
+ };
+
let (_, results, _) = join!(
results(
format!(
- "http://{}:{}/search?q={}&page={}",
+ "http://{}:{}/search?q={}&page={}&safesearch={}",
config.binding_ip,
config.port,
query,
- page - 1
+ page - 1,
+ safe_search
),
&config,
- query.to_string(),
+ query,
page - 1,
req.clone(),
+ safe_search
),
results(
format!(
- "http://{}:{}/search?q={}&page={}",
- config.binding_ip, config.port, query, page
+ "http://{}:{}/search?q={}&page={}&safesearch={}",
+ config.binding_ip, config.port, query, page, safe_search
),
&config,
- query.to_string(),
+ query,
page,
req.clone(),
+ safe_search
),
results(
format!(
- "http://{}:{}/search?q={}&page={}",
+ "http://{}:{}/search?q={}&page={}&safesearch={}",
config.binding_ip,
config.port,
query,
- page + 1
+ page + 1,
+ safe_search
),
&config,
- query.to_string(),
+ query,
page + 1,
req.clone(),
+ safe_search
)
);
@@ -161,30 +186,53 @@ pub async fn search(
async fn results(
url: String,
config: &Config,
- query: String,
+ query: &str,
page: u32,
req: HttpRequest,
+ safe_search: u8,
+ ) -> Result<SearchResults, Box<dyn std::error::Error>> {
// Initialize redis cache connection struct
- let mut redis_cache = RedisCache::new(config.redis_url.clone())?;
+ let mut redis_cache: RedisCache = REDIS_CACHE
+ .get_or_init(async {
+ // Initialize the redis cache connection pool only once and store it on the heap.
+ RedisCache::new(&config.redis_url, 5).await.unwrap()
+ })
+ .await
+ .clone();
// fetch the cached results json.
- let cached_results_json = redis_cache.cached_json(&url);
+ let cached_results_json: Result<String, error_stack::Report<crate::cache::error::PoolError>> =
+ redis_cache.clone().cached_json(&url).await;
// check if fetched cache results was indeed fetched or it was an error and if so
// handle the data accordingly.
match cached_results_json {
- Ok(results) => Ok(serde_json::from_str::(&results).unwrap()),
+ Ok(results) => Ok(serde_json::from_str::(&results)?),
Err(_) => {
+ if safe_search == 4 {
+ let mut results: SearchResults = SearchResults::default();
+ let mut _flag: bool =
+ is_match_from_filter_list(file_path(FileType::BlockList)?, query)?;
+ _flag = !is_match_from_filter_list(file_path(FileType::AllowList)?, query)?;
+
+ if _flag {
+ results.set_disallowed();
+ results.add_style(&config.style);
+ results.set_page_query(query);
+ redis_cache
+ .cache_results(&serde_json::to_string(&results)?, &url)
+ .await?;
+ return Ok(results);
+ }
+ }
+
// check if the cookie value is empty or not if it is empty then use the
// default selected upstream search engines from the config file otherwise
// parse the non-empty cookie and grab the user selected engines from the
// UI and use that.
- let mut results: crate::results::aggregation_models::SearchResults = match req
- .cookie("appCookie")
- {
+ let mut results: SearchResults = match req.cookie("appCookie") {
Some(cookie_value) => {
let cookie_value: Cookie = serde_json::from_str(cookie_value.name_value().1)?;
- let engines = cookie_value
+ let engines: Vec<EngineHandler> = cookie_value
.engines
.iter()
.filter_map(|name| EngineHandler::new(name))
@@ -195,8 +243,9 @@ async fn results(
page,
config.aggregator.random_delay,
config.debug,
- engines,
+ &engines,
config.request_timeout,
+ safe_search,
)
.await?
}
@@ -206,19 +255,41 @@ async fn results(
page,
config.aggregator.random_delay,
config.debug,
- config.upstream_search_engines.clone(),
+ &config.upstream_search_engines,
config.request_timeout,
+ safe_search,
)
.await?
}
};
- results.add_style(config.style.clone());
- redis_cache.cache_results(serde_json::to_string(&results)?, &url)?;
+ if results.engine_errors_info().is_empty() && results.results().is_empty() {
+ results.set_filtered();
+ }
+ results.add_style(&config.style);
+ redis_cache
+ .cache_results(&serde_json::to_string(&results)?, &url)
+ .await?;
Ok(results)
}
}
}
+fn is_match_from_filter_list(
+ file_path: &str,
+ query: &str,
+) -> Result<bool, Box<dyn std::error::Error>> {
+ let mut flag = false;
+ let mut reader = BufReader::new(File::open(file_path)?);
+ for line in reader.by_ref().lines() {
+ let re = Regex::new(&line?)?;
+ if re.is_match(query) {
+ flag = true;
+ break;
+ }
+ }
+ Ok(flag)
+}
+
/// Handles the route of robots.txt page of the `websurfx` meta search engine website.
#[get("/robots.txt")]
pub async fn robots_data(_req: HttpRequest) -> Result<HttpResponse, Box<dyn std::error::Error>> {
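
The safe-search resolution near the top of the `search` handler is a two-level override: a server-side level of 3 or 4 always wins, otherwise the `safesearch` query parameter is honoured only in the 0..=2 range, and a missing parameter falls back to the configured level. A small sketch of that decision, using a hypothetical helper name:

```rust
/// Resolve the effective safe-search level from the config value and the query parameter.
fn effective_safe_search(config_level: u8, param: Option<u8>) -> u8 {
    match config_level {
        // A server-enforced level of 3 or 4 cannot be lowered by the client.
        3..=4 => config_level,
        _ => match param {
            Some(level @ 0..=2) => level,
            Some(_) => 1, // out-of-range client values fall back to level 1
            None => config_level,
        },
    }
}

fn main() {
    assert_eq!(effective_safe_search(4, Some(0)), 4); // server-enforced level wins
    assert_eq!(effective_safe_search(2, Some(1)), 1); // client request honoured
    assert_eq!(effective_safe_search(2, Some(9)), 1); // out-of-range request clamped
    assert_eq!(effective_safe_search(2, None), 2); // fall back to the config value
    println!("ok");
}
```
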
diff --git a/websurfx/config.lua b/websurfx/config.lua
index 4f2633c..09b418d 100644
--- a/websurfx/config.lua
+++ b/websurfx/config.lua
@@ -10,6 +10,21 @@ production_use = false -- whether to use production mode or not (in other words
-- if production_use is set to true
-- There will be a random delay before sending the request to the search engines, this is to prevent DDoSing the upstream search engines from a large number of simultaneous requests.
request_timeout = 30 -- timeout for the search requests sent to the upstream search engines to be fetched (value in seconds).
+rate_limiter = {
+ number_of_requests = 20, -- The number of requests that are allowed within the provided time limit.
+ time_limit = 3, -- The time limit (in seconds) within which the allowed number of requests should be accepted.
+}
+
+-- ### Search ###
+-- Filter results based on different levels. The levels provided are:
+-- {{
+-- 0 - None
+-- 1 - Low
+-- 2 - Moderate
+-- 3 - High
+-- 4 - Aggressive
+-- }}
+safe_search = 2
-- ### Website ###
-- The different colorschemes provided are:
@@ -34,4 +49,7 @@ theme = "simple" -- the theme name which should be used for the website
redis_url = "redis://127.0.0.1:8082" -- redis connection url address on which the client should connect on.
-- ### Search Engines ###
-upstream_search_engines = { DuckDuckGo = true, Searx = false } -- select the upstream search engines from which the results should be fetched.
+upstream_search_engines = {
+ DuckDuckGo = true,
+ Searx = false,
+} -- select the upstream search engines from which the results should be fetched.