0
0
mirror of https://github.com/neon-mmd/websurfx.git synced 2024-12-22 20:38:22 -05:00

Merge branch 'rolling' into fix-gitpod-setup

This commit is contained in:
alamin655 2023-09-14 12:34:52 +05:30 committed by GitHub
commit 2a04e647d5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
27 changed files with 594 additions and 312 deletions

121
Cargo.lock generated
View File

@ -57,6 +57,18 @@ dependencies = [
"pin-project-lite",
]
[[package]]
name = "actix-governor"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "46ff2d40f2bc627b8054c5e20fa6b0b0cf9428699b54bd41634e9ae3098ad555"
dependencies = [
"actix-http",
"actix-web",
"futures 0.3.28",
"governor",
]
[[package]]
name = "actix-http"
version = "3.4.0"
@ -590,7 +602,7 @@ version = "0.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "888604f00b3db336d2af898ec3c1d5d0ddf5e6d462220f2ededc33a87ac4bbd5"
dependencies = [
"time 0.1.45",
"time 0.1.43",
"url 1.7.2",
]
@ -618,7 +630,7 @@ dependencies = [
"publicsuffix",
"serde",
"serde_json",
"time 0.1.45",
"time 0.1.43",
"try_from",
"url 1.7.2",
]
@ -817,6 +829,19 @@ dependencies = [
"syn 2.0.32",
]
[[package]]
name = "dashmap"
version = "5.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "978747c1d849a7d2ee5e8adc0159961c48fb7e5db2f06af6723b80123bb53856"
dependencies = [
"cfg-if 1.0.0",
"hashbrown 0.14.0",
"lock_api 0.4.10",
"once_cell",
"parking_lot_core 0.9.8",
]
[[package]]
name = "deranged"
version = "0.3.8"
@ -1162,6 +1187,12 @@ version = "0.3.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65"
[[package]]
name = "futures-timer"
version = "3.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c"
[[package]]
name = "futures-util"
version = "0.3.28"
@ -1225,6 +1256,24 @@ version = "0.28.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6fb8d784f27acf97159b40fc4db5ecd8aa23b9ad5ef69cdd136d3bc80665f0c0"
[[package]]
name = "governor"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c390a940a5d157878dd057c78680a33ce3415bcd05b4799509ea44210914b4d5"
dependencies = [
"cfg-if 1.0.0",
"dashmap",
"futures 0.3.28",
"futures-timer",
"no-std-compat",
"nonzero_ext",
"parking_lot 0.12.1",
"quanta",
"rand 0.8.5",
"smallvec 1.11.0",
]
[[package]]
name = "h2"
version = "0.1.26"
@ -1289,6 +1338,12 @@ version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
[[package]]
name = "hashbrown"
version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a"
[[package]]
name = "hermit-abi"
version = "0.3.2"
@ -1410,7 +1465,7 @@ dependencies = [
"log",
"net2",
"rustc_version 0.2.3",
"time 0.1.45",
"time 0.1.43",
"tokio 0.1.22",
"tokio-buf",
"tokio-executor",
@ -1511,7 +1566,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99"
dependencies = [
"autocfg 1.1.0",
"hashbrown",
"hashbrown 0.12.3",
]
[[package]]
@ -1672,6 +1727,15 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
[[package]]
name = "mach"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b823e83b2affd8f40a9ee8c29dbc56404c1e34cd2710921f2801e2cf29527afa"
dependencies = [
"libc",
]
[[package]]
name = "markup5ever"
version = "0.8.1"
@ -1887,6 +1951,18 @@ version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ab250442c86f1850815b5d268639dff018c0627022bc1940eb2d642ca1ce12f0"
[[package]]
name = "no-std-compat"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b93853da6d84c2e3c7d730d6473e8817692dd89be387eb01b94d7f108ecb5b8c"
[[package]]
name = "nonzero_ext"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38bf9645c8b145698bb0b18a4637dcacbc421ea49bef2317e4fd8065a387cf21"
[[package]]
name = "num-traits"
version = "0.2.16"
@ -2307,6 +2383,22 @@ dependencies = [
"url 2.4.1",
]
[[package]]
name = "quanta"
version = "0.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "20afe714292d5e879d8b12740aa223c6a88f118af41870e8b6196e39a02238a8"
dependencies = [
"crossbeam-utils 0.8.16",
"libc",
"mach",
"once_cell",
"raw-cpuid",
"wasi 0.10.2+wasi-snapshot-preview1",
"web-sys",
"winapi 0.3.9",
]
[[package]]
name = "quote"
version = "0.6.13"
@ -2461,6 +2553,15 @@ dependencies = [
"rand_core 0.3.1",
]
[[package]]
name = "raw-cpuid"
version = "10.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c297679cb867470fa8c9f67dbba74a78d78e3e98d7cf2b08d6d71540f797332"
dependencies = [
"bitflags 1.3.2",
]
[[package]]
name = "rayon"
version = "1.7.0"
@ -2583,7 +2684,7 @@ dependencies = [
"serde",
"serde_json",
"serde_urlencoded 0.5.5",
"time 0.1.45",
"time 0.1.43",
"tokio 0.1.22",
"tokio-executor",
"tokio-io",
@ -3157,12 +3258,11 @@ checksum = "3bf63baf9f5039dadc247375c29eb13706706cfde997d0330d05aa63a77d8820"
[[package]]
name = "time"
version = "0.1.45"
version = "0.1.43"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a"
checksum = "ca8a50ef2360fbd1eeb0ecd46795a87a19024eb4b53c5dc916ca1fd95fe62438"
dependencies = [
"libc",
"wasi 0.10.0+wasi-snapshot-preview1",
"winapi 0.3.9",
]
@ -3609,9 +3709,9 @@ dependencies = [
[[package]]
name = "wasi"
version = "0.10.0+wasi-snapshot-preview1"
version = "0.10.2+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f"
checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6"
[[package]]
name = "wasi"
@ -3701,6 +3801,7 @@ version = "0.20.7"
dependencies = [
"actix-cors",
"actix-files",
"actix-governor",
"actix-web",
"async-once-cell",
"async-trait",

View File

@ -32,6 +32,7 @@ futures = {version="0.3.28"}
dhat = {version="0.3.2", optional = true}
mimalloc = { version = "0.1.38", default-features = false }
async-once-cell = {version="0.5.3"}
actix-governor = {version="0.4.1"}
[dev-dependencies]
rusty-hook = "^0.11.2"

11
src/cache/cacher.rs vendored
View File

@ -10,17 +10,14 @@ use super::error::PoolError;
/// A named struct which stores the redis Connection url address to which the client will
/// connect to.
///
/// # Fields
///
/// * `connection_pool` - It stores a pool of connections ready to be used.
/// * `pool_size` - It stores the size of the connection pool (in other words the number of
/// connections that should be stored in the pool).
/// * `current_connection` - It stores the index of which connection is being used at the moment.
#[derive(Clone)]
pub struct RedisCache {
/// It stores a pool of connections ready to be used.
connection_pool: Vec<ConnectionManager>,
/// It stores the size of the connection pool (in other words the number of
/// connections that should be stored in the pool).
pool_size: u8,
/// It stores the index of which connection is being used at the moment.
current_connection: u8,
}

9
src/cache/error.rs vendored
View File

@ -5,15 +5,12 @@ use std::fmt;
use redis::RedisError;
/// A custom error type used for handling redis async pool associated errors.
///
/// This enum provides variants three different categories of errors:
/// * `RedisError` - This variant handles all errors related to `RedisError`,
/// * `PoolExhaustionWithConnectionDropError` - This variant handles the error
/// which occurs when all the connections in the connection pool return a connection
/// dropped redis error.
#[derive(Debug)]
pub enum PoolError {
/// This variant handles all errors related to `RedisError`,
RedisError(RedisError),
/// This variant handles the errors which occurs when all the connections
/// in the connection pool return a connection dropped redis error.
PoolExhaustionWithConnectionDropError,
}

3
src/cache/mod.rs vendored
View File

@ -1,2 +1,5 @@
//! This module provides the modules which provide the functionality to cache the aggregated
//! results fetched and aggregated from the upstream search engines in a json format.
pub mod cacher;
pub mod error;

View File

@ -1,2 +1,4 @@
//! This module provides the modules which handles the functionality to parse the lua config
//! and convert the config options into rust readable form.
pub mod parser;
pub mod parser_models;

View File

@ -3,52 +3,42 @@
use crate::handler::paths::{file_path, FileType};
use super::parser_models::Style;
use crate::models::parser_models::{AggregatorConfig, RateLimiter, Style};
use log::LevelFilter;
use mlua::Lua;
use std::{collections::HashMap, fs, thread::available_parallelism};
/// A named struct which stores the parsed config file options.
///
/// # Fields
//
/// * `port` - It stores the parsed port number option on which the server should launch.
/// * `binding_ip` - It stores the parsed ip address option on which the server should launch
/// * `style` - It stores the theming options for the website.
/// * `redis_url` - It stores the redis connection url address on which the redis
/// client should connect.
/// * `aggregator` - It stores the option to whether enable or disable production use.
/// * `logging` - It stores the option to whether enable or disable logs.
/// * `debug` - It stores the option to whether enable or disable debug mode.
/// * `upstream_search_engines` - It stores all the engine names that were enabled by the user.
/// * `request_timeout` - It stores the time (secs) which controls the server request timeout.
/// * `threads` - It stores the number of threads which controls the app will use to run.
#[derive(Clone)]
pub struct Config {
/// It stores the parsed port number option on which the server should launch.
pub port: u16,
/// It stores the parsed ip address option on which the server should launch
pub binding_ip: String,
/// It stores the theming options for the website.
pub style: Style,
/// It stores the redis connection url address on which the redis
/// client should connect.
pub redis_url: String,
/// It stores the option to whether enable or disable production use.
pub aggregator: AggregatorConfig,
/// It stores the option to whether enable or disable logs.
pub logging: bool,
/// It stores the option to whether enable or disable debug mode.
pub debug: bool,
pub upstream_search_engines: Vec<crate::engines::engine_models::EngineHandler>,
/// It stores all the engine names that were enabled by the user.
pub upstream_search_engines: Vec<crate::models::engine_models::EngineHandler>,
/// It stores the time (secs) which controls the server request timeout.
pub request_timeout: u8,
/// It stores the number of threads which controls the app will use to run.
pub threads: u8,
/// It stores configuration options for the ratelimiting middleware.
pub rate_limiter: RateLimiter,
/// It stores the level of safe search to be used for restricting content in the
/// search results.
pub safe_search: u8,
}
/// Configuration options for the aggregator.
///
/// # Fields
///
/// * `random_delay` - It stores the option to whether enable or disable random delays between
/// requests.
#[derive(Clone)]
pub struct AggregatorConfig {
pub random_delay: bool,
}
impl Config {
/// A function which parses the config.lua file and puts all the parsed options in the newly
/// constructed Config struct and returns it.
@ -90,6 +80,8 @@ impl Config {
parsed_threads
};
let rate_limiter = globals.get::<_, HashMap<String, u8>>("rate_limiter")?;
let parsed_safe_search: u8 = globals.get::<_, u8>("safe_search")?;
let safe_search: u8 = match parsed_safe_search {
0..=4 => parsed_safe_search,
@ -117,16 +109,25 @@ impl Config {
.get::<_, HashMap<String, bool>>("upstream_search_engines")?
.into_iter()
.filter_map(|(key, value)| value.then_some(key))
.filter_map(|engine| crate::engines::engine_models::EngineHandler::new(&engine))
.filter_map(|engine| crate::models::engine_models::EngineHandler::new(&engine))
.collect(),
request_timeout: globals.get::<_, u8>("request_timeout")?,
threads,
rate_limiter: RateLimiter {
number_of_requests: rate_limiter["number_of_requests"],
time_limit: rate_limiter["time_limit"],
},
safe_search,
})
}
}
/// a helper function that sets the proper logging level
///
/// # Arguments
///
/// * `debug` - It takes the option to whether enable or disable debug mode.
/// * `logging` - It takes the option to whether enable or disable logs.
fn set_logging_level(debug: bool, logging: bool) {
if let Ok(pkg_env_var) = std::env::var("PKG_ENV") {
if pkg_env_var.to_lowercase() == "dev" {

View File

@ -7,9 +7,9 @@ use std::collections::HashMap;
use reqwest::header::HeaderMap;
use scraper::{Html, Selector};
use crate::results::aggregation_models::SearchResult;
use crate::models::aggregation_models::SearchResult;
use super::engine_models::{EngineError, SearchEngine};
use crate::models::engine_models::{EngineError, SearchEngine};
use error_stack::{Report, Result, ResultExt};
@ -19,24 +19,6 @@ pub struct DuckDuckGo;
#[async_trait::async_trait]
impl SearchEngine for DuckDuckGo {
/// This function scrapes results from the upstream engine duckduckgo and puts all the scraped
/// results like title, visiting_url (href in html),engine (from which engine it was fetched from)
/// and description in a RawSearchResult and then adds that to HashMap whose keys are url and
/// values are RawSearchResult struct and then returns it within a Result enum.
///
/// # Arguments
///
/// * `query` - Takes the user provided query to query to the upstream search engine with.
/// * `page` - Takes an u32 as an argument.
/// * `user_agent` - Takes a random user agent string as an argument.
/// * `request_timeout` - Takes a time (secs) as a value which controls the server request timeout.
///
/// # Errors
///
/// Returns an `EngineErrorKind` if the user is not connected to the internet or if their is failure to
/// reach the above `upstream search engine` page or if the `upstream search engine` is unable to
/// provide results for the requested search query and also returns error if the scraping selector
/// or HeaderMap fails to initialize.
async fn results(
&self,
query: &str,

View File

@ -1,108 +0,0 @@
//! This module provides the error enum to handle different errors associated while requesting data from
//! the upstream search engines with the search query provided by the user.
use crate::results::aggregation_models::SearchResult;
use error_stack::{Result, ResultExt};
use std::{collections::HashMap, fmt, time::Duration};
/// A custom error type used for handle engine associated errors.
///
/// This enum provides variants three different categories of errors:
/// * `RequestError` - This variant handles all request related errors like forbidden, not found,
/// etc.
/// * `EmptyResultSet` - This variant handles the not results found error provide by the upstream
/// search engines.
/// * `UnexpectedError` - This variant handles all the errors which are unexpected or occur rarely
/// and are errors mostly related to failure in initialization of HeaderMap, Selector errors and
/// all other errors occurring within the code handling the `upstream search engines`.
#[derive(Debug)]
pub enum EngineError {
EmptyResultSet,
RequestError,
UnexpectedError,
}
impl fmt::Display for EngineError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
EngineError::EmptyResultSet => {
write!(f, "The upstream search engine returned an empty result set")
}
EngineError::RequestError => {
write!(
f,
"Error occurred while requesting data from upstream search engine"
)
}
EngineError::UnexpectedError => {
write!(f, "An unexpected error occurred while processing the data")
}
}
}
}
impl error_stack::Context for EngineError {}
/// A trait to define common behavior for all search engines.
#[async_trait::async_trait]
pub trait SearchEngine: Sync + Send {
async fn fetch_html_from_upstream(
&self,
url: &str,
header_map: reqwest::header::HeaderMap,
request_timeout: u8,
) -> Result<String, EngineError> {
// fetch the html from upstream search engine
Ok(reqwest::Client::new()
.get(url)
.timeout(Duration::from_secs(request_timeout as u64)) // Add timeout to request to avoid DDOSing the server
.headers(header_map) // add spoofed headers to emulate human behavior
.send()
.await
.change_context(EngineError::RequestError)?
.text()
.await
.change_context(EngineError::RequestError)?)
}
async fn results(
&self,
query: &str,
page: u32,
user_agent: &str,
request_timeout: u8,
safe_search: u8,
) -> Result<HashMap<String, SearchResult>, EngineError>;
}
pub struct EngineHandler {
engine: Box<dyn SearchEngine>,
name: &'static str,
}
impl Clone for EngineHandler {
fn clone(&self) -> Self {
Self::new(self.name).unwrap()
}
}
impl EngineHandler {
/// parses an engine name into an engine handler, returns none if the engine is unknown
pub fn new(engine_name: &str) -> Option<Self> {
let engine: (&'static str, Box<dyn SearchEngine>) =
match engine_name.to_lowercase().as_str() {
"duckduckgo" => ("duckduckgo", Box::new(super::duckduckgo::DuckDuckGo)),
"searx" => ("searx", Box::new(super::searx::Searx)),
_ => return None,
};
Some(Self {
engine: engine.1,
name: engine.0,
})
}
pub fn into_name_engine(self) -> (&'static str, Box<dyn SearchEngine>) {
(self.name, self.engine)
}
}

View File

@ -1,3 +1,7 @@
//! This module provides different modules which handles the functionlity to fetch results from the
//! upstream search engines based on user requested queries. Also provides different models to
//! provide a standard functions to be implemented for all the upstream search engine handling
//! code. Moreover, it also provides a custom error for the upstream search engine handling code.
pub mod duckduckgo;
pub mod engine_models;
pub mod searx;

View File

@ -6,9 +6,8 @@ use reqwest::header::HeaderMap;
use scraper::{Html, Selector};
use std::collections::HashMap;
use crate::results::aggregation_models::SearchResult;
use super::engine_models::{EngineError, SearchEngine};
use crate::models::aggregation_models::SearchResult;
use crate::models::engine_models::{EngineError, SearchEngine};
use error_stack::{Report, Result, ResultExt};
/// A new Searx engine type defined in-order to implement the `SearchEngine` trait which allows to
@ -17,25 +16,6 @@ pub struct Searx;
#[async_trait::async_trait]
impl SearchEngine for Searx {
/// This function scrapes results from the upstream engine duckduckgo and puts all the scraped
/// results like title, visiting_url (href in html),engine (from which engine it was fetched from)
/// and description in a RawSearchResult and then adds that to HashMap whose keys are url and
/// values are RawSearchResult struct and then returns it within a Result enum.
///
/// # Arguments
///
/// * `query` - Takes the user provided query to query to the upstream search engine with.
/// * `page` - Takes an u32 as an argument.
/// * `user_agent` - Takes a random user agent string as an argument.
/// * `request_timeout` - Takes a time (secs) as a value which controls the server request timeout.
///
/// # Errors
///
/// Returns an `EngineErrorKind` if the user is not connected to the internet or if their is failure to
/// reach the above `upstream search engine` page or if the `upstream search engine` is unable to
/// provide results for the requested search query and also returns error if the scraping selector
/// or HeaderMap fails to initialize.
async fn results(
&self,
query: &str,

View File

@ -1 +1,5 @@
//! This module provides modules which provide the functionality to handle paths for different
//! files present on different paths and provide one appropriate path on which it is present and
//! can be used.
pub mod paths;

View File

@ -7,42 +7,46 @@ use std::path::Path;
use std::sync::OnceLock;
// ------- Constants --------
static PUBLIC_DIRECTORY_NAME: &str = "public";
static COMMON_DIRECTORY_NAME: &str = "websurfx";
static CONFIG_FILE_NAME: &str = "config.lua";
static ALLOWLIST_FILE_NAME: &str = "allowlist.txt";
static BLOCKLIST_FILE_NAME: &str = "blocklist.txt";
/// The constant holding the name of the theme folder.
const PUBLIC_DIRECTORY_NAME: &str = "public";
/// The constant holding the name of the common folder.
const COMMON_DIRECTORY_NAME: &str = "websurfx";
/// The constant holding the name of the config file.
const CONFIG_FILE_NAME: &str = "config.lua";
/// The constant holding the name of the AllowList text file.
const ALLOWLIST_FILE_NAME: &str = "allowlist.txt";
/// The constant holding the name of the BlockList text file.
const BLOCKLIST_FILE_NAME: &str = "blocklist.txt";
/// An enum type which provides different variants to handle paths for various files/folders.
#[derive(Hash, PartialEq, Eq, Debug)]
pub enum FileType {
/// This variant handles all the paths associated with the config file.
Config,
/// This variant handles all the paths associated with the Allowlist text file.
AllowList,
/// This variant handles all the paths associated with the BlockList text file.
BlockList,
/// This variant handles all the paths associated with the public folder (Theme folder).
Theme,
}
/// A static variable which stores the different filesystem paths for various file/folder types.
static FILE_PATHS_FOR_DIFF_FILE_TYPES: OnceLock<HashMap<FileType, Vec<String>>> = OnceLock::new();
/// A helper function which returns an appropriate config file path checking if the config
/// file exists on that path.
/// A function which returns an appropriate path for thr provided file type by checking if the path
/// for the given file type exists on that path.
///
/// # Error
///
/// Returns a `config file not found!!` error if the config file is not present under following
/// paths which are:
/// 1. `~/.config/websurfx/` if it not present here then it fallbacks to the next one (2)
/// 2. `/etc/xdg/websurfx/config.lua` if it is not present here then it fallbacks to the next
/// one (3).
/// 3. `websurfx/` (under project folder ( or codebase in other words)) if it is not present
/// here then it returns an error as mentioned above.
/// A function which returns an appropriate theme directory path checking if the theme
/// directory exists on that path.
/// Returns a `<File Name> folder/file not found!!` error if the give file_type folder/file is not
/// present on the path on which it is being tested.
///
/// # Error
/// # Example
///
/// If this function is give the file_type of Theme variant then the theme folder is checked by the
/// following steps:
///
/// Returns a `Theme (public) folder not found!!` error if the theme folder is not present under following
/// paths which are:
/// 1. `/opt/websurfx` if it not present here then it fallbacks to the next one (2)
/// 2. Under project folder ( or codebase in other words) if it is not present
/// here then it returns an error as mentioned above.
@ -110,6 +114,6 @@ pub fn file_path(file_type: FileType) -> Result<&'static str, Error> {
// if no of the configs above exist, return error
Err(Error::new(
std::io::ErrorKind::NotFound,
format!("{:?} file not found!!", file_type),
format!("{:?} file/folder not found!!", file_type),
))
}

View File

@ -1,25 +1,26 @@
//! This main library module provides the functionality to provide and handle the Tcp server
//! and register all the routes for the `websurfx` meta search engine website.
#![forbid(unsafe_code, clippy::panic)]
#![deny(missing_docs, clippy::missing_docs_in_private_items, clippy::perf)]
#![warn(clippy::cognitive_complexity, rust_2018_idioms)]
pub mod cache;
pub mod config;
pub mod engines;
pub mod handler;
pub mod models;
pub mod results;
pub mod server;
use std::net::TcpListener;
use crate::server::routes;
use crate::server::router;
use actix_cors::Cors;
use actix_files as fs;
use actix_web::{
dev::Server,
http::header,
middleware::{Compress, Logger},
web, App, HttpServer,
};
use actix_governor::{Governor, GovernorConfigBuilder};
use actix_web::{dev::Server, http::header, middleware::Logger, web, App, HttpServer};
use config::parser::Config;
use handlebars::Handlebars;
use handler::paths::{file_path, FileType};
@ -45,7 +46,7 @@ use handler::paths::{file_path, FileType};
/// let server = run(listener,config).expect("Failed to start server");
/// ```
pub fn run(listener: TcpListener, config: Config) -> std::io::Result<Server> {
let mut handlebars: Handlebars = Handlebars::new();
let mut handlebars: Handlebars<'_> = Handlebars::new();
let public_folder_path: &str = file_path(FileType::Theme)?;
@ -53,7 +54,7 @@ pub fn run(listener: TcpListener, config: Config) -> std::io::Result<Server> {
.register_templates_directory(".html", format!("{}/templates", public_folder_path))
.unwrap();
let handlebars_ref: web::Data<Handlebars> = web::Data::new(handlebars);
let handlebars_ref: web::Data<Handlebars<'_>> = web::Data::new(handlebars);
let cloned_config_threads_opt: u8 = config.threads;
@ -69,11 +70,17 @@ pub fn run(listener: TcpListener, config: Config) -> std::io::Result<Server> {
]);
App::new()
.wrap(Logger::default()) // added logging middleware for logging.
.app_data(handlebars_ref.clone())
.app_data(web::Data::new(config.clone()))
.wrap(cors)
.wrap(Logger::default()) // added logging middleware for logging.
.wrap(Compress::default()) // compress request headers to reduce memory usage.
.wrap(Governor::new(
&GovernorConfigBuilder::default()
.per_second(config.rate_limiter.time_limit as u64)
.burst_size(config.rate_limiter.number_of_requests as u32)
.finish()
.unwrap(),
))
// Serve images and static files (css and js files).
.service(
fs::Files::new("/static", format!("{}/static", public_folder_path))
@ -83,12 +90,12 @@ pub fn run(listener: TcpListener, config: Config) -> std::io::Result<Server> {
fs::Files::new("/images", format!("{}/images", public_folder_path))
.show_files_listing(),
)
.service(routes::robots_data) // robots.txt
.service(routes::index) // index page
.service(routes::search) // search page
.service(routes::about) // about page
.service(routes::settings) // settings page
.default_service(web::route().to(routes::not_found)) // error page
.service(router::robots_data) // robots.txt
.service(router::index) // index page
.service(server::routes::search::search) // search page
.service(router::about) // about page
.service(router::settings) // settings page
.default_service(web::route().to(router::not_found)) // error page
})
.workers(cloned_config_threads_opt as usize)
// Start server on 127.0.0.1 with the user provided port number. for example 127.0.0.1:8080.

View File

@ -4,25 +4,22 @@
use serde::{Deserialize, Serialize};
use smallvec::SmallVec;
use crate::{config::parser_models::Style, engines::engine_models::EngineError};
use super::{engine_models::EngineError, parser_models::Style};
/// A named struct to store the raw scraped search results scraped search results from the
/// upstream search engines before aggregating it.It derives the Clone trait which is needed
/// to write idiomatic rust using `Iterators`.
///
/// # Fields
///
/// * `title` - The title of the search result.
/// * `url` - The url which is accessed when clicked on it
/// (href url in html in simple words).
/// * `description` - The description of the search result.
/// * `engine` - The names of the upstream engines from which this results were provided.
#[derive(Clone, Serialize, Deserialize, Debug)]
#[derive(Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct SearchResult {
/// The title of the search result.
pub title: String,
/// The url which is accessed when clicked on it
pub url: String,
/// The description of the search result.
pub description: String,
/// The names of the upstream engines from which this results were provided.
pub engine: SmallVec<[String; 0]>,
}
@ -64,14 +61,27 @@ impl SearchResult {
}
}
/// A named struct that stores the error info related to the upstream search engines.
#[derive(Serialize, Deserialize, Clone)]
pub struct EngineErrorInfo {
/// It stores the error type which occured while fetching the result from a particular search
/// engine.
pub error: String,
/// It stores the name of the engine that failed to provide the requested search results.
pub engine: String,
/// It stores the name of the color to indicate whether how severe the particular error is (In
/// other words it indicates the severity of the error/issue).
pub severity_color: String,
}
impl EngineErrorInfo {
/// Constructs a new `SearchResult` with the given arguments needed for the struct.
///
/// # Arguments
///
/// * `error` - It takes the error type which occured while fetching the result from a particular
/// search engine.
/// * `engine` - It takes the name of the engine that failed to provide the requested search results.
pub fn new(error: &EngineError, engine: &str) -> Self {
Self {
error: match error {
@ -91,25 +101,26 @@ impl EngineErrorInfo {
/// A named struct to store, serialize, deserialize the all the search results scraped and
/// aggregated from the upstream search engines.
///
/// # Fields
///
/// * `results` - Stores the individual serializable `SearchResult` struct into a vector of
/// `SearchResult` structs.
/// * `page_query` - Stores the current pages search query `q` provided in the search url.
/// * `style` - Stores the theming options for the website.
/// * `engine_errors_info` - Stores the information on which engines failed with their engine name
/// and the type of error that caused it.
/// * `empty_result_set` - Stores a boolean which indicates that no engines gave a result for the
/// given search query.
#[derive(Serialize, Deserialize, Default)]
#[serde(rename_all = "camelCase")]
pub struct SearchResults {
/// Stores the individual serializable `SearchResult` struct into a vector of
pub results: Vec<SearchResult>,
/// Stores the current pages search query `q` provided in the search url.
pub page_query: String,
/// Stores the theming options for the website.
pub style: Style,
/// Stores the information on which engines failed with their engine name
/// and the type of error that caused it.
pub engine_errors_info: Vec<EngineErrorInfo>,
/// Stores the flag option which holds the check value that the following
/// search query was disallowed when the safe search level set to 4 and it
/// was present in the `Blocklist` file.
pub disallowed: bool,
/// Stores the flag option which holds the check value that the following
/// search query was filtered when the safe search level set to 3 and it
/// was present in the `Blocklist` file.
pub filtered: bool,
}
@ -122,9 +133,8 @@ impl SearchResults {
/// and stores it into a vector of `SearchResult` structs.
/// * `page_query` - Takes an argument of current page`s search query `q` provided in
/// the search url.
/// * `empty_result_set` - Takes a boolean which indicates that no engines gave a result for the
/// given search query.
/// * ``
/// * `engine_errors_info` - Takes an array of structs which contains information regarding
/// which engines failed with their names, reason and their severity color name.
pub fn new(
results: Vec<SearchResult>,
page_query: &str,

159
src/models/engine_models.rs Normal file
View File

@ -0,0 +1,159 @@
//! This module provides the error enum to handle different errors associated while requesting data from
//! the upstream search engines with the search query provided by the user.
use super::aggregation_models::SearchResult;
use error_stack::{Result, ResultExt};
use std::{collections::HashMap, fmt, time::Duration};
/// A custom error type used for handle engine associated errors.
#[derive(Debug)]
pub enum EngineError {
/// This variant handles all request related errors like forbidden, not found,
/// etc.
EmptyResultSet,
/// This variant handles the not results found error provide by the upstream
/// search engines.
RequestError,
/// This variant handles all the errors which are unexpected or occur rarely
/// and are errors mostly related to failure in initialization of HeaderMap,
/// Selector errors and all other errors occurring within the code handling
/// the `upstream search engines`.
UnexpectedError,
}
impl fmt::Display for EngineError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
EngineError::EmptyResultSet => {
write!(f, "The upstream search engine returned an empty result set")
}
EngineError::RequestError => {
write!(
f,
"Error occurred while requesting data from upstream search engine"
)
}
EngineError::UnexpectedError => {
write!(f, "An unexpected error occurred while processing the data")
}
}
}
}
impl error_stack::Context for EngineError {}
/// A trait to define common behavior for all search engines.
#[async_trait::async_trait]
pub trait SearchEngine: Sync + Send {
/// This helper function fetches/requests the search results from the upstream search engine in
/// an html form.
///
/// # Arguments
///
/// * `url` - It takes the url of the upstream search engine with the user requested search
/// query appended in the search parameters.
/// * `header_map` - It takes the http request headers to be sent to the upstream engine in
/// order to prevent being detected as a bot. It takes the header as a HeaderMap type.
/// * `request_timeout` - It takes the request timeout value as seconds which is used to limit
/// the amount of time for each request to remain connected when until the results can be provided
/// by the upstream engine.
///
/// # Error
///
/// It returns the html data as a string if the upstream engine provides the data as expected
/// otherwise it returns a custom `EngineError`.
async fn fetch_html_from_upstream(
&self,
url: &str,
header_map: reqwest::header::HeaderMap,
request_timeout: u8,
) -> Result<String, EngineError> {
// fetch the html from upstream search engine
Ok(reqwest::Client::new()
.get(url)
.timeout(Duration::from_secs(request_timeout as u64)) // Add timeout to request to avoid DDOSing the server
.headers(header_map) // add spoofed headers to emulate human behavior
.send()
.await
.change_context(EngineError::RequestError)?
.text()
.await
.change_context(EngineError::RequestError)?)
}
/// This function scrapes results from the upstream engine and puts all the scraped results like
/// title, visiting_url (href in html),engine (from which engine it was fetched from) and description
/// in a RawSearchResult and then adds that to HashMap whose keys are url and values are RawSearchResult
/// struct and then returns it within a Result enum.
///
/// # Arguments
///
/// * `query` - Takes the user provided query to query to the upstream search engine with.
/// * `page` - Takes an u32 as an argument.
/// * `user_agent` - Takes a random user agent string as an argument.
/// * `request_timeout` - Takes a time (secs) as a value which controls the server request timeout.
///
/// # Errors
///
/// Returns an `EngineErrorKind` if the user is not connected to the internet or if their is failure to
/// reach the above `upstream search engine` page or if the `upstream search engine` is unable to
/// provide results for the requested search query and also returns error if the scraping selector
/// or HeaderMap fails to initialize.
async fn results(
&self,
query: &str,
page: u32,
user_agent: &str,
request_timeout: u8,
safe_search: u8,
) -> Result<HashMap<String, SearchResult>, EngineError>;
}
/// A named struct which stores the engine struct with the name of the associated engine.
pub struct EngineHandler {
/// It stores the engine struct wrapped in a box smart pointer as the engine struct implements
/// the `SearchEngine` trait.
engine: Box<dyn SearchEngine>,
/// It stores the name of the engine to which the struct is associated to.
name: &'static str,
}
impl Clone for EngineHandler {
fn clone(&self) -> Self {
Self::new(self.name).unwrap()
}
}
impl EngineHandler {
/// Parses an engine name into an engine handler.
///
/// # Arguments
///
/// * `engine_name` - It takes the name of the engine to which the struct was associated to.
///
/// # Returns
///
/// It returns an option either containing the value or a none if the engine is unknown
pub fn new(engine_name: &str) -> Option<Self> {
let engine: (&'static str, Box<dyn SearchEngine>) =
match engine_name.to_lowercase().as_str() {
"duckduckgo" => (
"duckduckgo",
Box::new(crate::engines::duckduckgo::DuckDuckGo),
),
"searx" => ("searx", Box::new(crate::engines::searx::Searx)),
_ => return None,
};
Some(Self {
engine: engine.1,
name: engine.0,
})
}
/// This function converts the EngineHandler type into a tuple containing the engine name and
/// the associated engine struct.
pub fn into_name_engine(self) -> (&'static str, Box<dyn SearchEngine>) {
(self.name, self.engine)
}
}

8
src/models/mod.rs Normal file
View File

@ -0,0 +1,8 @@
//! This module provides modules which in turn provides various models for aggregrating search
//! results, parsing config file, providing trait to standardize search engine handling code,
//! custom engine error for the search engine, etc.
pub mod aggregation_models;
pub mod engine_models;
pub mod parser_models;
pub mod server_models;

View File

@ -12,15 +12,12 @@ use serde::{Deserialize, Serialize};
/// order to allow the deserializing the json back to struct in aggregate function in
/// aggregator.rs and create a new struct out of it and then serialize it back to json and pass
/// it to the template files.
///
/// # Fields
//
/// * `theme` - It stores the parsed theme option used to set a theme for the website.
/// * `colorscheme` - It stores the parsed colorscheme option used to set a colorscheme for the
/// theme being used.
#[derive(Serialize, Deserialize, Clone, Default)]
pub struct Style {
/// It stores the parsed theme option used to set a theme for the website.
pub theme: String,
/// It stores the parsed colorscheme option used to set a colorscheme for the
/// theme being used.
pub colorscheme: String,
}
@ -36,3 +33,20 @@ impl Style {
Style { theme, colorscheme }
}
}
/// Configuration options for the aggregator.
#[derive(Clone)]
pub struct AggregatorConfig {
/// It stores the option to whether enable or disable random delays between
/// requests.
pub random_delay: bool,
}
/// Configuration options for the rate limiter middleware.
#[derive(Clone)]
pub struct RateLimiter {
/// The number of request that are allowed within a provided time limit.
pub number_of_requests: u8,
/// The time limit in which the quantity of requests that should be accepted.
pub time_limit: u8,
}

View File

@ -0,0 +1,26 @@
//! This module provides the models to parse cookies and search parameters from the search
//! engine website.
use serde::Deserialize;
/// A named struct which deserializes all the user provided search parameters and stores them.
#[derive(Deserialize)]
pub struct SearchParams {
/// It stores the search parameter option `q` (or query in simple words)
/// of the search url.
pub q: Option<String>,
/// It stores the search parameter `page` (or pageno in simple words)
/// of the search url.
pub page: Option<u32>,
}
/// A named struct which is used to deserialize the cookies fetched from the client side.
#[allow(dead_code)]
#[derive(Deserialize)]
pub struct Cookie {
/// It stores the theme name used in the website.
pub theme: String,
/// It stores the colorscheme name used for the website theme.
pub colorscheme: String,
/// It stores the user selected upstream search engines selected from the UI.
pub engines: Vec<String>,
}

View File

@ -1,27 +1,23 @@
//! This module provides the functionality to scrape and gathers all the results from the upstream
//! search engines and then removes duplicate results.
use super::user_agent::random_user_agent;
use crate::handler::paths::{file_path, FileType};
use crate::models::{
aggregation_models::{EngineErrorInfo, SearchResult, SearchResults},
engine_models::{EngineError, EngineHandler},
};
use error_stack::Report;
use rand::Rng;
use regex::Regex;
use std::{
collections::HashMap,
io::{BufReader, Read},
time::Duration,
};
use super::{
aggregation_models::{EngineErrorInfo, SearchResult, SearchResults},
user_agent::random_user_agent,
};
use error_stack::Report;
use rand::Rng;
use regex::Regex;
use std::{fs::File, io::BufRead};
use tokio::task::JoinHandle;
use crate::{
engines::engine_models::{EngineError, EngineHandler},
handler::paths::{file_path, FileType},
};
/// Aliases for long type annotations
type FutureVec = Vec<JoinHandle<Result<HashMap<String, SearchResult>, Report<EngineError>>>>;

View File

@ -1,3 +1,6 @@
pub mod aggregation_models;
//! This module provides modules that handle the functionality to aggregate the fetched search
//! results from the upstream search engines and filters it if safe search is set to 3 or 4. Also,
//! provides various models to aggregate search results into a standardized form.
pub mod aggregator;
pub mod user_agent;

View File

@ -4,6 +4,8 @@ use std::sync::OnceLock;
use fake_useragent::{Browsers, UserAgents, UserAgentsBuilder};
/// A static variable which stores the initially build `UserAgents` struct. So as it can be resused
/// again and again without the need of reinitializing the `UserAgents` struct.
static USER_AGENTS: OnceLock<UserAgents> = OnceLock::new();
/// A function to generate random user agent to improve privacy of the user.

View File

@ -1 +1,7 @@
//! This module provides modules that handle the functionality of handling different routes/paths
//! for the `websurfx` search engine website. Also it handles the parsing of search parameters in
//! the search route. Also, caches the next, current and previous search results in the search
//! routes with the help of the redis server.
pub mod router;
pub mod routes;

64
src/server/router.rs Normal file
View File

@ -0,0 +1,64 @@
//! This module provides the functionality to handle different routes of the `websurfx`
//! meta search engine website and provide appropriate response to each route/page
//! when requested.
use crate::{
config::parser::Config,
handler::paths::{file_path, FileType},
};
use actix_web::{get, web, HttpRequest, HttpResponse};
use handlebars::Handlebars;
use std::fs::read_to_string;
/// Handles the route of index page or main page of the `websurfx` meta search engine website.
#[get("/")]
pub async fn index(
hbs: web::Data<Handlebars<'_>>,
config: web::Data<Config>,
) -> Result<HttpResponse, Box<dyn std::error::Error>> {
let page_content: String = hbs.render("index", &config.style).unwrap();
Ok(HttpResponse::Ok().body(page_content))
}
/// Handles the route of any other accessed route/page which is not provided by the
/// website essentially the 404 error page.
pub async fn not_found(
hbs: web::Data<Handlebars<'_>>,
config: web::Data<Config>,
) -> Result<HttpResponse, Box<dyn std::error::Error>> {
let page_content: String = hbs.render("404", &config.style)?;
Ok(HttpResponse::Ok()
.content_type("text/html; charset=utf-8")
.body(page_content))
}
/// Handles the route of robots.txt page of the `websurfx` meta search engine website.
#[get("/robots.txt")]
pub async fn robots_data(_req: HttpRequest) -> Result<HttpResponse, Box<dyn std::error::Error>> {
let page_content: String =
read_to_string(format!("{}/robots.txt", file_path(FileType::Theme)?))?;
Ok(HttpResponse::Ok()
.content_type("text/plain; charset=ascii")
.body(page_content))
}
/// Handles the route of about page of the `websurfx` meta search engine website.
#[get("/about")]
pub async fn about(
hbs: web::Data<Handlebars<'_>>,
config: web::Data<Config>,
) -> Result<HttpResponse, Box<dyn std::error::Error>> {
let page_content: String = hbs.render("about", &config.style)?;
Ok(HttpResponse::Ok().body(page_content))
}
/// Handles the route of settings page of the `websurfx` meta search engine website.
#[get("/settings")]
pub async fn settings(
hbs: web::Data<Handlebars<'_>>,
config: web::Data<Config>,
) -> Result<HttpResponse, Box<dyn std::error::Error>> {
let page_content: String = hbs.render("settings", &config.style)?;
Ok(HttpResponse::Ok().body(page_content))
}

3
src/server/routes/mod.rs Normal file
View File

@ -0,0 +1,3 @@
//! This module provides modules to handle various routes in the search engine website.
pub mod search;

View File

@ -1,23 +1,20 @@
//! This module provides the functionality to handle different routes of the `websurfx`
//! meta search engine website and provide appropriate response to each route/page
//! when requested.
use std::{
fs::{read_to_string, File},
io::{BufRead, BufReader, Read},
};
//! This module handles the search route of the search engine website.
use crate::{
cache::cacher::RedisCache,
config::parser::Config,
engines::engine_models::EngineHandler,
handler::paths::{file_path, FileType},
results::{aggregation_models::SearchResults, aggregator::aggregate},
models::{aggregation_models::SearchResults, engine_models::EngineHandler},
results::aggregator::aggregate,
};
use actix_web::{get, web, HttpRequest, HttpResponse};
use handlebars::Handlebars;
use regex::Regex;
use serde::Deserialize;
use std::{
fs::{read_to_string, File},
io::{BufRead, BufReader, Read},
};
use tokio::join;
// ---- Constants ----
@ -25,17 +22,16 @@ use tokio::join;
static REDIS_CACHE: async_once_cell::OnceCell<RedisCache> = async_once_cell::OnceCell::new();
/// A named struct which deserializes all the user provided search parameters and stores them.
///
/// # Fields
///
/// * `q` - It stores the search parameter option `q` (or query in simple words)
/// of the search url.
/// * `page` - It stores the search parameter `page` (or pageno in simple words)
/// of the search url.
#[derive(Deserialize)]
struct SearchParams {
pub struct SearchParams {
/// It stores the search parameter option `q` (or query in simple words)
/// of the search url.
q: Option<String>,
/// It stores the search parameter `page` (or pageno in simple words)
/// of the search url.
page: Option<u32>,
/// It stores the search parameter `safesearch` (or safe search level in simple words) of the
/// search url.
safesearch: Option<u8>,
}
@ -63,17 +59,14 @@ pub async fn not_found(
}
/// A named struct which is used to deserialize the cookies fetched from the client side.
///
/// # Fields
///
/// * `theme` - It stores the theme name used in the website.
/// * `colorscheme` - It stores the colorscheme name used for the website theme.
/// * `engines` - It stores the user selected upstream search engines selected from the UI.
#[allow(dead_code)]
#[derive(Deserialize)]
struct Cookie<'a> {
/// It stores the theme name used in the website.
theme: &'a str,
/// It stores the colorscheme name used for the website theme.
colorscheme: &'a str,
/// It stores the user selected upstream search engines selected from the UI.
engines: Vec<&'a str>,
}
@ -174,8 +167,21 @@ pub async fn search(
}
}
/// Fetches the results for a query and page.
/// First checks the redis cache, if that fails it gets proper results
/// Fetches the results for a query and page. It First checks the redis cache, if that
/// fails it gets proper results by requesting from the upstream search engines.
///
/// # Arguments
///
/// * `url` - It takes the url of the current page that requested the search results for a
/// particular search query.
/// * `config` - It takes a parsed config struct.
/// * `query` - It takes the page number as u32 value.
/// * `req` - It takes the `HttpRequest` struct as a value.
///
/// # Error
///
/// It returns the `SearchResults` struct if the search results could be successfully fetched from
/// the cache or from the upstream search engines otherwise it returns an appropriate error.
async fn results(
url: String,
config: &Config,
@ -184,6 +190,7 @@ async fn results(
req: HttpRequest,
safe_search: u8,
) -> Result<SearchResults, Box<dyn std::error::Error>> {
// Initialize redis cache connection struct
let mut redis_cache: RedisCache = REDIS_CACHE
.get_or_init(async {
// Initialize redis cache connection pool only one and store it in the heap.
@ -191,7 +198,6 @@ async fn results(
})
.await
.clone();
// fetch the cached results json.
let cached_results_json: Result<String, error_stack::Report<crate::cache::error::PoolError>> =
redis_cache.clone().cached_json(&url).await;
@ -223,7 +229,8 @@ async fn results(
// UI and use that.
let mut results: SearchResults = match req.cookie("appCookie") {
Some(cookie_value) => {
let cookie_value: Cookie = serde_json::from_str(cookie_value.name_value().1)?;
let cookie_value: Cookie<'_> =
serde_json::from_str(cookie_value.name_value().1)?;
let engines: Vec<EngineHandler> = cookie_value
.engines
@ -267,6 +274,8 @@ async fn results(
}
}
/// A helper function which checks whether the search query contains any keywords which should be
/// disallowed/allowed based on the regex based rules present in the blocklist and allowlist files.
fn is_match_from_filter_list(
file_path: &str,
query: &str,

View File

@ -10,6 +10,10 @@ production_use = false -- whether to use production mode or not (in other words
-- if production_use is set to true
-- There will be a random delay before sending the request to the search engines, this is to prevent DDoSing the upstream search engines from a large number of simultaneous requests.
request_timeout = 30 -- timeout for the search requests sent to the upstream search engines to be fetched (value in seconds).
rate_limiter = {
number_of_requests = 20, -- The number of request that are allowed within a provided time limit.
time_limit = 3, -- The time limit in which the quantity of requests that should be accepted.
}
-- ### Search ###
-- Filter results based on different levels. The levels provided are:
@ -45,4 +49,7 @@ theme = "simple" -- the theme name which should be used for the website
redis_url = "redis://127.0.0.1:8082" -- redis connection url address on which the client should connect on.
-- ### Search Engines ###
upstream_search_engines = { DuckDuckGo = true, Searx = false } -- select the upstream search engines from which the results should be fetched.
upstream_search_engines = {
DuckDuckGo = true,
Searx = false,
} -- select the upstream search engines from which the results should be fetched.