
Merge branch 'rolling' into change-document-style-with-linter-warnings

neon_arch 2023-09-12 17:49:46 +03:00 committed by GitHub
commit fb231de416
26 changed files with 1116 additions and 486 deletions

.gitignore

@@ -4,3 +4,4 @@ package-lock.json
dump.rdb
.vscode
megalinter-reports/
dhat-heap.json

Cargo.lock
(File diff suppressed because it is too large)

Cargo.toml

@@ -8,9 +8,9 @@ license = "AGPL-3.0"
[dependencies]
reqwest = {version="0.11.20",features=["json"]}
tokio = {version="1.32.0",features=["full"]}
tokio = {version="1.32.0",features=["rt-multi-thread","macros"]}
serde = {version="1.0.188",features=["derive"]}
handlebars = { version = "4.3.7", features = ["dir_source"] }
handlebars = { version = "4.4.0", features = ["dir_source"] }
scraper = {version="0.17.1"}
actix-web = {version="4.4.0", features = ["cookies"]}
actix-files = {version="0.6.2"}
@@ -19,14 +19,20 @@ serde_json = {version="1.0.105"}
fake-useragent = {version="0.1.3"}
env_logger = {version="0.10.0"}
log = {version="0.4.20"}
rlua = {version="0.19.7"}
redis = {version="0.23.2"}
mlua = {version="0.8.10", features=["luajit"]}
redis = {version="0.23.3", features=["tokio-comp","connection-manager"]}
md5 = {version="0.7.0"}
rand={version="0.8.5"}
once_cell = {version="1.18.0"}
error-stack = {version="0.4.0"}
async-trait = {version="0.1.73"}
regex = {version="1.9.4", features=["perf"]}
smallvec = {version="1.11.0", features=["union", "serde"]}
futures = {version="0.3.28"}
dhat = {version="0.3.2", optional = true}
mimalloc = { version = "0.1.38", default-features = false }
async-once-cell = {version="0.5.3"}
actix-governor = {version="0.4.1"}
[dev-dependencies]
rusty-hook = "^0.11.2"
@@ -47,13 +53,17 @@ rpath = false
[profile.release]
opt-level = 3
debug = false
debug = false # This should only be commented when testing with dhat profiler
# debug = 1 # This should only be uncommented when testing with dhat profiler
split-debuginfo = '...'
debug-assertions = false
overflow-checks = false
lto = 'thin'
lto = true
panic = 'abort'
incremental = false
codegen-units = 16
codegen-units = 1
rpath = false
strip = "debuginfo"
[features]
dhat-heap = ["dep:dhat"]

Dockerfile

@@ -19,7 +19,7 @@ COPY . .
RUN cargo install --path .
# We do not need the Rust toolchain to run the binary!
FROM gcr.io/distroless/cc-debian11
FROM gcr.io/distroless/cc-debian12
COPY --from=builder /app/public/ /opt/websurfx/public/
COPY --from=builder /app/websurfx/config.lua /etc/xdg/websurfx/config.lua
COPY --from=builder /usr/local/cargo/bin/* /usr/local/bin/

README.md

@@ -5,7 +5,7 @@
<b align="center"><a href="README.md">Readme</a></b> |
<b><a href="https://discord.gg/SWnda7Mw5u">Discord</a></b> |
<b><a href="https://github.com/neon-mmd/websurfx">GitHub</a></b> |
<b><a href="./docs/README.md">Documentation</a></b>
<b><a href="../../tree/HEAD/docs/">Documentation</a></b>
<br /><br />
<a href="#">
<img
@@ -51,7 +51,7 @@
- **Getting Started**
- [🔭 Preview](#preview-)
- [🚀 Features](#features-)
- [🛠️ Installation and Testing](#installation-and-testing-)
- [🛠️ Installation and Testing](#installation-and-testing-%EF%B8%8F)
- [🔧 Configuration](#configuration-)
- **Feature Overview**
- [🎨 Theming](#theming-)

docs/configuration.md

@@ -109,7 +109,7 @@ colorscheme = "catppuccin-mocha" -- the colorscheme name which should be used fo
theme = "simple" -- the theme name which should be used for the website
-- ### Caching ###
redis_connection_url = "redis://redis:6379" -- redis connection url address on which the client should connect on.
redis_url = "redis://redis:6379" -- redis connection url address on which the client should connect on.
-- ### Search Engines ###
upstream_search_engines = { DuckDuckGo = true, Searx = false } -- select the upstream search engines from which the results should be fetched.

public/images/barricade.png (new binary file, 892 KiB)

public/images/filter.png (new binary file, 102 KiB)

public/static/themes/simple.css

@@ -132,6 +132,35 @@ body {
width: 1.2rem;
height: 1.2rem;
}
.results .result_disallowed,
.results .result_filtered {
display: flex;
justify-content: center;
align-items: center;
gap: 10rem;
font-size: 2rem;
color: var(--foreground-color);
margin: 0rem 7rem;
}
.results .result_disallowed .user_query,
.results .result_filtered .user_query {
color: var(--background-color);
font-weight: 300;
}
.results .result_disallowed img,
.results .result_filtered img {
width: 30rem;
}
.results .result_disallowed div,
.results .result_filtered div {
display: flex;
flex-direction: column;
gap: 1rem;
line-break: strict;
}
/* styles for the footer and header */

public/templates/search.html

@@ -1,37 +1,69 @@
{{>header this.style}}
<main class="results">
{{>search_bar this}}
<div class="results_aggregated">
{{#if results}} {{#each results}}
<div class="result">
<h1><a href="{{{this.url}}}">{{{this.title}}}</a></h1>
<small>{{{this.url}}}</small>
<p>{{{this.description}}}</p>
<div class="upstream_engines">
{{#each engine}}
<span>{{{this}}}</span>
{{/each}}
</div>
{{>search_bar this}}
<div class="results_aggregated">
{{#if results}} {{#each results}}
<div class="result">
<h1><a href="{{{this.url}}}">{{{this.title}}}</a></h1>
<small>{{{this.url}}}</small>
<p>{{{this.description}}}</p>
<div class="upstream_engines">
{{#each engine}}
<span>{{{this}}}</span>
{{/each}}
</div>
</div>
{{/each}} {{else}} {{#if disallowed}}
<div class="result_disallowed">
<div class="description">
<p>
Your search - <span class="user_query">{{{this.pageQuery}}}</span> -
has been disallowed.
</p>
<p class="description_paragraph">Dear user,</p>
<p class="description_paragraph">
The query - <span class="user_query">{{{this.pageQuery}}}</span> - has
been blacklisted via the server configuration and is therefore
disallowed by the server, so no results can be displayed for your query.
</p>
</div>
<img src="./images/barricade.png" alt="Image of a Barricade" />
</div>
{{else}} {{#if filtered}}
<div class="result_filtered">
<div class="description">
<p>
Your search - <span class="user_query">{{{this.pageQuery}}}</span> -
has been filtered.
</p>
<p class="description_paragraph">Dear user,</p>
<p class="description_paragraph">
Every search result matched content that the server configuration is
set to filter out, so all of the results have been completely
filtered out.
</p>
</div>
<img src="./images/filter.png" alt="Image of a paper inside a funnel" />
</div>
{{else}}
<div class="result_not_found">
<p>Your search - {{{this.pageQuery}}} - did not match any documents.</p>
<p class="suggestions">Suggestions:</p>
<ul>
<li>Make sure that all words are spelled correctly.</li>
<li>Try different keywords.</li>
<li>Try more general keywords.</li>
</ul>
<img src="./images/no_results.gif" alt="Man fishing gif" />
</div>
{{/if}} {{/if}} {{/if}}
</div>
{{/each}} {{else}}
<div class="result_not_found">
<p>Your search - {{{this.pageQuery}}} - did not match any documents.</p>
<p class="suggestions">Suggestions:</p>
<ul>
<li>Make sure that all words are spelled correctly.</li>
<li>Try different keywords.</li>
<li>Try more general keywords.</li>
</ul>
<img src="./images/no_results.gif" alt="Man fishing gif" />
<div class="page_navigation">
<button type="button" onclick="navigate_backward()">
&#8592; previous
</button>
<button type="button" onclick="navigate_forward()">next &#8594;</button>
</div>
{{/if}}
</div>
<div class="page_navigation">
<button type="button" onclick="navigate_backward()">
&#8592; previous
</button>
<button type="button" onclick="navigate_forward()">next &#8594;</button>
</div>
</main>
<script src="static/index.js"></script>
<script src="static/pagination.js"></script>

src/bin/websurfx.rs

@@ -3,9 +3,19 @@
//! This module contains the main function which handles the logging of the application to the
//! stdout and handles the command line arguments provided and launches the `websurfx` server.
use mimalloc::MiMalloc;
use std::net::TcpListener;
use websurfx::{config::parser::Config, run};
/// A dhat heap memory profiler
#[cfg(feature = "dhat-heap")]
#[global_allocator]
static ALLOC: dhat::Alloc = dhat::Alloc;
#[cfg(not(feature = "dhat-heap"))]
#[global_allocator]
static GLOBAL: MiMalloc = MiMalloc;
/// The function that launches the main server and registers all the routes of the website.
///
/// # Error
@@ -14,6 +24,10 @@ use websurfx::{config::parser::Config, run};
/// available for being used for other applications.
#[actix_web::main]
async fn main() -> std::io::Result<()> {
// A dhat heap profiler initialization.
#[cfg(feature = "dhat-heap")]
let _profiler = dhat::Profiler::new_heap();
// Initialize the parsed config file.
let config = Config::parse(false).unwrap();
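When the optional feature is enabled (e.g. a build along the lines of `cargo run --features dhat-heap`), the `#[cfg]` attributes above swap MiMalloc out for dhat's instrumented allocator, and the profiler presumably writes its heap report (the `dhat-heap.json` now ignored in .gitignore) when it is dropped on exit; without the feature, MiMalloc stays the global allocator.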

src/cache/cacher.rs

@@ -1,14 +1,24 @@
//! This module provides the functionality to cache the aggregated results fetched and aggregated
//! from the upstream search engines in a json format.
use error_stack::Report;
use futures::future::try_join_all;
use md5::compute;
use redis::{Client, Commands, Connection};
use redis::{aio::ConnectionManager, AsyncCommands, Client, RedisError};
use super::error::PoolError;
/// A named struct which stores the redis Connection url address to which the client will
/// connect.
#[derive(Clone)]
pub struct RedisCache {
/// It stores the redis Connection url address.
connection: Connection,
/// It stores a pool of connections ready to be used.
connection_pool: Vec<ConnectionManager>,
/// It stores the size of the connection pool (in other words the number of
/// connections that should be stored in the pool).
pool_size: u8,
/// It stores the index of which connection is being used at the moment.
current_connection: u8,
}
impl RedisCache {
@@ -16,11 +26,25 @@ impl RedisCache {
///
/// # Arguments
///
/// * `redis_connection_url` - It stores the redis Connection url address.
pub fn new(redis_connection_url: String) -> Result<Self, Box<dyn std::error::Error>> {
/// * `redis_connection_url` - It takes the redis Connection url address.
/// * `pool_size` - It takes the size of the connection pool (in other words the number of
/// connections that should be stored in the pool).
pub async fn new(
redis_connection_url: &str,
pool_size: u8,
) -> Result<Self, Box<dyn std::error::Error>> {
let client = Client::open(redis_connection_url)?;
let connection = client.get_connection()?;
let redis_cache = RedisCache { connection };
let mut tasks: Vec<_> = Vec::new();
for _ in 0..pool_size {
tasks.push(client.get_tokio_connection_manager());
}
let redis_cache = RedisCache {
connection_pool: try_join_all(tasks).await?,
pool_size,
current_connection: Default::default(),
};
Ok(redis_cache)
}
@@ -29,7 +53,7 @@ impl RedisCache {
/// # Arguments
///
/// * `url` - It takes an url as string.
fn hash_url(url: &str) -> String {
fn hash_url(&self, url: &str) -> String {
format!("{:?}", compute(url))
}
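For illustration, the cache key is simply the hex digest of the URL; a minimal sketch of what `hash_url` produces (the URL is illustrative):

// `md5::compute` returns a Digest whose Debug form is the 32-character hex string
let key = format!("{:?}", md5::compute("https://localhost:8080/search?q=rust&page=1"));
assert_eq!(key.len(), 32);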
@@ -38,9 +62,42 @@ impl RedisCache {
/// # Arguments
///
/// * `url` - It takes an url as a string.
pub fn cached_json(&mut self, url: &str) -> Result<String, Box<dyn std::error::Error>> {
let hashed_url_string = Self::hash_url(url);
Ok(self.connection.get(hashed_url_string)?)
pub async fn cached_json(&mut self, url: &str) -> Result<String, Report<PoolError>> {
self.current_connection = Default::default();
let hashed_url_string: &str = &self.hash_url(url);
let mut result: Result<String, RedisError> = self.connection_pool
[self.current_connection as usize]
.get(hashed_url_string)
.await;
// Check whether the current connection failed with a connection-dropped error. If it
// did, the next connection from the pool is used to re-run the redis command, and that
// connection is checked in the same way. As soon as a connection succeeds, its result
// is returned; if every connection in the pool fails with a connection-dropped error,
// a custom pool error is returned.
loop {
match result {
Err(error) => match error.is_connection_dropped() {
true => {
self.current_connection += 1;
if self.current_connection == self.pool_size {
return Err(Report::new(
PoolError::PoolExhaustionWithConnectionDropError,
));
}
result = self.connection_pool[self.current_connection as usize]
.get(hashed_url_string)
.await;
continue;
}
false => return Err(Report::new(PoolError::RedisError(error))),
},
Ok(res) => return Ok(res),
}
}
}
/// A function which caches the results by using the hashed `url` as the key and
@@ -51,21 +108,45 @@ impl RedisCache {
///
/// * `json_results` - It takes the json results string as an argument.
/// * `url` - It takes the url as a String.
pub fn cache_results(
pub async fn cache_results(
&mut self,
json_results: String,
json_results: &str,
url: &str,
) -> Result<(), Box<dyn std::error::Error>> {
let hashed_url_string = Self::hash_url(url);
) -> Result<(), Report<PoolError>> {
self.current_connection = Default::default();
let hashed_url_string: &str = &self.hash_url(url);
// put results_json into cache
self.connection.set(&hashed_url_string, json_results)?;
let mut result: Result<(), RedisError> = self.connection_pool
[self.current_connection as usize]
.set_ex(hashed_url_string, json_results, 60)
.await;
// Set the TTL for the key to 60 seconds
self.connection
.expire::<String, u32>(hashed_url_string, 60)
.unwrap();
Ok(())
// Check whether the current connection failed with a connection-dropped error. If it
// did, the next connection from the pool is used to re-run the redis command, and that
// connection is checked in the same way. As soon as a connection succeeds, its result
// is returned; if every connection in the pool fails with a connection-dropped error,
// a custom pool error is returned.
loop {
match result {
Err(error) => match error.is_connection_dropped() {
true => {
self.current_connection += 1;
if self.current_connection == self.pool_size {
return Err(Report::new(
PoolError::PoolExhaustionWithConnectionDropError,
));
}
result = self.connection_pool[self.current_connection as usize]
.set_ex(hashed_url_string, json_results, 60)
.await;
continue;
}
false => return Err(Report::new(PoolError::RedisError(error))),
},
Ok(_) => return Ok(()),
}
}
}
}
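A minimal usage sketch of the pooled API, assuming an async context, a reachable Redis server, and an illustrative URL and pool size:

let mut cache = RedisCache::new("redis://127.0.0.1:6379", 5).await?;
let url = "https://localhost:8080/search?q=rust&page=1";
cache.cache_results("{\"results\":[]}", url).await?; // stored with a 60 second TTL via `set_ex`
let cached: String = cache.cached_json(url).await?;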

src/cache/error.rs (new file)

@@ -0,0 +1,40 @@
//! This module provides the error enum to handle different errors associated while requesting data from
//! the redis server using an async connection pool.
use std::fmt;
use redis::RedisError;
/// A custom error type used for handling redis async pool associated errors.
///
/// This enum provides variants for two different categories of errors:
/// * `RedisError` - This variant handles all errors related to `RedisError`,
/// * `PoolExhaustionWithConnectionDropError` - This variant handles the error
/// which occurs when all the connections in the connection pool return a connection
/// dropped redis error.
#[derive(Debug)]
pub enum PoolError {
RedisError(RedisError),
PoolExhaustionWithConnectionDropError,
}
impl fmt::Display for PoolError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
PoolError::RedisError(redis_error) => {
if let Some(detail) = redis_error.detail() {
write!(f, "{}", detail)
} else {
write!(f, "")
}
}
PoolError::PoolExhaustionWithConnectionDropError => {
write!(
f,
"Error all connections from the pool dropped with connection error"
)
}
}
}
}
impl error_stack::Context for PoolError {}
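A rough sketch of how a variant surfaces through `error-stack` (the attached message is illustrative):

use error_stack::Report;

let report: Report<PoolError> = Report::new(PoolError::PoolExhaustionWithConnectionDropError)
    .attach_printable("all 5 pooled connections dropped"); // hypothetical context message
log::error!("{:?}", report);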

src/cache/mod.rs

@@ -2,3 +2,4 @@
//! results fetched and aggregated from the upstream search engines in a json format.
pub mod cacher;
pub mod error;

src/config/parser.rs

@@ -3,9 +3,9 @@
use crate::handler::paths::{file_path, FileType};
use super::parser_models::Style;
use super::parser_models::{AggregatorConfig, RateLimiter, Style};
use log::LevelFilter;
use rlua::Lua;
use mlua::Lua;
use std::{collections::HashMap, fs, thread::available_parallelism};
/// A named struct which stores the parsed config file options.
@@ -32,14 +32,11 @@ pub struct Config {
pub request_timeout: u8,
/// It stores the number of threads which the app will use to run.
pub threads: u8,
}
/// Configuration options for the aggregator.
#[derive(Clone)]
pub struct AggregatorConfig {
/// It stores the option of whether to enable or disable random delays between
/// requests.
pub random_delay: bool,
/// It stores configuration options for the ratelimiting middleware.
pub rate_limiter: RateLimiter,
/// It stores the level of safe search to be used for restricting content in the
/// search results.
pub safe_search: u8,
}
impl Config {
@@ -57,53 +54,70 @@ impl Config {
/// or an io error if the config.lua file doesn't exist, otherwise it returns a newly constructed
/// Config struct with all the parsed config options from the parsed config file.
pub fn parse(logging_initialized: bool) -> Result<Self, Box<dyn std::error::Error>> {
Lua::new().context(|context| -> Result<Self, Box<dyn std::error::Error>> {
let globals = context.globals();
let lua = Lua::new();
let globals = lua.globals();
context
.load(&fs::read_to_string(file_path(FileType::Config)?)?)
.exec()?;
lua.load(&fs::read_to_string(file_path(FileType::Config)?)?)
.exec()?;
let parsed_threads: u8 = globals.get::<_, u8>("threads")?;
let parsed_threads: u8 = globals.get::<_, u8>("threads")?;
let debug: bool = globals.get::<_, bool>("debug")?;
let logging:bool= globals.get::<_, bool>("logging")?;
let debug: bool = globals.get::<_, bool>("debug")?;
let logging: bool = globals.get::<_, bool>("logging")?;
if !logging_initialized {
set_logging_level(debug, logging);
if !logging_initialized {
set_logging_level(debug, logging);
}
let threads: u8 = if parsed_threads == 0 {
let total_num_of_threads: usize = available_parallelism()?.get() / 2;
log::error!(
"Config Error: The value of `threads` option should be a non zero positive integer"
);
log::error!("Falling back to using {} threads", total_num_of_threads);
total_num_of_threads as u8
} else {
parsed_threads
};
let rate_limiter = globals.get::<_, HashMap<String, u8>>("rate_limiter")?;
let parsed_safe_search: u8 = globals.get::<_, u8>("safe_search")?;
let safe_search: u8 = match parsed_safe_search {
0..=4 => parsed_safe_search,
_ => {
log::error!("Config Error: The value of `safe_search` option should be a non zero positive integer from 0 to 4.");
log::error!("Falling back to using the value `1` for the option");
1
}
};
let threads: u8 = if parsed_threads == 0 {
let total_num_of_threads: usize = available_parallelism()?.get() / 2;
log::error!("Config Error: The value of `threads` option should be a non zero positive integer");
log::error!("Falling back to using {} threads", total_num_of_threads);
total_num_of_threads as u8
} else {
parsed_threads
};
Ok(Config {
port: globals.get::<_, u16>("port")?,
binding_ip: globals.get::<_, String>("binding_ip")?,
style: Style::new(
globals.get::<_, String>("theme")?,
globals.get::<_, String>("colorscheme")?,
),
redis_url: globals.get::<_, String>("redis_url")?,
aggregator: AggregatorConfig {
random_delay: globals.get::<_, bool>("production_use")?,
},
logging,
debug,
upstream_search_engines: globals
.get::<_, HashMap<String, bool>>("upstream_search_engines")?
.into_iter()
.filter_map(|(key, value)| value.then_some(key))
.filter_map(|engine| crate::engines::engine_models::EngineHandler::new(&engine))
.collect(),
request_timeout: globals.get::<_, u8>("request_timeout")?,
threads,
})
Ok(Config {
port: globals.get::<_, u16>("port")?,
binding_ip: globals.get::<_, String>("binding_ip")?,
style: Style::new(
globals.get::<_, String>("theme")?,
globals.get::<_, String>("colorscheme")?,
),
redis_url: globals.get::<_, String>("redis_url")?,
aggregator: AggregatorConfig {
random_delay: globals.get::<_, bool>("production_use")?,
},
logging,
debug,
upstream_search_engines: globals
.get::<_, HashMap<String, bool>>("upstream_search_engines")?
.into_iter()
.filter_map(|(key, value)| value.then_some(key))
.filter_map(|engine| crate::engines::engine_models::EngineHandler::new(&engine))
.collect(),
request_timeout: globals.get::<_, u8>("request_timeout")?,
threads,
rate_limiter: RateLimiter {
number_of_requests: rate_limiter["number_of_requests"],
time_limit: rate_limiter["time_limit"],
},
safe_search,
})
}
}
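A hedged sketch of consuming the newly parsed options (field names as defined in this diff):

let config = Config::parse(true)?; // `true` skips re-initializing the logger
println!(
    "rate limiter: {} requests per {} seconds, safe search level: {}",
    config.rate_limiter.number_of_requests, config.rate_limiter.time_limit, config.safe_search
);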

src/config/parser_models.rs

@@ -12,7 +12,7 @@ use serde::{Deserialize, Serialize};
/// order to allow deserializing the json back into a struct in the aggregate function in
/// aggregator.rs and create a new struct out of it and then serialize it back to json and pass
/// it to the template files.
#[derive(Serialize, Deserialize, Clone)]
#[derive(Serialize, Deserialize, Clone, Default)]
pub struct Style {
/// It stores the parsed theme option used to set a theme for the website.
pub theme: String,
@@ -33,3 +33,20 @@ impl Style {
Style { theme, colorscheme }
}
}
/// Configuration options for the aggregator.
#[derive(Clone)]
pub struct AggregatorConfig {
/// It stores the option of whether to enable or disable random delays between
/// requests.
pub random_delay: bool,
}
/// Configuration options for the rate limiter middleware.
#[derive(Clone)]
pub struct RateLimiter {
/// The number of requests that are allowed within a provided time limit.
pub number_of_requests: u8,
/// The time limit (in seconds) within which that number of requests is accepted.
pub time_limit: u8,
}

src/engines/duckduckgo.rs

@@ -4,14 +4,14 @@
use std::collections::HashMap;
use reqwest::header::{HeaderMap, CONTENT_TYPE, COOKIE, REFERER, USER_AGENT};
use reqwest::header::HeaderMap;
use scraper::{Html, Selector};
use crate::results::aggregation_models::SearchResult;
use super::engine_models::{EngineError, SearchEngine};
use error_stack::{IntoReport, Report, Result, ResultExt};
use error_stack::{Report, Result, ResultExt};
/// A new DuckDuckGo engine type defined in order to implement the `SearchEngine` trait, which
/// reduces code duplication and makes it easy to create a vector of different search engines.
@@ -21,10 +21,11 @@ pub struct DuckDuckGo;
impl SearchEngine for DuckDuckGo {
async fn results(
&self,
query: String,
query: &str,
page: u32,
user_agent: String,
user_agent: &str,
request_timeout: u8,
_safe_search: u8,
) -> Result<HashMap<String, SearchResult>, EngineError> {
// Page number can be missing or empty string and so appropriate handling is required
// so that the upstream server receives a valid page number.
@@ -43,38 +44,19 @@ impl SearchEngine for DuckDuckGo {
};
// initializing HeaderMap and adding appropriate headers.
let mut header_map = HeaderMap::new();
header_map.insert(
USER_AGENT,
user_agent
.parse()
.into_report()
.change_context(EngineError::UnexpectedError)?,
);
header_map.insert(
REFERER,
"https://google.com/"
.parse()
.into_report()
.change_context(EngineError::UnexpectedError)?,
);
header_map.insert(
CONTENT_TYPE,
"application/x-www-form-urlencoded"
.parse()
.into_report()
.change_context(EngineError::UnexpectedError)?,
);
header_map.insert(
COOKIE,
"kl=wt-wt"
.parse()
.into_report()
.change_context(EngineError::UnexpectedError)?,
);
let header_map = HeaderMap::try_from(&HashMap::from([
("USER_AGENT".to_string(), user_agent.to_string()),
("REFERER".to_string(), "https://google.com/".to_string()),
(
"CONTENT_TYPE".to_string(),
"application/x-www-form-urlencoded".to_string(),
),
("COOKIE".to_string(), "kl=wt-wt".to_string()),
]))
.change_context(EngineError::UnexpectedError)?;
let document: Html = Html::parse_document(
&DuckDuckGo::fetch_html_from_upstream(self, url, header_map, request_timeout).await?,
&DuckDuckGo::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
);
let no_result: Selector = Selector::parse(".no-results")
@@ -108,8 +90,7 @@ impl SearchEngine for DuckDuckGo {
.next()
.unwrap()
.inner_html()
.trim()
.to_string(),
.trim(),
format!(
"https://{}",
result
@ -118,15 +99,15 @@ impl SearchEngine for DuckDuckGo {
.unwrap()
.inner_html()
.trim()
),
)
.as_str(),
result
.select(&result_desc)
.next()
.unwrap()
.inner_html()
.trim()
.to_string(),
vec!["duckduckgo".to_string()],
.trim(),
&["duckduckgo"],
)
})
.map(|search_result| (search_result.url.clone(), search_result))

src/engines/engine_models.rs

@@ -2,7 +2,7 @@
//! the upstream search engines with the search query provided by the user.
use crate::results::aggregation_models::SearchResult;
use error_stack::{IntoReport, Result, ResultExt};
use error_stack::{Result, ResultExt};
use std::{collections::HashMap, fmt, time::Duration};
/// A custom error type used for handle engine associated errors.
@@ -64,7 +64,7 @@ pub trait SearchEngine: Sync + Send {
/// otherwise it returns a custom `EngineError`.
async fn fetch_html_from_upstream(
&self,
url: String,
url: &str,
header_map: reqwest::header::HeaderMap,
request_timeout: u8,
) -> Result<String, EngineError> {
@@ -75,11 +75,9 @@ pub trait SearchEngine: Sync + Send {
.headers(header_map) // add spoofed headers to emulate human behavior
.send()
.await
.into_report()
.change_context(EngineError::RequestError)?
.text()
.await
.into_report()
.change_context(EngineError::RequestError)?)
}
@@ -103,10 +101,11 @@ pub trait SearchEngine: Sync + Send {
/// or HeaderMap fails to initialize.
async fn results(
&self,
query: String,
query: &str,
page: u32,
user_agent: String,
user_agent: &str,
request_timeout: u8,
safe_search: u8,
) -> Result<HashMap<String, SearchResult>, EngineError>;
}
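For illustration, a skeleton of what implementing the updated trait for a hypothetical engine might look like; the `Example` type and its URL are made up, the imports mirror those used by the real engines, and the trait is assumed to be wired through `async-trait` (which the Cargo.toml in this diff lists):

use std::collections::HashMap;

/// A hypothetical engine type, for illustration only.
pub struct Example;

#[async_trait::async_trait]
impl SearchEngine for Example {
    async fn results(
        &self,
        query: &str,
        page: u32,
        user_agent: &str,
        request_timeout: u8,
        _safe_search: u8,
    ) -> Result<HashMap<String, SearchResult>, EngineError> {
        let url = format!("https://search.example/search?q={query}&pageno={page}");
        let header_map = reqwest::header::HeaderMap::try_from(&HashMap::from([(
            "USER_AGENT".to_string(),
            user_agent.to_string(),
        )]))
        .change_context(EngineError::UnexpectedError)?;
        let _html = self
            .fetch_html_from_upstream(&url, header_map, request_timeout)
            .await?;
        // ... scrape `_html` into `SearchResult` values keyed by their url ...
        Ok(HashMap::new())
    }
}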

src/engines/searx.rs

@@ -2,14 +2,14 @@
//! by querying the upstream searx search engine instance with user provided query and with a page
//! number if provided.
use reqwest::header::{HeaderMap, CONTENT_TYPE, COOKIE, REFERER, USER_AGENT};
use reqwest::header::HeaderMap;
use scraper::{Html, Selector};
use std::collections::HashMap;
use crate::results::aggregation_models::SearchResult;
use super::engine_models::{EngineError, SearchEngine};
use error_stack::{IntoReport, Report, Result, ResultExt};
use error_stack::{Report, Result, ResultExt};
/// A new Searx engine type defined in order to implement the `SearchEngine` trait, which
/// reduces code duplication and makes it easy to create a vector of different search engines.
@@ -19,45 +19,38 @@ pub struct Searx;
impl SearchEngine for Searx {
async fn results(
&self,
query: String,
query: &str,
page: u32,
user_agent: String,
user_agent: &str,
request_timeout: u8,
mut safe_search: u8,
) -> Result<HashMap<String, SearchResult>, EngineError> {
// Page number can be missing or empty string and so appropriate handling is required
// so that the upstream server receives a valid page number.
if safe_search == 3 {
safe_search = 2;
};
let url: String = match page {
0 | 1 => format!("https://searx.work/search?q={query}&pageno=1"),
_ => format!("https://searx.work/search?q={query}&pageno={page}"),
0 | 1 => {
format!("https://searx.work/search?q={query}&pageno=1&safesearch={safe_search}")
}
_ => format!(
"https://searx.work/search?q={query}&pageno={page}&safesearch={safe_search}"
),
};
// initializing headers and adding appropriate headers.
let mut header_map = HeaderMap::new();
header_map.insert(
USER_AGENT,
user_agent
.parse()
.into_report()
.change_context(EngineError::UnexpectedError)?,
);
header_map.insert(
REFERER,
"https://google.com/"
.parse()
.into_report()
.change_context(EngineError::UnexpectedError)?,
);
header_map.insert(
CONTENT_TYPE,
"application/x-www-form-urlencoded"
.parse()
.into_report()
.change_context(EngineError::UnexpectedError)?,
);
header_map.insert(COOKIE, "categories=general; language=auto; locale=en; autocomplete=duckduckgo; image_proxy=1; method=POST; safesearch=2; theme=simple; results_on_new_tab=1; doi_resolver=oadoi.org; simple_style=auto; center_alignment=1; query_in_title=1; infinite_scroll=0; disabled_engines=; enabled_engines=\"archive is__general\\054yep__general\\054curlie__general\\054currency__general\\054ddg definitions__general\\054wikidata__general\\054duckduckgo__general\\054tineye__general\\054lingva__general\\054startpage__general\\054yahoo__general\\054wiby__general\\054marginalia__general\\054alexandria__general\\054wikibooks__general\\054wikiquote__general\\054wikisource__general\\054wikiversity__general\\054wikivoyage__general\\054dictzone__general\\054seznam__general\\054mojeek__general\\054naver__general\\054wikimini__general\\054brave__general\\054petalsearch__general\\054goo__general\"; disabled_plugins=; enabled_plugins=\"searx.plugins.hostname_replace\\054searx.plugins.oa_doi_rewrite\\054searx.plugins.vim_hotkeys\"; tokens=; maintab=on; enginetab=on".parse().into_report().change_context(EngineError::UnexpectedError)?);
let header_map = HeaderMap::try_from(&HashMap::from([
("USER_AGENT".to_string(), user_agent.to_string()),
("REFERER".to_string(), "https://google.com/".to_string()),
("CONTENT_TYPE".to_string(), "application/x-www-form-urlencoded".to_string()),
("COOKIE".to_string(), "categories=general; language=auto; locale=en; autocomplete=duckduckgo; image_proxy=1; method=POST; safesearch=2; theme=simple; results_on_new_tab=1; doi_resolver=oadoi.org; simple_style=auto; center_alignment=1; query_in_title=1; infinite_scroll=0; disabled_engines=; enabled_engines=\"archive is__general\\054yep__general\\054curlie__general\\054currency__general\\054ddg definitions__general\\054wikidata__general\\054duckduckgo__general\\054tineye__general\\054lingva__general\\054startpage__general\\054yahoo__general\\054wiby__general\\054marginalia__general\\054alexandria__general\\054wikibooks__general\\054wikiquote__general\\054wikisource__general\\054wikiversity__general\\054wikivoyage__general\\054dictzone__general\\054seznam__general\\054mojeek__general\\054naver__general\\054wikimini__general\\054brave__general\\054petalsearch__general\\054goo__general\"; disabled_plugins=; enabled_plugins=\"searx.plugins.hostname_replace\\054searx.plugins.oa_doi_rewrite\\054searx.plugins.vim_hotkeys\"; tokens=; maintab=on; enginetab=on".to_string())
]))
.change_context(EngineError::UnexpectedError)?;
let document: Html = Html::parse_document(
&Searx::fetch_html_from_upstream(self, url, header_map, request_timeout).await?,
&Searx::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
);
let no_result: Selector = Selector::parse("#urls>.dialog-error>p")
@@ -98,24 +91,21 @@ impl SearchEngine for Searx {
.next()
.unwrap()
.inner_html()
.trim()
.to_string(),
.trim(),
result
.select(&result_url)
.next()
.unwrap()
.value()
.attr("href")
.unwrap()
.to_string(),
.unwrap(),
result
.select(&result_desc)
.next()
.unwrap()
.inner_html()
.trim()
.to_string(),
vec!["searx".to_string()],
.trim(),
&["searx"],
)
})
.map(|search_result| (search_result.url.clone(), search_result))

src/handler/paths.rs

@@ -4,6 +4,7 @@
use std::collections::HashMap;
use std::io::Error;
use std::path::Path;
use std::sync::OnceLock;
// ------- Constants --------
/// The constant holding the name of the theme folder.
@@ -31,57 +32,7 @@ pub enum FileType {
}
/// A static variable which stores the different filesystem paths for various file/folder types.
static FILE_PATHS_FOR_DIFF_FILE_TYPES: once_cell::sync::Lazy<HashMap<FileType, Vec<String>>> =
once_cell::sync::Lazy::new(|| {
HashMap::from([
(
FileType::Config,
vec![
format!(
"{}/.config/{}/{}",
std::env::var("HOME").unwrap(),
COMMON_DIRECTORY_NAME,
CONFIG_FILE_NAME
),
format!("/etc/xdg/{}/{}", COMMON_DIRECTORY_NAME, CONFIG_FILE_NAME),
format!("./{}/{}", COMMON_DIRECTORY_NAME, CONFIG_FILE_NAME),
],
),
(
FileType::Theme,
vec![
format!("/opt/websurfx/{}/", PUBLIC_DIRECTORY_NAME),
format!("./{}/", PUBLIC_DIRECTORY_NAME),
],
),
(
FileType::AllowList,
vec![
format!(
"{}/.config/{}/{}",
std::env::var("HOME").unwrap(),
COMMON_DIRECTORY_NAME,
ALLOWLIST_FILE_NAME
),
format!("/etc/xdg/{}/{}", COMMON_DIRECTORY_NAME, ALLOWLIST_FILE_NAME),
format!("./{}/{}", COMMON_DIRECTORY_NAME, ALLOWLIST_FILE_NAME),
],
),
(
FileType::BlockList,
vec![
format!(
"{}/.config/{}/{}",
std::env::var("HOME").unwrap(),
COMMON_DIRECTORY_NAME,
BLOCKLIST_FILE_NAME
),
format!("/etc/xdg/{}/{}", COMMON_DIRECTORY_NAME, BLOCKLIST_FILE_NAME),
format!("./{}/{}", COMMON_DIRECTORY_NAME, BLOCKLIST_FILE_NAME),
],
),
])
});
static FILE_PATHS_FOR_DIFF_FILE_TYPES: OnceLock<HashMap<FileType, Vec<String>>> = OnceLock::new();
/// A function which returns an appropriate path for the provided file type by checking whether
/// a file for the given file type exists at each candidate path.
@@ -99,11 +50,64 @@ static FILE_PATHS_FOR_DIFF_FILE_TYPES: once_cell::sync::Lazy<HashMap<FileType, V
/// 1. `/opt/websurfx` if it is not present here then it falls back to the next one (2)
/// 2. Under the project folder (or codebase in other words) if it is not present
/// here then it returns an error as mentioned above.
pub fn file_path(file_type: FileType) -> Result<String, Error> {
let file_path = FILE_PATHS_FOR_DIFF_FILE_TYPES.get(&file_type).unwrap();
pub fn file_path(file_type: FileType) -> Result<&'static str, Error> {
let file_path: &Vec<String> = FILE_PATHS_FOR_DIFF_FILE_TYPES
.get_or_init(|| {
HashMap::from([
(
FileType::Config,
vec![
format!(
"{}/.config/{}/{}",
std::env::var("HOME").unwrap(),
COMMON_DIRECTORY_NAME,
CONFIG_FILE_NAME
),
format!("/etc/xdg/{}/{}", COMMON_DIRECTORY_NAME, CONFIG_FILE_NAME),
format!("./{}/{}", COMMON_DIRECTORY_NAME, CONFIG_FILE_NAME),
],
),
(
FileType::Theme,
vec![
format!("/opt/websurfx/{}/", PUBLIC_DIRECTORY_NAME),
format!("./{}/", PUBLIC_DIRECTORY_NAME),
],
),
(
FileType::AllowList,
vec![
format!(
"{}/.config/{}/{}",
std::env::var("HOME").unwrap(),
COMMON_DIRECTORY_NAME,
ALLOWLIST_FILE_NAME
),
format!("/etc/xdg/{}/{}", COMMON_DIRECTORY_NAME, ALLOWLIST_FILE_NAME),
format!("./{}/{}", COMMON_DIRECTORY_NAME, ALLOWLIST_FILE_NAME),
],
),
(
FileType::BlockList,
vec![
format!(
"{}/.config/{}/{}",
std::env::var("HOME").unwrap(),
COMMON_DIRECTORY_NAME,
BLOCKLIST_FILE_NAME
),
format!("/etc/xdg/{}/{}", COMMON_DIRECTORY_NAME, BLOCKLIST_FILE_NAME),
format!("./{}/{}", COMMON_DIRECTORY_NAME, BLOCKLIST_FILE_NAME),
],
),
])
})
.get(&file_type)
.unwrap();
for (idx, _) in file_path.iter().enumerate() {
if Path::new(file_path[idx].as_str()).exists() {
return Ok(file_path[idx].clone());
return Ok(std::mem::take(&mut &*file_path[idx]));
}
}
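A small usage sketch of the reworked signature; the resolved path depends on the machine, so the value in the comment is illustrative:

let config_path: &'static str = file_path(FileType::Config)?;
// e.g. "/etc/xdg/websurfx/config.lua" when no user-level override exists
let config_text = std::fs::read_to_string(config_path)?;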

src/lib.rs

@@ -18,6 +18,7 @@ use crate::server::routes;
use actix_cors::Cors;
use actix_files as fs;
use actix_governor::{Governor, GovernorConfigBuilder};
use actix_web::{dev::Server, http::header, middleware::Logger, web, App, HttpServer};
use config::parser::Config;
use handlebars::Handlebars;
@@ -46,7 +47,7 @@ use handler::paths::{file_path, FileType};
pub fn run(listener: TcpListener, config: Config) -> std::io::Result<Server> {
let mut handlebars: Handlebars<'_> = Handlebars::new();
let public_folder_path: String = file_path(FileType::Theme)?;
let public_folder_path: &str = file_path(FileType::Theme)?;
handlebars
.register_templates_directory(".html", format!("{}/templates", public_folder_path))
@@ -68,10 +69,17 @@ pub fn run(listener: TcpListener, config: Config) -> std::io::Result<Server> {
]);
App::new()
.wrap(Logger::default()) // added logging middleware for logging.
.app_data(handlebars_ref.clone())
.app_data(web::Data::new(config.clone()))
.wrap(cors)
.wrap(Logger::default()) // added logging middleware for logging.
.wrap(Governor::new(
&GovernorConfigBuilder::default()
.per_second(config.rate_limiter.time_limit as u64)
.burst_size(config.rate_limiter.number_of_requests as u32)
.finish()
.unwrap(),
))
// Serve images and static files (css and js files).
.service(
fs::Files::new("/static", format!("{}/static", public_folder_path))
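With the shipped defaults (`number_of_requests = 20`, `time_limit = 3`), this governor configuration should permit a burst of up to 20 requests per client and then replenish one permit roughly every 3 seconds, since `per_second` sets the replenish interval and `burst_size` the bucket capacity.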

src/results/aggregation_models.rs

@@ -2,6 +2,7 @@
//! data scraped from the upstream search engines.
use serde::{Deserialize, Serialize};
use smallvec::SmallVec;
use crate::{config::parser_models::Style, engines::engine_models::EngineError};
@@ -19,7 +20,7 @@ pub struct SearchResult {
/// The description of the search result.
pub description: String,
/// The names of the upstream engines from which these results were provided.
pub engine: Vec<String>,
pub engine: SmallVec<[String; 0]>,
}
impl SearchResult {
@@ -32,12 +33,12 @@ impl SearchResult {
/// (href url in html in simple words).
/// * `description` - The description of the search result.
/// * `engine` - The names of the upstream engines from which these results were provided.
pub fn new(title: String, url: String, description: String, engine: Vec<String>) -> Self {
pub fn new(title: &str, url: &str, description: &str, engine: &[&str]) -> Self {
SearchResult {
title,
url,
description,
engine,
title: title.to_owned(),
url: url.to_owned(),
description: description.to_owned(),
engine: engine.iter().map(|name| name.to_string()).collect(),
}
}
@@ -46,8 +47,8 @@ impl SearchResult {
/// # Arguments
///
/// * `engine` - Takes an engine name provided as a String.
pub fn add_engines(&mut self, engine: String) {
self.engine.push(engine)
pub fn add_engines(&mut self, engine: &str) {
self.engine.push(engine.to_owned())
}
/// A function which returns the engine name stored from the struct as a string.
@@ -55,13 +56,13 @@ impl SearchResult {
/// # Returns
///
/// An engine name stored as a string from the struct.
pub fn engine(self) -> String {
self.engine.get(0).unwrap().to_string()
pub fn engine(&mut self) -> String {
std::mem::take(&mut self.engine[0])
}
}
/// A named struct that stores the error info related to the upstream search engines.
#[derive(Serialize, Deserialize)]
#[derive(Serialize, Deserialize, Clone)]
pub struct EngineErrorInfo {
/// It stores the error type which occurred while fetching the result from a particular search
/// engine.
@@ -81,18 +82,18 @@ impl EngineErrorInfo {
/// * `error` - It takes the error type which occurred while fetching the result from a particular
/// search engine.
/// * `engine` - It takes the name of the engine that failed to provide the requested search results.
pub fn new(error: &EngineError, engine: String) -> Self {
pub fn new(error: &EngineError, engine: &str) -> Self {
Self {
error: match error {
EngineError::RequestError => String::from("RequestError"),
EngineError::EmptyResultSet => String::from("EmptyResultSet"),
EngineError::UnexpectedError => String::from("UnexpectedError"),
EngineError::RequestError => "RequestError".to_owned(),
EngineError::EmptyResultSet => "EmptyResultSet".to_owned(),
EngineError::UnexpectedError => "UnexpectedError".to_owned(),
},
engine,
engine: engine.to_owned(),
severity_color: match error {
EngineError::RequestError => String::from("green"),
EngineError::EmptyResultSet => String::from("blue"),
EngineError::UnexpectedError => String::from("red"),
EngineError::RequestError => "green".to_owned(),
EngineError::EmptyResultSet => "blue".to_owned(),
EngineError::UnexpectedError => "red".to_owned(),
},
}
}
@@ -101,7 +102,7 @@ impl EngineErrorInfo {
/// A named struct to store, serialize, and deserialize all the search results scraped and
/// aggregated from the upstream search engines into a vector of `SearchResult` structs.
#[derive(Serialize, Deserialize)]
#[derive(Serialize, Deserialize, Default)]
#[serde(rename_all = "camelCase")]
pub struct SearchResults {
/// Stores the individual serializable `SearchResult` struct into a vector of
@@ -113,6 +114,14 @@ pub struct SearchResults {
/// Stores the information on which engines failed with their engine name
/// and the type of error that caused it.
pub engine_errors_info: Vec<EngineErrorInfo>,
/// Stores the flag which indicates that the search query was disallowed because the
/// safe search level was set to 4 and the query was present in the `Blocklist` file.
pub disallowed: bool,
/// Stores the flag which indicates that the search results were filtered because the
/// safe search level was set to 3 and the query was present in the `Blocklist` file.
pub filtered: bool,
}
impl SearchResults {
@@ -126,21 +135,48 @@ impl SearchResults {
/// the search url.
/// * `empty_result_set` - Takes a boolean which indicates that no engines gave a result for the
/// given search query.
/// * `engine_errors_info` - Takes a slice of `EngineErrorInfo` structs describing the errors
/// returned by the upstream search engines.
pub fn new(
results: Vec<SearchResult>,
page_query: String,
engine_errors_info: Vec<EngineErrorInfo>,
page_query: &str,
engine_errors_info: &[EngineErrorInfo],
) -> Self {
SearchResults {
Self {
results,
page_query,
style: Style::new("".to_string(), "".to_string()),
engine_errors_info,
page_query: page_query.to_owned(),
style: Style::default(),
engine_errors_info: engine_errors_info.to_owned(),
disallowed: Default::default(),
filtered: Default::default(),
}
}
/// A setter function to add website style to the return search results.
pub fn add_style(&mut self, style: Style) {
self.style = style;
pub fn add_style(&mut self, style: &Style) {
self.style = style.clone();
}
/// A setter function that sets disallowed to true.
pub fn set_disallowed(&mut self) {
self.disallowed = true;
}
/// A setter function to set the current page search query.
pub fn set_page_query(&mut self, page: &str) {
self.page_query = page.to_owned();
}
/// A setter function that sets the filtered to true.
pub fn set_filtered(&mut self) {
self.filtered = true;
}
/// A getter function that gets the value of `engine_errors_info`.
pub fn engine_errors_info(&mut self) -> Vec<EngineErrorInfo> {
std::mem::take(&mut self.engine_errors_info)
}
/// A getter function that gets the value of `results`.
pub fn results(&mut self) -> Vec<SearchResult> {
self.results.clone()
}
}
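A brief sketch of the reworked borrowed-parameter API; all values are illustrative:

let mut result = SearchResult::new(
    "Example Domain",
    "https://www.example.com",
    "This domain is for use in illustrative examples in documents.",
    &["duckduckgo"],
);
result.add_engines("searx"); // a second upstream returned the same url
let mut results = SearchResults::new(vec![result], "example", &[]);
results.add_style(&Style::default());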

src/results/aggregator.rs

@@ -64,14 +64,15 @@ type FutureVec = Vec<JoinHandle<Result<HashMap<String, SearchResult>, Report<Eng
/// function in either `searx` or `duckduckgo` or both otherwise returns a `SearchResults struct`
/// containing appropriate values.
pub async fn aggregate(
query: String,
query: &str,
page: u32,
random_delay: bool,
debug: bool,
upstream_search_engines: Vec<EngineHandler>,
upstream_search_engines: &[EngineHandler],
request_timeout: u8,
safe_search: u8,
) -> Result<SearchResults, Box<dyn std::error::Error>> {
let user_agent: String = random_user_agent();
let user_agent: &str = random_user_agent();
// Add a random delay before making the request.
if random_delay || !debug {
@@ -80,19 +81,24 @@ pub async fn aggregate(
tokio::time::sleep(Duration::from_secs(delay_secs)).await;
}
let mut names: Vec<&str> = vec![];
let mut names: Vec<&str> = Vec::with_capacity(0);
// create tasks for upstream result fetching
let mut tasks: FutureVec = FutureVec::new();
for engine_handler in upstream_search_engines {
let (name, search_engine) = engine_handler.into_name_engine();
let (name, search_engine) = engine_handler.to_owned().into_name_engine();
names.push(name);
let query: String = query.clone();
let user_agent: String = user_agent.clone();
let query: String = query.to_owned();
tasks.push(tokio::spawn(async move {
search_engine
.results(query, page, user_agent.clone(), request_timeout)
.results(
&query,
page,
user_agent.clone(),
request_timeout,
safe_search,
)
.await
}));
}
@@ -110,7 +116,7 @@ pub async fn aggregate(
let mut result_map: HashMap<String, SearchResult> = HashMap::new();
let mut engine_errors_info: Vec<EngineErrorInfo> = Vec::new();
let mut handle_error = |error: Report<EngineError>, engine_name: String| {
let mut handle_error = |error: &Report<EngineError>, engine_name: &'static str| {
log::error!("Engine Error: {:?}", error);
engine_errors_info.push(EngineErrorInfo::new(
error.downcast_ref::<EngineError>().unwrap(),
@@ -120,7 +126,7 @@
for _ in 0..responses.len() {
let response = responses.pop().unwrap();
let engine = names.pop().unwrap().to_string();
let engine = names.pop().unwrap();
if result_map.is_empty() {
match response {
@ -128,7 +134,7 @@ pub async fn aggregate(
result_map = results.clone();
}
Err(error) => {
handle_error(error, engine);
handle_error(&error, engine);
}
}
continue;
@@ -140,39 +146,37 @@
result_map
.entry(key)
.and_modify(|result| {
result.add_engines(engine.clone());
result.add_engines(engine);
})
.or_insert_with(|| -> SearchResult { value });
});
}
Err(error) => {
handle_error(error, engine);
handle_error(&error, engine);
}
}
}
let mut blacklist_map: HashMap<String, SearchResult> = HashMap::new();
filter_with_lists(
&mut result_map,
&mut blacklist_map,
&file_path(FileType::BlockList)?,
)?;
if safe_search >= 3 {
let mut blacklist_map: HashMap<String, SearchResult> = HashMap::new();
filter_with_lists(
&mut result_map,
&mut blacklist_map,
file_path(FileType::BlockList)?,
)?;
filter_with_lists(
&mut blacklist_map,
&mut result_map,
&file_path(FileType::AllowList)?,
)?;
filter_with_lists(
&mut blacklist_map,
&mut result_map,
file_path(FileType::AllowList)?,
)?;
drop(blacklist_map);
drop(blacklist_map);
}
let results: Vec<SearchResult> = result_map.into_values().collect();
Ok(SearchResults::new(
results,
query.to_string(),
engine_errors_info,
))
Ok(SearchResults::new(results, query, &engine_errors_info))
}
/// Filters a map of search results using a list of regex patterns.
@@ -194,7 +198,7 @@ pub fn filter_with_lists(
let mut reader = BufReader::new(File::open(file_path)?);
for line in reader.by_ref().lines() {
let re = Regex::new(&line?)?;
let re = Regex::new(line?.trim())?;
// Iterate over each search result in the map and check if it matches the regex pattern
for (url, search_result) in map_to_be_filtered.clone().into_iter() {
@@ -203,7 +207,10 @@
|| re.is_match(&search_result.description.to_lowercase())
{
// If the search result matches the regex pattern, move it from the original map to the resultant map
resultant_map.insert(url.clone(), map_to_be_filtered.remove(&url).unwrap());
resultant_map.insert(
url.to_owned(),
map_to_be_filtered.remove(&url.to_owned()).unwrap(),
);
}
}
}
@@ -214,6 +221,7 @@
#[cfg(test)]
mod tests {
use super::*;
use smallvec::smallvec;
use std::collections::HashMap;
use std::io::Write;
use tempfile::NamedTempFile;
@@ -223,22 +231,22 @@ mod tests {
// Create a map of search results to filter
let mut map_to_be_filtered = HashMap::new();
map_to_be_filtered.insert(
"https://www.example.com".to_string(),
"https://www.example.com".to_owned(),
SearchResult {
title: "Example Domain".to_string(),
url: "https://www.example.com".to_string(),
title: "Example Domain".to_owned(),
url: "https://www.example.com".to_owned(),
description: "This domain is for use in illustrative examples in documents."
.to_string(),
engine: vec!["Google".to_string(), "Bing".to_string()],
.to_owned(),
engine: smallvec!["Google".to_owned(), "Bing".to_owned()],
},
);
map_to_be_filtered.insert(
"https://www.rust-lang.org/".to_string(),
"https://www.rust-lang.org/".to_owned(),
SearchResult {
title: "Rust Programming Language".to_string(),
url: "https://www.rust-lang.org/".to_string(),
description: "A systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.".to_string(),
engine: vec!["Google".to_string(), "DuckDuckGo".to_string()],
title: "Rust Programming Language".to_owned(),
url: "https://www.rust-lang.org/".to_owned(),
description: "A systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.".to_owned(),
engine: smallvec!["Google".to_owned(), "DuckDuckGo".to_owned()],
},
);
@@ -267,22 +275,22 @@
fn test_filter_with_lists_wildcard() -> Result<(), Box<dyn std::error::Error>> {
let mut map_to_be_filtered = HashMap::new();
map_to_be_filtered.insert(
"https://www.example.com".to_string(),
"https://www.example.com".to_owned(),
SearchResult {
title: "Example Domain".to_string(),
url: "https://www.example.com".to_string(),
title: "Example Domain".to_owned(),
url: "https://www.example.com".to_owned(),
description: "This domain is for use in illustrative examples in documents."
.to_string(),
engine: vec!["Google".to_string(), "Bing".to_string()],
.to_owned(),
engine: smallvec!["Google".to_owned(), "Bing".to_owned()],
},
);
map_to_be_filtered.insert(
"https://www.rust-lang.org/".to_string(),
"https://www.rust-lang.org/".to_owned(),
SearchResult {
title: "Rust Programming Language".to_string(),
url: "https://www.rust-lang.org/".to_string(),
description: "A systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.".to_string(),
engine: vec!["Google".to_string(), "DuckDuckGo".to_string()],
title: "Rust Programming Language".to_owned(),
url: "https://www.rust-lang.org/".to_owned(),
description: "A systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.".to_owned(),
engine: smallvec!["Google".to_owned(), "DuckDuckGo".to_owned()],
},
);
@@ -327,13 +335,13 @@
fn test_filter_with_lists_invalid_regex() {
let mut map_to_be_filtered = HashMap::new();
map_to_be_filtered.insert(
"https://www.example.com".to_string(),
"https://www.example.com".to_owned(),
SearchResult {
title: "Example Domain".to_string(),
url: "https://www.example.com".to_string(),
title: "Example Domain".to_owned(),
url: "https://www.example.com".to_owned(),
description: "This domain is for use in illustrative examples in documents."
.to_string(),
engine: vec!["Google".to_string(), "Bing".to_string()],
.to_owned(),
engine: smallvec!["Google".to_owned(), "Bing".to_owned()],
},
);

src/results/user_agent.rs

@@ -1,30 +1,34 @@
//! This module provides the functionality to generate a random user agent string.
use std::sync::OnceLock;
use fake_useragent::{Browsers, UserAgents, UserAgentsBuilder};
/// A static variable which stores the initially built `UserAgents` struct so that it can be reused
/// again and again without the need to reinitialize the `UserAgents` struct.
static USER_AGENTS: once_cell::sync::Lazy<UserAgents> = once_cell::sync::Lazy::new(|| {
UserAgentsBuilder::new()
.cache(false)
.dir("/tmp")
.thread(1)
.set_browsers(
Browsers::new()
.set_chrome()
.set_safari()
.set_edge()
.set_firefox()
.set_mozilla(),
)
.build()
});
static USER_AGENTS: OnceLock<UserAgents> = OnceLock::new();
/// A function to generate random user agent to improve privacy of the user.
///
/// # Returns
///
/// A randomly generated user agent string.
pub fn random_user_agent() -> String {
USER_AGENTS.random().to_string()
pub fn random_user_agent() -> &'static str {
USER_AGENTS
.get_or_init(|| {
UserAgentsBuilder::new()
.cache(false)
.dir("/tmp")
.thread(1)
.set_browsers(
Browsers::new()
.set_chrome()
.set_safari()
.set_edge()
.set_firefox()
.set_mozilla(),
)
.build()
})
.random()
}
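The practical effect is that the expensive `UserAgents` pool is built only once, on first use; a short sketch:

let first: &'static str = random_user_agent(); // builds and caches the pool
let second: &'static str = random_user_agent(); // reuses the cached pool
// both borrow from the same lazily initialized static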

src/server/routes.rs

@@ -2,7 +2,10 @@
//! meta search engine website and provide appropriate response to each route/page
//! when requested.
use std::fs::read_to_string;
use std::{
fs::{read_to_string, File},
io::{BufRead, BufReader, Read},
};
use crate::{
cache::cacher::RedisCache,
@@ -13,9 +16,14 @@
};
use actix_web::{get, web, HttpRequest, HttpResponse};
use handlebars::Handlebars;
use regex::Regex;
use serde::Deserialize;
use tokio::join;
// ---- Constants ----
/// Initialize redis cache connection once and store it on the heap.
static REDIS_CACHE: async_once_cell::OnceCell<RedisCache> = async_once_cell::OnceCell::new();
/// A named struct which deserializes all the user provided search parameters and stores them.
#[derive(Deserialize)]
struct SearchParams {
@@ -25,6 +33,7 @@ struct SearchParams {
/// It stores the search parameter `page` (or pageno in simple words)
/// of the search url.
page: Option<u32>,
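/// It stores the search parameter `safesearch` (safe search level) of the search url.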
safesearch: Option<u8>,
}
/// Handles the route of index page or main page of the `websurfx` meta search engine website.
@@ -53,13 +62,13 @@
/// A named struct which is used to deserialize the cookies fetched from the client side.
#[allow(dead_code)]
#[derive(Deserialize)]
struct Cookie {
struct Cookie<'a> {
/// It stores the theme name used in the website.
theme: String,
theme: &'a str,
/// It stores the colorscheme name used for the website theme.
colorscheme: String,
colorscheme: &'a str,
/// It stores the user selected upstream search engines selected from the UI.
engines: Vec<String>,
engines: Vec<&'a str>,
}
/// Handles the route of search page of the `websurfx` meta search engine website and it takes
@@ -95,42 +104,58 @@ pub async fn search(
None => 1,
};
let safe_search: u8 = match config.safe_search {
3..=4 => config.safe_search,
_ => match &params.safesearch {
Some(safesearch) => match safesearch {
0..=2 => *safesearch,
_ => 1,
},
None => config.safe_search,
},
};
let (_, results, _) = join!(
results(
format!(
"http://{}:{}/search?q={}&page={}",
"http://{}:{}/search?q={}&page={}&safesearch={}",
config.binding_ip,
config.port,
query,
page - 1
page - 1,
safe_search
),
&config,
query.to_string(),
query,
page - 1,
req.clone(),
safe_search
),
results(
format!(
"http://{}:{}/search?q={}&page={}",
config.binding_ip, config.port, query, page
"http://{}:{}/search?q={}&page={}&safesearch={}",
config.binding_ip, config.port, query, page, safe_search
),
&config,
query.to_string(),
query,
page,
req.clone(),
safe_search
),
results(
format!(
"http://{}:{}/search?q={}&page={}",
"http://{}:{}/search?q={}&page={}&safesearch={}",
config.binding_ip,
config.port,
query,
page + 1
page + 1,
safe_search
),
&config,
query.to_string(),
query,
page + 1,
req.clone(),
safe_search
)
);
@@ -161,30 +186,53 @@ pub async fn search(
async fn results(
url: String,
config: &Config,
query: String,
query: &str,
page: u32,
req: HttpRequest,
safe_search: u8,
) -> Result<SearchResults, Box<dyn std::error::Error>> {
// Initialize redis cache connection struct
let mut redis_cache = RedisCache::new(config.redis_url.clone())?;
let mut redis_cache: RedisCache = REDIS_CACHE
.get_or_init(async {
// Initialize the redis cache connection pool only once and store it on the heap.
RedisCache::new(&config.redis_url, 5).await.unwrap()
})
.await
.clone();
// fetch the cached results json.
let cached_results_json = redis_cache.cached_json(&url);
let cached_results_json: Result<String, error_stack::Report<crate::cache::error::PoolError>> =
redis_cache.clone().cached_json(&url).await;
// check if fetched cache results was indeed fetched or it was an error and if so
// handle the data accordingly.
match cached_results_json {
Ok(results) => Ok(serde_json::from_str::<SearchResults>(&results).unwrap()),
Ok(results) => Ok(serde_json::from_str::<SearchResults>(&results)?),
Err(_) => {
if safe_search == 4 {
let mut results: SearchResults = SearchResults::default();
let mut _flag: bool =
is_match_from_filter_list(file_path(FileType::BlockList)?, query)?;
_flag = !is_match_from_filter_list(file_path(FileType::AllowList)?, query)?;
if _flag {
results.set_disallowed();
results.add_style(&config.style);
results.set_page_query(query);
redis_cache
.cache_results(&serde_json::to_string(&results)?, &url)
.await?;
return Ok(results);
}
}
// check if the cookie value is empty or not if it is empty then use the
// default selected upstream search engines from the config file otherwise
// parse the non-empty cookie and grab the user selected engines from the
// UI and use that.
let mut results: crate::results::aggregation_models::SearchResults = match req
.cookie("appCookie")
{
let mut results: SearchResults = match req.cookie("appCookie") {
Some(cookie_value) => {
let cookie_value: Cookie = serde_json::from_str(cookie_value.name_value().1)?;
let engines = cookie_value
let engines: Vec<EngineHandler> = cookie_value
.engines
.iter()
.filter_map(|name| EngineHandler::new(name))
@@ -195,8 +243,9 @@ async fn results(
page,
config.aggregator.random_delay,
config.debug,
engines,
&engines,
config.request_timeout,
safe_search,
)
.await?
}
@@ -206,19 +255,41 @@ async fn results(
page,
config.aggregator.random_delay,
config.debug,
config.upstream_search_engines.clone(),
&config.upstream_search_engines,
config.request_timeout,
safe_search,
)
.await?
}
};
results.add_style(config.style.clone());
redis_cache.cache_results(serde_json::to_string(&results)?, &url)?;
if results.engine_errors_info().is_empty() && results.results().is_empty() {
results.set_filtered();
}
results.add_style(&config.style);
redis_cache
.cache_results(&serde_json::to_string(&results)?, &url)
.await?;
Ok(results)
}
}
}
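/// A helper function which checks whether the user provided query matches any regex pattern from
/// the file at the given filter list path.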
fn is_match_from_filter_list(
file_path: &str,
query: &str,
) -> Result<bool, Box<dyn std::error::Error>> {
let mut flag = false;
let mut reader = BufReader::new(File::open(file_path)?);
for line in reader.by_ref().lines() {
let re = Regex::new(&line?)?;
if re.is_match(query) {
flag = true;
break;
}
}
Ok(flag)
}
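To make the precedence between the configured level and the client's `safesearch` parameter explicit, here is a hedged standalone restatement of the resolution logic used in `search` above (the function name is made up):

fn effective_safe_search(config_level: u8, param: Option<u8>) -> u8 {
    match config_level {
        // levels 3 and 4 are enforced server side and cannot be lowered by the client
        3..=4 => config_level,
        _ => match param {
            Some(level @ 0..=2) => level, // valid client-provided override
            Some(_) => 1,                 // out-of-range values fall back to 1
            None => config_level,         // no parameter: use the configured level
        },
    }
}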
/// Handles the route of robots.txt page of the `websurfx` meta search engine website.
#[get("/robots.txt")]
pub async fn robots_data(_req: HttpRequest) -> Result<HttpResponse, Box<dyn std::error::Error>> {

websurfx/config.lua

@@ -10,6 +10,21 @@ production_use = false -- whether to use production mode or not (in other words
-- if production_use is set to true
-- There will be a random delay before sending the request to the search engines, this is to prevent DDoSing the upstream search engines from a large number of simultaneous requests.
request_timeout = 30 -- timeout for the search requests sent to the upstream search engines to be fetched (value in seconds).
rate_limiter = {
number_of_requests = 20, -- The number of requests that are allowed within a provided time limit.
time_limit = 3, -- The time limit (in seconds) within which that number of requests is accepted.
}
-- ### Search ###
-- Filter results based on different levels. The levels provided are:
-- {{
-- 0 - None
-- 1 - Low
-- 2 - Moderate
-- 3 - High
-- 4 - Aggressive
-- }}
safe_search = 2
-- ### Website ###
-- The different colorschemes provided are:
@@ -34,4 +49,7 @@ theme = "simple" -- the theme name which should be used for the website
redis_url = "redis://127.0.0.1:8082" -- redis connection url address on which the client should connect on.
-- ### Search Engines ###
upstream_search_engines = { DuckDuckGo = true, Searx = false } -- select the upstream search engines from which the results should be fetched.
upstream_search_engines = {
DuckDuckGo = true,
Searx = false,
} -- select the upstream search engines from which the results should be fetched.