0
0
mirror of https://github.com/neon-mmd/websurfx.git synced 2024-11-22 14:08:23 -05:00

Compare commits

..

No commits in common. "b7d0ef7252288d2bc5cdd189f534af375f28b3f5" and "c6b93403b8964243ddbce5df2b657b25a909a1c0" have entirely different histories.

15 changed files with 663 additions and 739 deletions

1160
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +1,6 @@
[package]
name = "websurfx"
version = "1.17.22"
version = "1.17.20"
edition = "2021"
description = "An open-source alternative to Searx that provides clean, ad-free, and organic results with incredible speed while keeping privacy and security in mind."
repository = "https://github.com/neon-mmd/websurfx"
@ -27,7 +27,6 @@ tokio = { version = "1.32.0", features = [
], default-features = false }
serde = { version = "1.0.209", default-features = false, features = ["derive"] }
serde_json = { version = "1.0.122", default-features = false }
bincode = {version="1.3.3", default-features=false}
maud = { version = "0.26.0", default-features = false, features = [
"actix-web",
] }
@ -49,7 +48,6 @@ mlua = { version = "0.9.9", features = [
redis = { version = "0.25.4", features = [
"tokio-comp",
"connection-manager",
"tcp_nodelay"
], default-features = false, optional = true }
blake3 = { version = "1.5.4", default-features = false }
error-stack = { version = "0.4.0", default-features = false, features = [
@ -57,13 +55,17 @@ error-stack = { version = "0.4.0", default-features = false, features = [
] }
async-trait = { version = "0.1.80", default-features = false }
regex = { version = "1.9.4", features = ["perf"], default-features = false }
smallvec = { version = "1.13.1", features = [
"union",
"serde",
], default-features = false }
futures = { version = "0.3.30", default-features = false, features = ["alloc"] }
dhat = { version = "0.3.2", optional = true, default-features = false }
mimalloc = { version = "0.1.43", default-features = false }
async-once-cell = { version = "0.5.3", default-features = false }
actix-governor = { version = "0.5.0", default-features = false }
moka = { version = "0.12.8", optional = true, default-features = false, features = [
"future",
mini-moka = { version = "0.10", optional = true, default-features = false, features = [
"sync",
] }
async-compression = { version = "0.4.12", default-features = false, features = [
"brotli",
@ -80,8 +82,8 @@ base64 = { version = "0.21.5", default-features = false, features = [
cfg-if = { version = "1.0.0", default-features = false, optional = true }
keyword_extraction = { version = "1.4.3", default-features = false, features = [
"tf_idf",
"rayon",
] }
stop-words = { version = "0.8.0", default-features = false, features = ["iso"] }
thesaurus = { version = "0.5.2", default-features = false, optional = true, features = [
"moby",
@ -102,6 +104,8 @@ lightningcss = { version = "1.0.0-alpha.57", default-features = false, features
# Temporary fork with fix
minify-js = { git = "https://github.com/RuairidhWilliamson/minify-js", branch = "master", version = "0.6.0", default-features = false}
[profile.dev]
opt-level = 0
debug = true
@ -176,7 +180,7 @@ opt-level = "z"
use-synonyms-search = ["thesaurus/static"]
default = ["memory-cache"]
dhat-heap = ["dep:dhat"]
memory-cache = ["dep:moka"]
memory-cache = ["dep:mini-moka"]
redis-cache = ["dep:redis", "dep:base64"]
compress-cache-results = ["dep:async-compression", "dep:cfg-if"]
encrypt-cache-results = ["dep:chacha20poly1305", "dep:chacha20"]

View File

@ -34,11 +34,11 @@
},
"nixpkgs_2": {
"locked": {
"lastModified": 1725194671,
"narHash": "sha256-tLGCFEFTB5TaOKkpfw3iYT9dnk4awTP/q4w+ROpMfuw=",
"lastModified": 1695318763,
"narHash": "sha256-FHVPDRP2AfvsxAdc+AsgFJevMz5VBmnZglFUMlxBkcY=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "b833ff01a0d694b910daca6e2ff4a3f26dee478c",
"rev": "e12483116b3b51a185a33a272bf351e357ba9a99",
"type": "github"
},
"original": {

View File

@ -36,7 +36,7 @@
haskellPackages.hadolint
nodejs
nodePackages_latest.cspell
eslint
nodePackages_latest.eslint
nodePackages_latest.markdownlint-cli2
nodePackages_latest.stylelint
redis

47
src/cache/cacher.rs vendored
View File

@ -2,9 +2,10 @@
//! from the upstream search engines in a json format.
use error_stack::Report;
use futures::future::join_all;
#[cfg(feature = "memory-cache")]
use moka::future::Cache as MokaCache;
use mini_moka::sync::Cache as MokaCache;
#[cfg(feature = "memory-cache")]
use mini_moka::sync::ConcurrentCacheExt;
#[cfg(feature = "memory-cache")]
use std::time::Duration;
@ -375,13 +376,13 @@ impl Cacher for RedisCache {
}
}
/// TryInto implementation for SearchResults from Vec<u8>
use std::{convert::TryInto, sync::Arc};
use std::convert::TryInto;
impl TryInto<SearchResults> for Vec<u8> {
type Error = CacheError;
fn try_into(self) -> Result<SearchResults, Self::Error> {
bincode::deserialize_from(self.as_slice()).map_err(|_| CacheError::SerializationError)
serde_json::from_slice(&self).map_err(|_| CacheError::SerializationError)
}
}
@ -389,7 +390,7 @@ impl TryInto<Vec<u8>> for &SearchResults {
type Error = CacheError;
fn try_into(self) -> Result<Vec<u8>, Self::Error> {
bincode::serialize(self).map_err(|_| CacheError::SerializationError)
serde_json::to_vec(self).map_err(|_| CacheError::SerializationError)
}
}
@ -397,16 +398,7 @@ impl TryInto<Vec<u8>> for &SearchResults {
#[cfg(feature = "memory-cache")]
pub struct InMemoryCache {
/// The backend cache which stores data.
cache: Arc<MokaCache<String, Vec<u8>>>,
}
#[cfg(feature = "memory-cache")]
impl Clone for InMemoryCache {
fn clone(&self) -> Self {
Self {
cache: self.cache.clone(),
}
}
cache: MokaCache<String, Vec<u8>>,
}
#[cfg(feature = "memory-cache")]
@ -416,17 +408,15 @@ impl Cacher for InMemoryCache {
log::info!("Initialising in-memory cache");
InMemoryCache {
cache: Arc::new(
MokaCache::builder()
.time_to_live(Duration::from_secs(config.cache_expiry_time.into()))
.build(),
),
cache: MokaCache::builder()
.time_to_live(Duration::from_secs(config.cache_expiry_time.into()))
.build(),
}
}
async fn cached_results(&mut self, url: &str) -> Result<SearchResults, Report<CacheError>> {
let hashed_url_string = self.hash_url(url);
match self.cache.get(&hashed_url_string).await {
match self.cache.get(&hashed_url_string) {
Some(res) => self.post_process_search_results(res).await,
None => Err(Report::new(CacheError::MissingValue)),
}
@ -437,18 +427,13 @@ impl Cacher for InMemoryCache {
search_results: &[SearchResults],
urls: &[String],
) -> Result<(), Report<CacheError>> {
let mut tasks: Vec<_> = Vec::with_capacity(urls.len());
for (url, search_result) in urls.iter().zip(search_results.iter()) {
let hashed_url_string = self.hash_url(url);
let bytes = self.pre_process_search_results(search_result).await?;
let new_self = self.clone();
tasks.push(tokio::spawn(async move {
new_self.cache.insert(hashed_url_string, bytes).await
}));
self.cache.insert(hashed_url_string, bytes);
}
join_all(tasks).await;
self.cache.sync();
Ok(())
}
}
@ -546,7 +531,7 @@ impl SharedCache {
/// # Arguments
///
/// * `url` - It takes the search url as an argument which will be used as the key to fetch the
/// cached results from the cache.
/// cached results from the cache.
///
/// # Error
///
@ -563,9 +548,9 @@ impl SharedCache {
/// # Arguments
///
/// * `search_results` - It takes the `SearchResults` as an argument which are results that
/// needs to be cached.
/// needs to be cached.
/// * `url` - It takes the search url as an argument which will be used as the key for storing
/// results in the cache.
/// results in the cache.
///
/// # Error
///

View File

@ -16,7 +16,7 @@ const REDIS_PIPELINE_SIZE: usize = 3;
/// connect to.
pub struct RedisCache {
/// It stores a pool of connections ready to be used.
connection_pool: Box<[ConnectionManager]>,
connection_pool: Vec<ConnectionManager>,
/// It stores the size of the connection pool (in other words the number of
/// connections that should be stored in the pool).
pool_size: u8,
@ -58,13 +58,13 @@ impl RedisCache {
}));
}
let mut outputs = Vec::with_capacity(tasks.len());
let mut outputs = Vec::new();
for task in tasks {
outputs.push(task.await??);
}
let redis_cache = RedisCache {
connection_pool: outputs.into_boxed_slice(),
connection_pool: outputs,
pool_size,
current_connection: Default::default(),
cache_ttl,

View File

@ -48,8 +48,6 @@ pub struct Config {
pub tcp_connection_keep_alive: u8,
/// It stores the pool idle connection timeout in seconds.
pub pool_idle_connection_timeout: u8,
/// It stores the number of https connections to keep in the pool.
pub number_of_https_connections: u8,
}
impl Config {
@ -59,7 +57,7 @@ impl Config {
/// # Arguments
///
/// * `logging_initialized` - It takes a boolean which ensures that the logging doesn't get
/// initialized twice. Pass false if the logger has not yet been initialized.
/// initialized twice. Pass false if the logger has not yet been initialized.
///
/// # Error
///
@ -141,7 +139,6 @@ impl Config {
request_timeout: globals.get::<_, u8>("request_timeout")?,
tcp_connection_keep_alive: globals.get::<_, u8>("tcp_connection_keep_alive")?,
pool_idle_connection_timeout: globals.get::<_, u8>("pool_idle_connection_timeout")?,
number_of_https_connections: globals.get::<_, u8>("number_of_https_connections")?,
threads,
client_connection_keep_alive: globals.get::<_, u8>("client_connection_keep_alive")?,
rate_limiter: RateLimiter {

View File

@ -3,6 +3,7 @@
use super::engine_models::EngineError;
use serde::{Deserialize, Serialize};
use smallvec::SmallVec;
#[cfg(any(
feature = "use-synonyms-search",
feature = "use-non-static-synonyms-search"
@ -11,9 +12,7 @@ use thesaurus::synonyms;
/// A named struct to store the raw scraped search results from the
/// upstream search engines before aggregating it. It derives the Clone trait which is needed
/// to write idiomatic rust using `Iterators`.
///
/// (href url in html in simple words).
///
/// (href url in html in simple words).
#[derive(Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct SearchResult {
@ -24,7 +23,7 @@ pub struct SearchResult {
/// The description of the search result.
pub description: String,
/// The names of the upstream engines from which these results were provided.
pub engine: Vec<String>,
pub engine: SmallVec<[String; 0]>,
/// The tf-idf score of the result with regard to the title, url and description and the user's query
pub relevance_score: f32,
}
@ -36,7 +35,7 @@ impl SearchResult {
///
/// * `title` - The title of the search result.
/// * `url` - The url which is accessed when clicked on it
/// (href url in html in simple words).
/// (href url in html in simple words).
/// * `description` - The description of the search result.
/// * `engine` - The names of the upstream engines from which these results were provided.
pub fn new(title: &str, url: &str, description: &str, engine: &[&str]) -> Self {
@ -126,7 +125,7 @@ impl EngineErrorInfo {
/// # Arguments
///
/// * `error` - It takes the error type which occurred while fetching the result from a particular
/// search engine.
/// search engine.
/// * `engine` - It takes the name of the engine that failed to provide the requested search results.
pub fn new(error: &EngineError, engine: &str) -> Self {
Self {
@ -154,10 +153,10 @@ impl EngineErrorInfo {
#[serde(rename_all = "camelCase")]
pub struct SearchResults {
/// Stores the individual serializable `SearchResult` structs as a collection of results.
pub results: Box<[SearchResult]>,
pub results: Vec<SearchResult>,
/// Stores the information on which engines failed with their engine name
/// and the type of error that caused it.
pub engine_errors_info: Box<[EngineErrorInfo]>,
pub engine_errors_info: Vec<EngineErrorInfo>,
/// Stores the flag option which holds the check value that the following
/// search query was disallowed when the safe search level set to 4 and it
/// was present in the `Blocklist` file.
@ -179,15 +178,15 @@ impl SearchResults {
/// # Arguments
///
/// * `results` - Takes an argument of individual serializable `SearchResult` struct
/// and stores it into a vector of `SearchResult` structs.
/// and stores it into a vector of `SearchResult` structs.
/// * `page_query` - Takes an argument of current page's search query `q` provided in
/// the search url.
/// the search url.
/// * `engine_errors_info` - Takes an array of structs which contains information regarding
/// which engines failed with their names, reason and their severity color name.
pub fn new(results: Box<[SearchResult]>, engine_errors_info: Box<[EngineErrorInfo]>) -> Self {
/// which engines failed with their names, reason and their severity color name.
pub fn new(results: Vec<SearchResult>, engine_errors_info: &[EngineErrorInfo]) -> Self {
Self {
results,
engine_errors_info,
engine_errors_info: engine_errors_info.to_owned(),
disallowed: Default::default(),
filtered: Default::default(),
safe_search_level: Default::default(),
@ -206,11 +205,11 @@ impl SearchResults {
}
/// A getter function that gets the value of `engine_errors_info`.
pub fn engine_errors_info(&mut self) -> Box<[EngineErrorInfo]> {
pub fn engine_errors_info(&mut self) -> Vec<EngineErrorInfo> {
std::mem::take(&mut self.engine_errors_info)
}
/// A getter function that gets the value of `results`.
pub fn results(&mut self) -> Box<[SearchResult]> {
pub fn results(&mut self) -> Vec<SearchResult> {
self.results.clone()
}
@ -255,50 +254,27 @@ fn calculate_tf_idf(
let tf_idf = TfIdf::new(params);
let tokener = Tokenizer::new(query, stop_words, Some(punctuation));
let query_tokens = tokener.split_into_words();
let mut search_tokens = vec![];
#[cfg(any(
feature = "use-synonyms-search",
feature = "use-non-static-synonyms-search"
))]
let mut extra_tokens = vec![];
for token in query_tokens {
#[cfg(any(
feature = "use-synonyms-search",
feature = "use-non-static-synonyms-search"
))]
{
// find some synonyms and add them to the search (from wordnet or moby if feature is enabled)
let synonyms = synonyms(&token);
search_tokens.extend(synonyms)
}
search_tokens.push(token);
}
let total_score: f32 = query_tokens
.iter()
.map(|token| {
#[cfg(any(
feature = "use-synonyms-search",
feature = "use-non-static-synonyms-search"
))]
{
// find some synonyms and add them to the search (from wordnet or moby if feature is enabled)
extra_tokens.extend(synonyms(token))
}
let mut total_score = 0.0f32;
for token in search_tokens.iter() {
total_score += tf_idf.get_score(token);
}
tf_idf.get_score(token)
})
.sum();
#[cfg(not(any(
feature = "use-synonyms-search",
feature = "use-non-static-synonyms-search"
)))]
let result = total_score / (query_tokens.len() as f32);
#[cfg(any(
feature = "use-synonyms-search",
feature = "use-non-static-synonyms-search"
))]
let extra_total_score: f32 = extra_tokens
.iter()
.map(|token| tf_idf.get_score(token))
.sum();
#[cfg(any(
feature = "use-synonyms-search",
feature = "use-non-static-synonyms-search"
))]
let result =
(extra_total_score + total_score) / ((query_tokens.len() + extra_tokens.len()) as f32);
let result = total_score / (search_tokens.len() as f32);
f32::from(!result.is_nan()) * result
}

View File

@ -29,7 +29,7 @@ impl Style {
///
/// * `theme` - It takes the parsed theme option used to set a theme for the website.
/// * `colorscheme` - It takes the parsed colorscheme option used to set a colorscheme
/// for the theme being used.
/// for the theme being used.
pub fn new(theme: String, colorscheme: String, animation: Option<String>) -> Self {
Style {
theme,

View File

@ -11,7 +11,7 @@ use super::parser_models::Style;
pub struct SearchParams {
/// It stores the search parameter option `q` (or query in simple words)
/// of the search url.
pub q: Option<Cow<'static, str>>,
pub q: Option<String>,
/// It stores the search parameter `page` (or pageno in simple words)
/// of the search url.
pub page: Option<u32>,
@ -29,7 +29,7 @@ pub struct Cookie<'a> {
/// It stores the colorscheme name used for the website theme.
pub colorscheme: Cow<'a, str>,
/// It stores the user selected upstream search engines selected from the UI.
pub engines: Cow<'a, [Cow<'a, str>]>,
pub engines: Cow<'a, Vec<Cow<'a, str>>>,
/// It stores the user selected safe search level from the UI.
pub safe_search_level: u8,
}

View File

@ -14,6 +14,7 @@ use futures::stream::FuturesUnordered;
use regex::Regex;
use reqwest::{Client, ClientBuilder};
use std::sync::Arc;
use std::time::{SystemTime, UNIX_EPOCH};
use tokio::{
fs::File,
io::{AsyncBufReadExt, BufReader},
@ -60,7 +61,7 @@ type FutureVec =
/// * `debug` - Accepts a boolean value to enable or disable debug mode option.
/// * `upstream_search_engines` - Accepts a vector of search engine names which was selected by the
/// * `request_timeout` - Accepts a time (secs) as a value which controls the server request timeout.
/// user through the UI or the config file.
/// user through the UI or the config file.
///
/// # Error
///
@ -81,7 +82,6 @@ pub async fn aggregate(
config.pool_idle_connection_timeout as u64,
))
.tcp_keepalive(Duration::from_secs(config.tcp_connection_keep_alive as u64))
.pool_max_idle_per_host(config.number_of_https_connections as usize)
.connect_timeout(Duration::from_secs(config.request_timeout as u64)) // Add timeout to request to avoid DDOSing the server
.https_only(true)
.gzip(true)
@ -93,6 +93,13 @@ pub async fn aggregate(
let user_agent: &str = random_user_agent();
// Add a random delay before making the request.
if config.aggregator.random_delay || !config.debug {
let nanos = SystemTime::now().duration_since(UNIX_EPOCH)?.subsec_nanos() as f32;
let delay = ((nanos / 1_0000_0000 as f32).floor() as u64) + 1;
tokio::time::sleep(Duration::from_secs(delay)).await;
}
let mut names: Vec<&str> = Vec::with_capacity(0);
// create tasks for upstream result fetching
@ -181,21 +188,19 @@ pub async fn aggregate(
drop(blacklist_map);
}
let mut results: Box<[SearchResult]> = result_map
.into_iter()
.map(|(_, mut value)| {
if !value.url.contains("temu.com") {
value.calculate_relevance(query.as_str())
let mut results: Vec<SearchResult> = result_map
.iter()
.map(|(_, value)| {
let mut copy = value.clone();
if !copy.url.contains("temu.com") {
copy.calculate_relevance(query.as_str())
}
value
copy
})
.collect();
sort_search_results(&mut results);
Ok(SearchResults::new(
results,
engine_errors_info.into_boxed_slice(),
))
Ok(SearchResults::new(results, &engine_errors_info))
}
/// Filters a map of search results using a list of regex patterns.
@ -260,6 +265,7 @@ fn sort_search_results(results: &mut [SearchResult]) {
#[cfg(test)]
mod tests {
use super::*;
use smallvec::smallvec;
use std::io::Write;
use tempfile::NamedTempFile;
@ -275,7 +281,7 @@ mod tests {
description: "This domain is for use in illustrative examples in documents."
.to_owned(),
relevance_score: 0.0,
engine: vec!["Google".to_owned(), "Bing".to_owned()],
engine: smallvec!["Google".to_owned(), "Bing".to_owned()],
},
));
map_to_be_filtered.push((
@ -284,7 +290,7 @@ mod tests {
title: "Rust Programming Language".to_owned(),
url: "https://www.rust-lang.org/".to_owned(),
description: "A systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.".to_owned(),
engine: vec!["Google".to_owned(), "DuckDuckGo".to_owned()],
engine: smallvec!["Google".to_owned(), "DuckDuckGo".to_owned()],
relevance_score:0.0
},)
);
@ -325,7 +331,7 @@ mod tests {
url: "https://www.example.com".to_owned(),
description: "This domain is for use in illustrative examples in documents."
.to_owned(),
engine: vec!["Google".to_owned(), "Bing".to_owned()],
engine: smallvec!["Google".to_owned(), "Bing".to_owned()],
relevance_score: 0.0,
},
));
@ -335,7 +341,7 @@ mod tests {
title: "Rust Programming Language".to_owned(),
url: "https://www.rust-lang.org/".to_owned(),
description: "A systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.".to_owned(),
engine: vec!["Google".to_owned(), "DuckDuckGo".to_owned()],
engine: smallvec!["Google".to_owned(), "DuckDuckGo".to_owned()],
relevance_score:0.0
},
));
@ -392,7 +398,7 @@ mod tests {
url: "https://www.example.com".to_owned(),
description: "This domain is for use in illustrative examples in documents."
.to_owned(),
engine: vec!["Google".to_owned(), "Bing".to_owned()],
engine: smallvec!["Google".to_owned(), "Bing".to_owned()],
relevance_score: 0.0,
},
));

View File

@ -14,8 +14,7 @@ use crate::{
use actix_web::{get, http::header::ContentType, web, HttpRequest, HttpResponse};
use itertools::Itertools;
use regex::Regex;
use std::time::{SystemTime, UNIX_EPOCH};
use std::{borrow::Cow, time::Duration};
use std::borrow::Cow;
use tokio::{
fs::File,
io::{AsyncBufReadExt, BufReader},
@ -84,13 +83,6 @@ pub async fn search(
let previous_page = page.saturating_sub(1);
let next_page = page + 1;
// Add a random delay before making the request.
if config.aggregator.random_delay || !config.debug {
let nanos = SystemTime::now().duration_since(UNIX_EPOCH)?.subsec_nanos() as f32;
let delay = ((nanos / 1_0000_0000 as f32).floor() as u64) + 1;
tokio::time::sleep(Duration::from_secs(delay)).await;
}
let results: (SearchResults, String, bool);
if page != previous_page {
let (previous_results, current_results, next_results) = join!(
@ -104,7 +96,9 @@ pub async fn search(
let (results_list, cache_keys): (Vec<SearchResults>, Vec<String>) =
[previous_results?, results.clone(), next_results?]
.into_iter()
.filter_map(|(result, cache_key, flag)| flag.then_some((result, cache_key)))
.filter_map(|(result, cache_key, flag)| {
dbg!(flag).then_some((result, cache_key))
})
.multiunzip();
tokio::spawn(async move { cache.cache_results(&results_list, &cache_keys).await });
@ -146,7 +140,7 @@ pub async fn search(
/// # Arguments
///
/// * `url` - It takes the url of the current page that requested the search results for a
/// particular search query.
/// particular search query.
/// * `config` - It takes a parsed config struct.
/// * `query` - It takes the page number as u32 value.
/// * `req` - It takes the `HttpRequest` struct as a value.

View File

@ -12,7 +12,7 @@ const SAFE_SEARCH_LEVELS_NAME: [&str; 3] = ["None", "Low", "Moderate"];
/// # Arguments
///
/// * `engine_errors_info` - It takes the engine errors list containing errors for each upstream
/// search engine which failed to provide results as an argument.
/// search engine which failed to provide results as an argument.
/// * `safe_search_level` - It takes the safe search level with values from 0-2 as an argument.
/// * `query` - It takes the current search query provided by user as an argument.
///

View File

@ -9,7 +9,7 @@ use maud::{html, Markup};
/// # Arguments
///
/// * `engine_names` - It takes the key value pair list of all available engine names and their corresponding
/// selected (enabled/disabled) value as an argument.
/// selected (enabled/disabled) value as an argument.
///
/// # Returns
///

View File

@ -11,9 +11,9 @@ use std::fs::read_dir;
/// # Arguments
///
/// * `style_type` - It takes the style type of the values `theme` and `colorscheme` as an
/// argument.
/// argument.
/// * `selected_style` - It takes the currently selected style value provided via the config file
/// as an argument.
/// as an argument.
///
/// # Error
///