0
0
mirror of https://github.com/neon-mmd/websurfx.git synced 2024-11-22 14:08:23 -05:00

Compare commits

..

No commits in common. "b7d0ef7252288d2bc5cdd189f534af375f28b3f5" and "c6b93403b8964243ddbce5df2b657b25a909a1c0" have entirely different histories.

15 changed files with 663 additions and 739 deletions

1160
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +1,6 @@
[package]
name = "websurfx"
version = "1.17.22"
version = "1.17.20"
edition = "2021"
description = "An open-source alternative to Searx that provides clean, ad-free, and organic results with incredible speed while keeping privacy and security in mind."
repository = "https://github.com/neon-mmd/websurfx"
@ -27,7 +27,6 @@ tokio = { version = "1.32.0", features = [
], default-features = false }
serde = { version = "1.0.209", default-features = false, features = ["derive"] }
serde_json = { version = "1.0.122", default-features = false }
bincode = {version="1.3.3", default-features=false}
maud = { version = "0.26.0", default-features = false, features = [
"actix-web",
] }
@ -49,7 +48,6 @@ mlua = { version = "0.9.9", features = [
redis = { version = "0.25.4", features = [
"tokio-comp",
"connection-manager",
"tcp_nodelay"
], default-features = false, optional = true }
blake3 = { version = "1.5.4", default-features = false }
error-stack = { version = "0.4.0", default-features = false, features = [
@ -57,13 +55,17 @@ error-stack = { version = "0.4.0", default-features = false, features = [
] }
async-trait = { version = "0.1.80", default-features = false }
regex = { version = "1.9.4", features = ["perf"], default-features = false }
smallvec = { version = "1.13.1", features = [
"union",
"serde",
], default-features = false }
futures = { version = "0.3.30", default-features = false, features = ["alloc"] }
dhat = { version = "0.3.2", optional = true, default-features = false }
mimalloc = { version = "0.1.43", default-features = false }
async-once-cell = { version = "0.5.3", default-features = false }
actix-governor = { version = "0.5.0", default-features = false }
moka = { version = "0.12.8", optional = true, default-features = false, features = [
"future",
mini-moka = { version = "0.10", optional = true, default-features = false, features = [
"sync",
] }
async-compression = { version = "0.4.12", default-features = false, features = [
"brotli",
@ -80,8 +82,8 @@ base64 = { version = "0.21.5", default-features = false, features = [
cfg-if = { version = "1.0.0", default-features = false, optional = true }
keyword_extraction = { version = "1.4.3", default-features = false, features = [
"tf_idf",
"rayon",
] }
stop-words = { version = "0.8.0", default-features = false, features = ["iso"] }
thesaurus = { version = "0.5.2", default-features = false, optional = true, features = [
"moby",
@ -102,6 +104,8 @@ lightningcss = { version = "1.0.0-alpha.57", default-features = false, features
# Temporary fork with fix
minify-js = { git = "https://github.com/RuairidhWilliamson/minify-js", branch = "master", version = "0.6.0", default-features = false}
[profile.dev]
opt-level = 0
debug = true
@ -176,7 +180,7 @@ opt-level = "z"
use-synonyms-search = ["thesaurus/static"]
default = ["memory-cache"]
dhat-heap = ["dep:dhat"]
memory-cache = ["dep:moka"]
memory-cache = ["dep:mini-moka"]
redis-cache = ["dep:redis", "dep:base64"]
compress-cache-results = ["dep:async-compression", "dep:cfg-if"]
encrypt-cache-results = ["dep:chacha20poly1305", "dep:chacha20"]

View File

@ -34,11 +34,11 @@
},
"nixpkgs_2": {
"locked": {
"lastModified": 1725194671,
"narHash": "sha256-tLGCFEFTB5TaOKkpfw3iYT9dnk4awTP/q4w+ROpMfuw=",
"lastModified": 1695318763,
"narHash": "sha256-FHVPDRP2AfvsxAdc+AsgFJevMz5VBmnZglFUMlxBkcY=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "b833ff01a0d694b910daca6e2ff4a3f26dee478c",
"rev": "e12483116b3b51a185a33a272bf351e357ba9a99",
"type": "github"
},
"original": {

View File

@ -36,7 +36,7 @@
haskellPackages.hadolint
nodejs
nodePackages_latest.cspell
eslint
nodePackages_latest.eslint
nodePackages_latest.markdownlint-cli2
nodePackages_latest.stylelint
redis

37
src/cache/cacher.rs vendored
View File

@ -2,9 +2,10 @@
//! from the upstream search engines in a json format.
use error_stack::Report;
use futures::future::join_all;
#[cfg(feature = "memory-cache")]
use moka::future::Cache as MokaCache;
use mini_moka::sync::Cache as MokaCache;
#[cfg(feature = "memory-cache")]
use mini_moka::sync::ConcurrentCacheExt;
#[cfg(feature = "memory-cache")]
use std::time::Duration;
@ -375,13 +376,13 @@ impl Cacher for RedisCache {
}
}
/// TryInto implementation for SearchResults from Vec<u8>
use std::{convert::TryInto, sync::Arc};
use std::convert::TryInto;
impl TryInto<SearchResults> for Vec<u8> {
type Error = CacheError;
fn try_into(self) -> Result<SearchResults, Self::Error> {
bincode::deserialize_from(self.as_slice()).map_err(|_| CacheError::SerializationError)
serde_json::from_slice(&self).map_err(|_| CacheError::SerializationError)
}
}
@ -389,7 +390,7 @@ impl TryInto<Vec<u8>> for &SearchResults {
type Error = CacheError;
fn try_into(self) -> Result<Vec<u8>, Self::Error> {
bincode::serialize(self).map_err(|_| CacheError::SerializationError)
serde_json::to_vec(self).map_err(|_| CacheError::SerializationError)
}
}
@ -397,16 +398,7 @@ impl TryInto<Vec<u8>> for &SearchResults {
#[cfg(feature = "memory-cache")]
pub struct InMemoryCache {
/// The backend cache which stores data.
cache: Arc<MokaCache<String, Vec<u8>>>,
}
#[cfg(feature = "memory-cache")]
impl Clone for InMemoryCache {
fn clone(&self) -> Self {
Self {
cache: self.cache.clone(),
}
}
cache: MokaCache<String, Vec<u8>>,
}
#[cfg(feature = "memory-cache")]
@ -416,17 +408,15 @@ impl Cacher for InMemoryCache {
log::info!("Initialising in-memory cache");
InMemoryCache {
cache: Arc::new(
MokaCache::builder()
cache: MokaCache::builder()
.time_to_live(Duration::from_secs(config.cache_expiry_time.into()))
.build(),
),
}
}
async fn cached_results(&mut self, url: &str) -> Result<SearchResults, Report<CacheError>> {
let hashed_url_string = self.hash_url(url);
match self.cache.get(&hashed_url_string).await {
match self.cache.get(&hashed_url_string) {
Some(res) => self.post_process_search_results(res).await,
None => Err(Report::new(CacheError::MissingValue)),
}
@ -437,18 +427,13 @@ impl Cacher for InMemoryCache {
search_results: &[SearchResults],
urls: &[String],
) -> Result<(), Report<CacheError>> {
let mut tasks: Vec<_> = Vec::with_capacity(urls.len());
for (url, search_result) in urls.iter().zip(search_results.iter()) {
let hashed_url_string = self.hash_url(url);
let bytes = self.pre_process_search_results(search_result).await?;
let new_self = self.clone();
tasks.push(tokio::spawn(async move {
new_self.cache.insert(hashed_url_string, bytes).await
}));
self.cache.insert(hashed_url_string, bytes);
}
join_all(tasks).await;
self.cache.sync();
Ok(())
}
}

View File

@ -16,7 +16,7 @@ const REDIS_PIPELINE_SIZE: usize = 3;
/// connect to.
pub struct RedisCache {
/// It stores a pool of connections ready to be used.
connection_pool: Box<[ConnectionManager]>,
connection_pool: Vec<ConnectionManager>,
/// It stores the size of the connection pool (in other words the number of
/// connections that should be stored in the pool).
pool_size: u8,
@ -58,13 +58,13 @@ impl RedisCache {
}));
}
let mut outputs = Vec::with_capacity(tasks.len());
let mut outputs = Vec::new();
for task in tasks {
outputs.push(task.await??);
}
let redis_cache = RedisCache {
connection_pool: outputs.into_boxed_slice(),
connection_pool: outputs,
pool_size,
current_connection: Default::default(),
cache_ttl,

View File

@ -48,8 +48,6 @@ pub struct Config {
pub tcp_connection_keep_alive: u8,
/// It stores the pool idle connection timeout in seconds.
pub pool_idle_connection_timeout: u8,
/// It stores the number of https connections to keep in the pool.
pub number_of_https_connections: u8,
}
impl Config {
@ -141,7 +139,6 @@ impl Config {
request_timeout: globals.get::<_, u8>("request_timeout")?,
tcp_connection_keep_alive: globals.get::<_, u8>("tcp_connection_keep_alive")?,
pool_idle_connection_timeout: globals.get::<_, u8>("pool_idle_connection_timeout")?,
number_of_https_connections: globals.get::<_, u8>("number_of_https_connections")?,
threads,
client_connection_keep_alive: globals.get::<_, u8>("client_connection_keep_alive")?,
rate_limiter: RateLimiter {

View File

@ -3,6 +3,7 @@
use super::engine_models::EngineError;
use serde::{Deserialize, Serialize};
use smallvec::SmallVec;
#[cfg(any(
feature = "use-synonyms-search",
feature = "use-non-static-synonyms-search"
@ -11,9 +12,7 @@ use thesaurus::synonyms;
/// A named struct to store the raw scraped search results scraped search results from the
/// upstream search engines before aggregating it.It derives the Clone trait which is needed
/// to write idiomatic rust using `Iterators`.
///
/// (href url in html in simple words).
///
#[derive(Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct SearchResult {
@ -24,7 +23,7 @@ pub struct SearchResult {
/// The description of the search result.
pub description: String,
/// The names of the upstream engines from which this results were provided.
pub engine: Vec<String>,
pub engine: SmallVec<[String; 0]>,
/// The td-tdf score of the result in regards to the title, url and description and the user's query
pub relevance_score: f32,
}
@ -154,10 +153,10 @@ impl EngineErrorInfo {
#[serde(rename_all = "camelCase")]
pub struct SearchResults {
/// Stores the individual serializable `SearchResult` struct into a vector of
pub results: Box<[SearchResult]>,
pub results: Vec<SearchResult>,
/// Stores the information on which engines failed with their engine name
/// and the type of error that caused it.
pub engine_errors_info: Box<[EngineErrorInfo]>,
pub engine_errors_info: Vec<EngineErrorInfo>,
/// Stores the flag option which holds the check value that the following
/// search query was disallowed when the safe search level set to 4 and it
/// was present in the `Blocklist` file.
@ -184,10 +183,10 @@ impl SearchResults {
/// the search url.
/// * `engine_errors_info` - Takes an array of structs which contains information regarding
/// which engines failed with their names, reason and their severity color name.
pub fn new(results: Box<[SearchResult]>, engine_errors_info: Box<[EngineErrorInfo]>) -> Self {
pub fn new(results: Vec<SearchResult>, engine_errors_info: &[EngineErrorInfo]) -> Self {
Self {
results,
engine_errors_info,
engine_errors_info: engine_errors_info.to_owned(),
disallowed: Default::default(),
filtered: Default::default(),
safe_search_level: Default::default(),
@ -206,11 +205,11 @@ impl SearchResults {
}
/// A getter function that gets the value of `engine_errors_info`.
pub fn engine_errors_info(&mut self) -> Box<[EngineErrorInfo]> {
pub fn engine_errors_info(&mut self) -> Vec<EngineErrorInfo> {
std::mem::take(&mut self.engine_errors_info)
}
/// A getter function that gets the value of `results`.
pub fn results(&mut self) -> Box<[SearchResult]> {
pub fn results(&mut self) -> Vec<SearchResult> {
self.results.clone()
}
@ -255,50 +254,27 @@ fn calculate_tf_idf(
let tf_idf = TfIdf::new(params);
let tokener = Tokenizer::new(query, stop_words, Some(punctuation));
let query_tokens = tokener.split_into_words();
let mut search_tokens = vec![];
#[cfg(any(
feature = "use-synonyms-search",
feature = "use-non-static-synonyms-search"
))]
let mut extra_tokens = vec![];
let total_score: f32 = query_tokens
.iter()
.map(|token| {
for token in query_tokens {
#[cfg(any(
feature = "use-synonyms-search",
feature = "use-non-static-synonyms-search"
))]
{
// find some synonyms and add them to the search (from wordnet or moby if feature is enabled)
extra_tokens.extend(synonyms(token))
let synonyms = synonyms(&token);
search_tokens.extend(synonyms)
}
search_tokens.push(token);
}
tf_idf.get_score(token)
})
.sum();
let mut total_score = 0.0f32;
for token in search_tokens.iter() {
total_score += tf_idf.get_score(token);
}
#[cfg(not(any(
feature = "use-synonyms-search",
feature = "use-non-static-synonyms-search"
)))]
let result = total_score / (query_tokens.len() as f32);
#[cfg(any(
feature = "use-synonyms-search",
feature = "use-non-static-synonyms-search"
))]
let extra_total_score: f32 = extra_tokens
.iter()
.map(|token| tf_idf.get_score(token))
.sum();
#[cfg(any(
feature = "use-synonyms-search",
feature = "use-non-static-synonyms-search"
))]
let result =
(extra_total_score + total_score) / ((query_tokens.len() + extra_tokens.len()) as f32);
let result = total_score / (search_tokens.len() as f32);
f32::from(!result.is_nan()) * result
}

View File

@ -11,7 +11,7 @@ use super::parser_models::Style;
pub struct SearchParams {
/// It stores the search parameter option `q` (or query in simple words)
/// of the search url.
pub q: Option<Cow<'static, str>>,
pub q: Option<String>,
/// It stores the search parameter `page` (or pageno in simple words)
/// of the search url.
pub page: Option<u32>,
@ -29,7 +29,7 @@ pub struct Cookie<'a> {
/// It stores the colorscheme name used for the website theme.
pub colorscheme: Cow<'a, str>,
/// It stores the user selected upstream search engines selected from the UI.
pub engines: Cow<'a, [Cow<'a, str>]>,
pub engines: Cow<'a, Vec<Cow<'a, str>>>,
/// It stores the user selected safe search level from the UI.
pub safe_search_level: u8,
}

View File

@ -14,6 +14,7 @@ use futures::stream::FuturesUnordered;
use regex::Regex;
use reqwest::{Client, ClientBuilder};
use std::sync::Arc;
use std::time::{SystemTime, UNIX_EPOCH};
use tokio::{
fs::File,
io::{AsyncBufReadExt, BufReader},
@ -81,7 +82,6 @@ pub async fn aggregate(
config.pool_idle_connection_timeout as u64,
))
.tcp_keepalive(Duration::from_secs(config.tcp_connection_keep_alive as u64))
.pool_max_idle_per_host(config.number_of_https_connections as usize)
.connect_timeout(Duration::from_secs(config.request_timeout as u64)) // Add timeout to request to avoid DDOSing the server
.https_only(true)
.gzip(true)
@ -93,6 +93,13 @@ pub async fn aggregate(
let user_agent: &str = random_user_agent();
// Add a random delay before making the request.
if config.aggregator.random_delay || !config.debug {
let nanos = SystemTime::now().duration_since(UNIX_EPOCH)?.subsec_nanos() as f32;
let delay = ((nanos / 1_0000_0000 as f32).floor() as u64) + 1;
tokio::time::sleep(Duration::from_secs(delay)).await;
}
let mut names: Vec<&str> = Vec::with_capacity(0);
// create tasks for upstream result fetching
@ -181,21 +188,19 @@ pub async fn aggregate(
drop(blacklist_map);
}
let mut results: Box<[SearchResult]> = result_map
.into_iter()
.map(|(_, mut value)| {
if !value.url.contains("temu.com") {
value.calculate_relevance(query.as_str())
let mut results: Vec<SearchResult> = result_map
.iter()
.map(|(_, value)| {
let mut copy = value.clone();
if !copy.url.contains("temu.com") {
copy.calculate_relevance(query.as_str())
}
value
copy
})
.collect();
sort_search_results(&mut results);
Ok(SearchResults::new(
results,
engine_errors_info.into_boxed_slice(),
))
Ok(SearchResults::new(results, &engine_errors_info))
}
/// Filters a map of search results using a list of regex patterns.
@ -260,6 +265,7 @@ fn sort_search_results(results: &mut [SearchResult]) {
#[cfg(test)]
mod tests {
use super::*;
use smallvec::smallvec;
use std::io::Write;
use tempfile::NamedTempFile;
@ -275,7 +281,7 @@ mod tests {
description: "This domain is for use in illustrative examples in documents."
.to_owned(),
relevance_score: 0.0,
engine: vec!["Google".to_owned(), "Bing".to_owned()],
engine: smallvec!["Google".to_owned(), "Bing".to_owned()],
},
));
map_to_be_filtered.push((
@ -284,7 +290,7 @@ mod tests {
title: "Rust Programming Language".to_owned(),
url: "https://www.rust-lang.org/".to_owned(),
description: "A systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.".to_owned(),
engine: vec!["Google".to_owned(), "DuckDuckGo".to_owned()],
engine: smallvec!["Google".to_owned(), "DuckDuckGo".to_owned()],
relevance_score:0.0
},)
);
@ -325,7 +331,7 @@ mod tests {
url: "https://www.example.com".to_owned(),
description: "This domain is for use in illustrative examples in documents."
.to_owned(),
engine: vec!["Google".to_owned(), "Bing".to_owned()],
engine: smallvec!["Google".to_owned(), "Bing".to_owned()],
relevance_score: 0.0,
},
));
@ -335,7 +341,7 @@ mod tests {
title: "Rust Programming Language".to_owned(),
url: "https://www.rust-lang.org/".to_owned(),
description: "A systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.".to_owned(),
engine: vec!["Google".to_owned(), "DuckDuckGo".to_owned()],
engine: smallvec!["Google".to_owned(), "DuckDuckGo".to_owned()],
relevance_score:0.0
},
));
@ -392,7 +398,7 @@ mod tests {
url: "https://www.example.com".to_owned(),
description: "This domain is for use in illustrative examples in documents."
.to_owned(),
engine: vec!["Google".to_owned(), "Bing".to_owned()],
engine: smallvec!["Google".to_owned(), "Bing".to_owned()],
relevance_score: 0.0,
},
));

View File

@ -14,8 +14,7 @@ use crate::{
use actix_web::{get, http::header::ContentType, web, HttpRequest, HttpResponse};
use itertools::Itertools;
use regex::Regex;
use std::time::{SystemTime, UNIX_EPOCH};
use std::{borrow::Cow, time::Duration};
use std::borrow::Cow;
use tokio::{
fs::File,
io::{AsyncBufReadExt, BufReader},
@ -84,13 +83,6 @@ pub async fn search(
let previous_page = page.saturating_sub(1);
let next_page = page + 1;
// Add a random delay before making the request.
if config.aggregator.random_delay || !config.debug {
let nanos = SystemTime::now().duration_since(UNIX_EPOCH)?.subsec_nanos() as f32;
let delay = ((nanos / 1_0000_0000 as f32).floor() as u64) + 1;
tokio::time::sleep(Duration::from_secs(delay)).await;
}
let results: (SearchResults, String, bool);
if page != previous_page {
let (previous_results, current_results, next_results) = join!(
@ -104,7 +96,9 @@ pub async fn search(
let (results_list, cache_keys): (Vec<SearchResults>, Vec<String>) =
[previous_results?, results.clone(), next_results?]
.into_iter()
.filter_map(|(result, cache_key, flag)| flag.then_some((result, cache_key)))
.filter_map(|(result, cache_key, flag)| {
dbg!(flag).then_some((result, cache_key))
})
.multiunzip();
tokio::spawn(async move { cache.cache_results(&results_list, &cache_keys).await });