mirror of
https://github.com/neon-mmd/websurfx.git
synced 2024-11-22 14:08:23 -05:00
Compare commits
No commits in common. "b7d0ef7252288d2bc5cdd189f534af375f28b3f5" and "c6b93403b8964243ddbce5df2b657b25a909a1c0" have entirely different histories.
b7d0ef7252
...
c6b93403b8
1160
Cargo.lock
generated
1160
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
18
Cargo.toml
18
Cargo.toml
@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "websurfx"
|
name = "websurfx"
|
||||||
version = "1.17.22"
|
version = "1.17.20"
|
||||||
edition = "2021"
|
edition = "2021"
|
||||||
description = "An open-source alternative to Searx that provides clean, ad-free, and organic results with incredible speed while keeping privacy and security in mind."
|
description = "An open-source alternative to Searx that provides clean, ad-free, and organic results with incredible speed while keeping privacy and security in mind."
|
||||||
repository = "https://github.com/neon-mmd/websurfx"
|
repository = "https://github.com/neon-mmd/websurfx"
|
||||||
@ -27,7 +27,6 @@ tokio = { version = "1.32.0", features = [
|
|||||||
], default-features = false }
|
], default-features = false }
|
||||||
serde = { version = "1.0.209", default-features = false, features = ["derive"] }
|
serde = { version = "1.0.209", default-features = false, features = ["derive"] }
|
||||||
serde_json = { version = "1.0.122", default-features = false }
|
serde_json = { version = "1.0.122", default-features = false }
|
||||||
bincode = {version="1.3.3", default-features=false}
|
|
||||||
maud = { version = "0.26.0", default-features = false, features = [
|
maud = { version = "0.26.0", default-features = false, features = [
|
||||||
"actix-web",
|
"actix-web",
|
||||||
] }
|
] }
|
||||||
@ -49,7 +48,6 @@ mlua = { version = "0.9.9", features = [
|
|||||||
redis = { version = "0.25.4", features = [
|
redis = { version = "0.25.4", features = [
|
||||||
"tokio-comp",
|
"tokio-comp",
|
||||||
"connection-manager",
|
"connection-manager",
|
||||||
"tcp_nodelay"
|
|
||||||
], default-features = false, optional = true }
|
], default-features = false, optional = true }
|
||||||
blake3 = { version = "1.5.4", default-features = false }
|
blake3 = { version = "1.5.4", default-features = false }
|
||||||
error-stack = { version = "0.4.0", default-features = false, features = [
|
error-stack = { version = "0.4.0", default-features = false, features = [
|
||||||
@ -57,13 +55,17 @@ error-stack = { version = "0.4.0", default-features = false, features = [
|
|||||||
] }
|
] }
|
||||||
async-trait = { version = "0.1.80", default-features = false }
|
async-trait = { version = "0.1.80", default-features = false }
|
||||||
regex = { version = "1.9.4", features = ["perf"], default-features = false }
|
regex = { version = "1.9.4", features = ["perf"], default-features = false }
|
||||||
|
smallvec = { version = "1.13.1", features = [
|
||||||
|
"union",
|
||||||
|
"serde",
|
||||||
|
], default-features = false }
|
||||||
futures = { version = "0.3.30", default-features = false, features = ["alloc"] }
|
futures = { version = "0.3.30", default-features = false, features = ["alloc"] }
|
||||||
dhat = { version = "0.3.2", optional = true, default-features = false }
|
dhat = { version = "0.3.2", optional = true, default-features = false }
|
||||||
mimalloc = { version = "0.1.43", default-features = false }
|
mimalloc = { version = "0.1.43", default-features = false }
|
||||||
async-once-cell = { version = "0.5.3", default-features = false }
|
async-once-cell = { version = "0.5.3", default-features = false }
|
||||||
actix-governor = { version = "0.5.0", default-features = false }
|
actix-governor = { version = "0.5.0", default-features = false }
|
||||||
moka = { version = "0.12.8", optional = true, default-features = false, features = [
|
mini-moka = { version = "0.10", optional = true, default-features = false, features = [
|
||||||
"future",
|
"sync",
|
||||||
] }
|
] }
|
||||||
async-compression = { version = "0.4.12", default-features = false, features = [
|
async-compression = { version = "0.4.12", default-features = false, features = [
|
||||||
"brotli",
|
"brotli",
|
||||||
@ -80,8 +82,8 @@ base64 = { version = "0.21.5", default-features = false, features = [
|
|||||||
cfg-if = { version = "1.0.0", default-features = false, optional = true }
|
cfg-if = { version = "1.0.0", default-features = false, optional = true }
|
||||||
keyword_extraction = { version = "1.4.3", default-features = false, features = [
|
keyword_extraction = { version = "1.4.3", default-features = false, features = [
|
||||||
"tf_idf",
|
"tf_idf",
|
||||||
"rayon",
|
|
||||||
] }
|
] }
|
||||||
|
|
||||||
stop-words = { version = "0.8.0", default-features = false, features = ["iso"] }
|
stop-words = { version = "0.8.0", default-features = false, features = ["iso"] }
|
||||||
thesaurus = { version = "0.5.2", default-features = false, optional = true, features = [
|
thesaurus = { version = "0.5.2", default-features = false, optional = true, features = [
|
||||||
"moby",
|
"moby",
|
||||||
@ -102,6 +104,8 @@ lightningcss = { version = "1.0.0-alpha.57", default-features = false, features
|
|||||||
# Temporary fork with fix
|
# Temporary fork with fix
|
||||||
minify-js = { git = "https://github.com/RuairidhWilliamson/minify-js", branch = "master", version = "0.6.0", default-features = false}
|
minify-js = { git = "https://github.com/RuairidhWilliamson/minify-js", branch = "master", version = "0.6.0", default-features = false}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
[profile.dev]
|
[profile.dev]
|
||||||
opt-level = 0
|
opt-level = 0
|
||||||
debug = true
|
debug = true
|
||||||
@ -176,7 +180,7 @@ opt-level = "z"
|
|||||||
use-synonyms-search = ["thesaurus/static"]
|
use-synonyms-search = ["thesaurus/static"]
|
||||||
default = ["memory-cache"]
|
default = ["memory-cache"]
|
||||||
dhat-heap = ["dep:dhat"]
|
dhat-heap = ["dep:dhat"]
|
||||||
memory-cache = ["dep:moka"]
|
memory-cache = ["dep:mini-moka"]
|
||||||
redis-cache = ["dep:redis", "dep:base64"]
|
redis-cache = ["dep:redis", "dep:base64"]
|
||||||
compress-cache-results = ["dep:async-compression", "dep:cfg-if"]
|
compress-cache-results = ["dep:async-compression", "dep:cfg-if"]
|
||||||
encrypt-cache-results = ["dep:chacha20poly1305", "dep:chacha20"]
|
encrypt-cache-results = ["dep:chacha20poly1305", "dep:chacha20"]
|
||||||
|
@ -34,11 +34,11 @@
|
|||||||
},
|
},
|
||||||
"nixpkgs_2": {
|
"nixpkgs_2": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1725194671,
|
"lastModified": 1695318763,
|
||||||
"narHash": "sha256-tLGCFEFTB5TaOKkpfw3iYT9dnk4awTP/q4w+ROpMfuw=",
|
"narHash": "sha256-FHVPDRP2AfvsxAdc+AsgFJevMz5VBmnZglFUMlxBkcY=",
|
||||||
"owner": "NixOS",
|
"owner": "NixOS",
|
||||||
"repo": "nixpkgs",
|
"repo": "nixpkgs",
|
||||||
"rev": "b833ff01a0d694b910daca6e2ff4a3f26dee478c",
|
"rev": "e12483116b3b51a185a33a272bf351e357ba9a99",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
@ -36,7 +36,7 @@
|
|||||||
haskellPackages.hadolint
|
haskellPackages.hadolint
|
||||||
nodejs
|
nodejs
|
||||||
nodePackages_latest.cspell
|
nodePackages_latest.cspell
|
||||||
eslint
|
nodePackages_latest.eslint
|
||||||
nodePackages_latest.markdownlint-cli2
|
nodePackages_latest.markdownlint-cli2
|
||||||
nodePackages_latest.stylelint
|
nodePackages_latest.stylelint
|
||||||
redis
|
redis
|
||||||
|
37
src/cache/cacher.rs
vendored
37
src/cache/cacher.rs
vendored
@ -2,9 +2,10 @@
|
|||||||
//! from the upstream search engines in a json format.
|
//! from the upstream search engines in a json format.
|
||||||
|
|
||||||
use error_stack::Report;
|
use error_stack::Report;
|
||||||
use futures::future::join_all;
|
|
||||||
#[cfg(feature = "memory-cache")]
|
#[cfg(feature = "memory-cache")]
|
||||||
use moka::future::Cache as MokaCache;
|
use mini_moka::sync::Cache as MokaCache;
|
||||||
|
#[cfg(feature = "memory-cache")]
|
||||||
|
use mini_moka::sync::ConcurrentCacheExt;
|
||||||
|
|
||||||
#[cfg(feature = "memory-cache")]
|
#[cfg(feature = "memory-cache")]
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
@ -375,13 +376,13 @@ impl Cacher for RedisCache {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
/// TryInto implementation for SearchResults from Vec<u8>
|
/// TryInto implementation for SearchResults from Vec<u8>
|
||||||
use std::{convert::TryInto, sync::Arc};
|
use std::convert::TryInto;
|
||||||
|
|
||||||
impl TryInto<SearchResults> for Vec<u8> {
|
impl TryInto<SearchResults> for Vec<u8> {
|
||||||
type Error = CacheError;
|
type Error = CacheError;
|
||||||
|
|
||||||
fn try_into(self) -> Result<SearchResults, Self::Error> {
|
fn try_into(self) -> Result<SearchResults, Self::Error> {
|
||||||
bincode::deserialize_from(self.as_slice()).map_err(|_| CacheError::SerializationError)
|
serde_json::from_slice(&self).map_err(|_| CacheError::SerializationError)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -389,7 +390,7 @@ impl TryInto<Vec<u8>> for &SearchResults {
|
|||||||
type Error = CacheError;
|
type Error = CacheError;
|
||||||
|
|
||||||
fn try_into(self) -> Result<Vec<u8>, Self::Error> {
|
fn try_into(self) -> Result<Vec<u8>, Self::Error> {
|
||||||
bincode::serialize(self).map_err(|_| CacheError::SerializationError)
|
serde_json::to_vec(self).map_err(|_| CacheError::SerializationError)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -397,16 +398,7 @@ impl TryInto<Vec<u8>> for &SearchResults {
|
|||||||
#[cfg(feature = "memory-cache")]
|
#[cfg(feature = "memory-cache")]
|
||||||
pub struct InMemoryCache {
|
pub struct InMemoryCache {
|
||||||
/// The backend cache which stores data.
|
/// The backend cache which stores data.
|
||||||
cache: Arc<MokaCache<String, Vec<u8>>>,
|
cache: MokaCache<String, Vec<u8>>,
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(feature = "memory-cache")]
|
|
||||||
impl Clone for InMemoryCache {
|
|
||||||
fn clone(&self) -> Self {
|
|
||||||
Self {
|
|
||||||
cache: self.cache.clone(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(feature = "memory-cache")]
|
#[cfg(feature = "memory-cache")]
|
||||||
@ -416,17 +408,15 @@ impl Cacher for InMemoryCache {
|
|||||||
log::info!("Initialising in-memory cache");
|
log::info!("Initialising in-memory cache");
|
||||||
|
|
||||||
InMemoryCache {
|
InMemoryCache {
|
||||||
cache: Arc::new(
|
cache: MokaCache::builder()
|
||||||
MokaCache::builder()
|
|
||||||
.time_to_live(Duration::from_secs(config.cache_expiry_time.into()))
|
.time_to_live(Duration::from_secs(config.cache_expiry_time.into()))
|
||||||
.build(),
|
.build(),
|
||||||
),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn cached_results(&mut self, url: &str) -> Result<SearchResults, Report<CacheError>> {
|
async fn cached_results(&mut self, url: &str) -> Result<SearchResults, Report<CacheError>> {
|
||||||
let hashed_url_string = self.hash_url(url);
|
let hashed_url_string = self.hash_url(url);
|
||||||
match self.cache.get(&hashed_url_string).await {
|
match self.cache.get(&hashed_url_string) {
|
||||||
Some(res) => self.post_process_search_results(res).await,
|
Some(res) => self.post_process_search_results(res).await,
|
||||||
None => Err(Report::new(CacheError::MissingValue)),
|
None => Err(Report::new(CacheError::MissingValue)),
|
||||||
}
|
}
|
||||||
@ -437,18 +427,13 @@ impl Cacher for InMemoryCache {
|
|||||||
search_results: &[SearchResults],
|
search_results: &[SearchResults],
|
||||||
urls: &[String],
|
urls: &[String],
|
||||||
) -> Result<(), Report<CacheError>> {
|
) -> Result<(), Report<CacheError>> {
|
||||||
let mut tasks: Vec<_> = Vec::with_capacity(urls.len());
|
|
||||||
for (url, search_result) in urls.iter().zip(search_results.iter()) {
|
for (url, search_result) in urls.iter().zip(search_results.iter()) {
|
||||||
let hashed_url_string = self.hash_url(url);
|
let hashed_url_string = self.hash_url(url);
|
||||||
let bytes = self.pre_process_search_results(search_result).await?;
|
let bytes = self.pre_process_search_results(search_result).await?;
|
||||||
let new_self = self.clone();
|
self.cache.insert(hashed_url_string, bytes);
|
||||||
tasks.push(tokio::spawn(async move {
|
|
||||||
new_self.cache.insert(hashed_url_string, bytes).await
|
|
||||||
}));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
join_all(tasks).await;
|
self.cache.sync();
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
6
src/cache/redis_cacher.rs
vendored
6
src/cache/redis_cacher.rs
vendored
@ -16,7 +16,7 @@ const REDIS_PIPELINE_SIZE: usize = 3;
|
|||||||
/// connect to.
|
/// connect to.
|
||||||
pub struct RedisCache {
|
pub struct RedisCache {
|
||||||
/// It stores a pool of connections ready to be used.
|
/// It stores a pool of connections ready to be used.
|
||||||
connection_pool: Box<[ConnectionManager]>,
|
connection_pool: Vec<ConnectionManager>,
|
||||||
/// It stores the size of the connection pool (in other words the number of
|
/// It stores the size of the connection pool (in other words the number of
|
||||||
/// connections that should be stored in the pool).
|
/// connections that should be stored in the pool).
|
||||||
pool_size: u8,
|
pool_size: u8,
|
||||||
@ -58,13 +58,13 @@ impl RedisCache {
|
|||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut outputs = Vec::with_capacity(tasks.len());
|
let mut outputs = Vec::new();
|
||||||
for task in tasks {
|
for task in tasks {
|
||||||
outputs.push(task.await??);
|
outputs.push(task.await??);
|
||||||
}
|
}
|
||||||
|
|
||||||
let redis_cache = RedisCache {
|
let redis_cache = RedisCache {
|
||||||
connection_pool: outputs.into_boxed_slice(),
|
connection_pool: outputs,
|
||||||
pool_size,
|
pool_size,
|
||||||
current_connection: Default::default(),
|
current_connection: Default::default(),
|
||||||
cache_ttl,
|
cache_ttl,
|
||||||
|
@ -48,8 +48,6 @@ pub struct Config {
|
|||||||
pub tcp_connection_keep_alive: u8,
|
pub tcp_connection_keep_alive: u8,
|
||||||
/// It stores the pool idle connection timeout in seconds.
|
/// It stores the pool idle connection timeout in seconds.
|
||||||
pub pool_idle_connection_timeout: u8,
|
pub pool_idle_connection_timeout: u8,
|
||||||
/// It stores the number of https connections to keep in the pool.
|
|
||||||
pub number_of_https_connections: u8,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Config {
|
impl Config {
|
||||||
@ -141,7 +139,6 @@ impl Config {
|
|||||||
request_timeout: globals.get::<_, u8>("request_timeout")?,
|
request_timeout: globals.get::<_, u8>("request_timeout")?,
|
||||||
tcp_connection_keep_alive: globals.get::<_, u8>("tcp_connection_keep_alive")?,
|
tcp_connection_keep_alive: globals.get::<_, u8>("tcp_connection_keep_alive")?,
|
||||||
pool_idle_connection_timeout: globals.get::<_, u8>("pool_idle_connection_timeout")?,
|
pool_idle_connection_timeout: globals.get::<_, u8>("pool_idle_connection_timeout")?,
|
||||||
number_of_https_connections: globals.get::<_, u8>("number_of_https_connections")?,
|
|
||||||
threads,
|
threads,
|
||||||
client_connection_keep_alive: globals.get::<_, u8>("client_connection_keep_alive")?,
|
client_connection_keep_alive: globals.get::<_, u8>("client_connection_keep_alive")?,
|
||||||
rate_limiter: RateLimiter {
|
rate_limiter: RateLimiter {
|
||||||
|
@ -3,6 +3,7 @@
|
|||||||
|
|
||||||
use super::engine_models::EngineError;
|
use super::engine_models::EngineError;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
use smallvec::SmallVec;
|
||||||
#[cfg(any(
|
#[cfg(any(
|
||||||
feature = "use-synonyms-search",
|
feature = "use-synonyms-search",
|
||||||
feature = "use-non-static-synonyms-search"
|
feature = "use-non-static-synonyms-search"
|
||||||
@ -11,9 +12,7 @@ use thesaurus::synonyms;
|
|||||||
/// A named struct to store the raw scraped search results scraped search results from the
|
/// A named struct to store the raw scraped search results scraped search results from the
|
||||||
/// upstream search engines before aggregating it.It derives the Clone trait which is needed
|
/// upstream search engines before aggregating it.It derives the Clone trait which is needed
|
||||||
/// to write idiomatic rust using `Iterators`.
|
/// to write idiomatic rust using `Iterators`.
|
||||||
///
|
|
||||||
/// (href url in html in simple words).
|
/// (href url in html in simple words).
|
||||||
///
|
|
||||||
#[derive(Clone, Serialize, Deserialize)]
|
#[derive(Clone, Serialize, Deserialize)]
|
||||||
#[serde(rename_all = "camelCase")]
|
#[serde(rename_all = "camelCase")]
|
||||||
pub struct SearchResult {
|
pub struct SearchResult {
|
||||||
@ -24,7 +23,7 @@ pub struct SearchResult {
|
|||||||
/// The description of the search result.
|
/// The description of the search result.
|
||||||
pub description: String,
|
pub description: String,
|
||||||
/// The names of the upstream engines from which this results were provided.
|
/// The names of the upstream engines from which this results were provided.
|
||||||
pub engine: Vec<String>,
|
pub engine: SmallVec<[String; 0]>,
|
||||||
/// The td-tdf score of the result in regards to the title, url and description and the user's query
|
/// The td-tdf score of the result in regards to the title, url and description and the user's query
|
||||||
pub relevance_score: f32,
|
pub relevance_score: f32,
|
||||||
}
|
}
|
||||||
@ -154,10 +153,10 @@ impl EngineErrorInfo {
|
|||||||
#[serde(rename_all = "camelCase")]
|
#[serde(rename_all = "camelCase")]
|
||||||
pub struct SearchResults {
|
pub struct SearchResults {
|
||||||
/// Stores the individual serializable `SearchResult` struct into a vector of
|
/// Stores the individual serializable `SearchResult` struct into a vector of
|
||||||
pub results: Box<[SearchResult]>,
|
pub results: Vec<SearchResult>,
|
||||||
/// Stores the information on which engines failed with their engine name
|
/// Stores the information on which engines failed with their engine name
|
||||||
/// and the type of error that caused it.
|
/// and the type of error that caused it.
|
||||||
pub engine_errors_info: Box<[EngineErrorInfo]>,
|
pub engine_errors_info: Vec<EngineErrorInfo>,
|
||||||
/// Stores the flag option which holds the check value that the following
|
/// Stores the flag option which holds the check value that the following
|
||||||
/// search query was disallowed when the safe search level set to 4 and it
|
/// search query was disallowed when the safe search level set to 4 and it
|
||||||
/// was present in the `Blocklist` file.
|
/// was present in the `Blocklist` file.
|
||||||
@ -184,10 +183,10 @@ impl SearchResults {
|
|||||||
/// the search url.
|
/// the search url.
|
||||||
/// * `engine_errors_info` - Takes an array of structs which contains information regarding
|
/// * `engine_errors_info` - Takes an array of structs which contains information regarding
|
||||||
/// which engines failed with their names, reason and their severity color name.
|
/// which engines failed with their names, reason and their severity color name.
|
||||||
pub fn new(results: Box<[SearchResult]>, engine_errors_info: Box<[EngineErrorInfo]>) -> Self {
|
pub fn new(results: Vec<SearchResult>, engine_errors_info: &[EngineErrorInfo]) -> Self {
|
||||||
Self {
|
Self {
|
||||||
results,
|
results,
|
||||||
engine_errors_info,
|
engine_errors_info: engine_errors_info.to_owned(),
|
||||||
disallowed: Default::default(),
|
disallowed: Default::default(),
|
||||||
filtered: Default::default(),
|
filtered: Default::default(),
|
||||||
safe_search_level: Default::default(),
|
safe_search_level: Default::default(),
|
||||||
@ -206,11 +205,11 @@ impl SearchResults {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// A getter function that gets the value of `engine_errors_info`.
|
/// A getter function that gets the value of `engine_errors_info`.
|
||||||
pub fn engine_errors_info(&mut self) -> Box<[EngineErrorInfo]> {
|
pub fn engine_errors_info(&mut self) -> Vec<EngineErrorInfo> {
|
||||||
std::mem::take(&mut self.engine_errors_info)
|
std::mem::take(&mut self.engine_errors_info)
|
||||||
}
|
}
|
||||||
/// A getter function that gets the value of `results`.
|
/// A getter function that gets the value of `results`.
|
||||||
pub fn results(&mut self) -> Box<[SearchResult]> {
|
pub fn results(&mut self) -> Vec<SearchResult> {
|
||||||
self.results.clone()
|
self.results.clone()
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -255,50 +254,27 @@ fn calculate_tf_idf(
|
|||||||
let tf_idf = TfIdf::new(params);
|
let tf_idf = TfIdf::new(params);
|
||||||
let tokener = Tokenizer::new(query, stop_words, Some(punctuation));
|
let tokener = Tokenizer::new(query, stop_words, Some(punctuation));
|
||||||
let query_tokens = tokener.split_into_words();
|
let query_tokens = tokener.split_into_words();
|
||||||
|
let mut search_tokens = vec![];
|
||||||
|
|
||||||
#[cfg(any(
|
for token in query_tokens {
|
||||||
feature = "use-synonyms-search",
|
|
||||||
feature = "use-non-static-synonyms-search"
|
|
||||||
))]
|
|
||||||
let mut extra_tokens = vec![];
|
|
||||||
|
|
||||||
let total_score: f32 = query_tokens
|
|
||||||
.iter()
|
|
||||||
.map(|token| {
|
|
||||||
#[cfg(any(
|
#[cfg(any(
|
||||||
feature = "use-synonyms-search",
|
feature = "use-synonyms-search",
|
||||||
feature = "use-non-static-synonyms-search"
|
feature = "use-non-static-synonyms-search"
|
||||||
))]
|
))]
|
||||||
{
|
{
|
||||||
// find some synonyms and add them to the search (from wordnet or moby if feature is enabled)
|
// find some synonyms and add them to the search (from wordnet or moby if feature is enabled)
|
||||||
extra_tokens.extend(synonyms(token))
|
let synonyms = synonyms(&token);
|
||||||
|
search_tokens.extend(synonyms)
|
||||||
|
}
|
||||||
|
search_tokens.push(token);
|
||||||
}
|
}
|
||||||
|
|
||||||
tf_idf.get_score(token)
|
let mut total_score = 0.0f32;
|
||||||
})
|
for token in search_tokens.iter() {
|
||||||
.sum();
|
total_score += tf_idf.get_score(token);
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(not(any(
|
let result = total_score / (search_tokens.len() as f32);
|
||||||
feature = "use-synonyms-search",
|
|
||||||
feature = "use-non-static-synonyms-search"
|
|
||||||
)))]
|
|
||||||
let result = total_score / (query_tokens.len() as f32);
|
|
||||||
|
|
||||||
#[cfg(any(
|
|
||||||
feature = "use-synonyms-search",
|
|
||||||
feature = "use-non-static-synonyms-search"
|
|
||||||
))]
|
|
||||||
let extra_total_score: f32 = extra_tokens
|
|
||||||
.iter()
|
|
||||||
.map(|token| tf_idf.get_score(token))
|
|
||||||
.sum();
|
|
||||||
|
|
||||||
#[cfg(any(
|
|
||||||
feature = "use-synonyms-search",
|
|
||||||
feature = "use-non-static-synonyms-search"
|
|
||||||
))]
|
|
||||||
let result =
|
|
||||||
(extra_total_score + total_score) / ((query_tokens.len() + extra_tokens.len()) as f32);
|
|
||||||
|
|
||||||
f32::from(!result.is_nan()) * result
|
f32::from(!result.is_nan()) * result
|
||||||
}
|
}
|
||||||
|
@ -11,7 +11,7 @@ use super::parser_models::Style;
|
|||||||
pub struct SearchParams {
|
pub struct SearchParams {
|
||||||
/// It stores the search parameter option `q` (or query in simple words)
|
/// It stores the search parameter option `q` (or query in simple words)
|
||||||
/// of the search url.
|
/// of the search url.
|
||||||
pub q: Option<Cow<'static, str>>,
|
pub q: Option<String>,
|
||||||
/// It stores the search parameter `page` (or pageno in simple words)
|
/// It stores the search parameter `page` (or pageno in simple words)
|
||||||
/// of the search url.
|
/// of the search url.
|
||||||
pub page: Option<u32>,
|
pub page: Option<u32>,
|
||||||
@ -29,7 +29,7 @@ pub struct Cookie<'a> {
|
|||||||
/// It stores the colorscheme name used for the website theme.
|
/// It stores the colorscheme name used for the website theme.
|
||||||
pub colorscheme: Cow<'a, str>,
|
pub colorscheme: Cow<'a, str>,
|
||||||
/// It stores the user selected upstream search engines selected from the UI.
|
/// It stores the user selected upstream search engines selected from the UI.
|
||||||
pub engines: Cow<'a, [Cow<'a, str>]>,
|
pub engines: Cow<'a, Vec<Cow<'a, str>>>,
|
||||||
/// It stores the user selected safe search level from the UI.
|
/// It stores the user selected safe search level from the UI.
|
||||||
pub safe_search_level: u8,
|
pub safe_search_level: u8,
|
||||||
}
|
}
|
||||||
|
@ -14,6 +14,7 @@ use futures::stream::FuturesUnordered;
|
|||||||
use regex::Regex;
|
use regex::Regex;
|
||||||
use reqwest::{Client, ClientBuilder};
|
use reqwest::{Client, ClientBuilder};
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
use std::time::{SystemTime, UNIX_EPOCH};
|
||||||
use tokio::{
|
use tokio::{
|
||||||
fs::File,
|
fs::File,
|
||||||
io::{AsyncBufReadExt, BufReader},
|
io::{AsyncBufReadExt, BufReader},
|
||||||
@ -81,7 +82,6 @@ pub async fn aggregate(
|
|||||||
config.pool_idle_connection_timeout as u64,
|
config.pool_idle_connection_timeout as u64,
|
||||||
))
|
))
|
||||||
.tcp_keepalive(Duration::from_secs(config.tcp_connection_keep_alive as u64))
|
.tcp_keepalive(Duration::from_secs(config.tcp_connection_keep_alive as u64))
|
||||||
.pool_max_idle_per_host(config.number_of_https_connections as usize)
|
|
||||||
.connect_timeout(Duration::from_secs(config.request_timeout as u64)) // Add timeout to request to avoid DDOSing the server
|
.connect_timeout(Duration::from_secs(config.request_timeout as u64)) // Add timeout to request to avoid DDOSing the server
|
||||||
.https_only(true)
|
.https_only(true)
|
||||||
.gzip(true)
|
.gzip(true)
|
||||||
@ -93,6 +93,13 @@ pub async fn aggregate(
|
|||||||
|
|
||||||
let user_agent: &str = random_user_agent();
|
let user_agent: &str = random_user_agent();
|
||||||
|
|
||||||
|
// Add a random delay before making the request.
|
||||||
|
if config.aggregator.random_delay || !config.debug {
|
||||||
|
let nanos = SystemTime::now().duration_since(UNIX_EPOCH)?.subsec_nanos() as f32;
|
||||||
|
let delay = ((nanos / 1_0000_0000 as f32).floor() as u64) + 1;
|
||||||
|
tokio::time::sleep(Duration::from_secs(delay)).await;
|
||||||
|
}
|
||||||
|
|
||||||
let mut names: Vec<&str> = Vec::with_capacity(0);
|
let mut names: Vec<&str> = Vec::with_capacity(0);
|
||||||
|
|
||||||
// create tasks for upstream result fetching
|
// create tasks for upstream result fetching
|
||||||
@ -181,21 +188,19 @@ pub async fn aggregate(
|
|||||||
drop(blacklist_map);
|
drop(blacklist_map);
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut results: Box<[SearchResult]> = result_map
|
let mut results: Vec<SearchResult> = result_map
|
||||||
.into_iter()
|
.iter()
|
||||||
.map(|(_, mut value)| {
|
.map(|(_, value)| {
|
||||||
if !value.url.contains("temu.com") {
|
let mut copy = value.clone();
|
||||||
value.calculate_relevance(query.as_str())
|
if !copy.url.contains("temu.com") {
|
||||||
|
copy.calculate_relevance(query.as_str())
|
||||||
}
|
}
|
||||||
value
|
copy
|
||||||
})
|
})
|
||||||
.collect();
|
.collect();
|
||||||
sort_search_results(&mut results);
|
sort_search_results(&mut results);
|
||||||
|
|
||||||
Ok(SearchResults::new(
|
Ok(SearchResults::new(results, &engine_errors_info))
|
||||||
results,
|
|
||||||
engine_errors_info.into_boxed_slice(),
|
|
||||||
))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Filters a map of search results using a list of regex patterns.
|
/// Filters a map of search results using a list of regex patterns.
|
||||||
@ -260,6 +265,7 @@ fn sort_search_results(results: &mut [SearchResult]) {
|
|||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
|
use smallvec::smallvec;
|
||||||
use std::io::Write;
|
use std::io::Write;
|
||||||
use tempfile::NamedTempFile;
|
use tempfile::NamedTempFile;
|
||||||
|
|
||||||
@ -275,7 +281,7 @@ mod tests {
|
|||||||
description: "This domain is for use in illustrative examples in documents."
|
description: "This domain is for use in illustrative examples in documents."
|
||||||
.to_owned(),
|
.to_owned(),
|
||||||
relevance_score: 0.0,
|
relevance_score: 0.0,
|
||||||
engine: vec!["Google".to_owned(), "Bing".to_owned()],
|
engine: smallvec!["Google".to_owned(), "Bing".to_owned()],
|
||||||
},
|
},
|
||||||
));
|
));
|
||||||
map_to_be_filtered.push((
|
map_to_be_filtered.push((
|
||||||
@ -284,7 +290,7 @@ mod tests {
|
|||||||
title: "Rust Programming Language".to_owned(),
|
title: "Rust Programming Language".to_owned(),
|
||||||
url: "https://www.rust-lang.org/".to_owned(),
|
url: "https://www.rust-lang.org/".to_owned(),
|
||||||
description: "A systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.".to_owned(),
|
description: "A systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.".to_owned(),
|
||||||
engine: vec!["Google".to_owned(), "DuckDuckGo".to_owned()],
|
engine: smallvec!["Google".to_owned(), "DuckDuckGo".to_owned()],
|
||||||
relevance_score:0.0
|
relevance_score:0.0
|
||||||
},)
|
},)
|
||||||
);
|
);
|
||||||
@ -325,7 +331,7 @@ mod tests {
|
|||||||
url: "https://www.example.com".to_owned(),
|
url: "https://www.example.com".to_owned(),
|
||||||
description: "This domain is for use in illustrative examples in documents."
|
description: "This domain is for use in illustrative examples in documents."
|
||||||
.to_owned(),
|
.to_owned(),
|
||||||
engine: vec!["Google".to_owned(), "Bing".to_owned()],
|
engine: smallvec!["Google".to_owned(), "Bing".to_owned()],
|
||||||
relevance_score: 0.0,
|
relevance_score: 0.0,
|
||||||
},
|
},
|
||||||
));
|
));
|
||||||
@ -335,7 +341,7 @@ mod tests {
|
|||||||
title: "Rust Programming Language".to_owned(),
|
title: "Rust Programming Language".to_owned(),
|
||||||
url: "https://www.rust-lang.org/".to_owned(),
|
url: "https://www.rust-lang.org/".to_owned(),
|
||||||
description: "A systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.".to_owned(),
|
description: "A systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.".to_owned(),
|
||||||
engine: vec!["Google".to_owned(), "DuckDuckGo".to_owned()],
|
engine: smallvec!["Google".to_owned(), "DuckDuckGo".to_owned()],
|
||||||
relevance_score:0.0
|
relevance_score:0.0
|
||||||
},
|
},
|
||||||
));
|
));
|
||||||
@ -392,7 +398,7 @@ mod tests {
|
|||||||
url: "https://www.example.com".to_owned(),
|
url: "https://www.example.com".to_owned(),
|
||||||
description: "This domain is for use in illustrative examples in documents."
|
description: "This domain is for use in illustrative examples in documents."
|
||||||
.to_owned(),
|
.to_owned(),
|
||||||
engine: vec!["Google".to_owned(), "Bing".to_owned()],
|
engine: smallvec!["Google".to_owned(), "Bing".to_owned()],
|
||||||
relevance_score: 0.0,
|
relevance_score: 0.0,
|
||||||
},
|
},
|
||||||
));
|
));
|
||||||
|
@ -14,8 +14,7 @@ use crate::{
|
|||||||
use actix_web::{get, http::header::ContentType, web, HttpRequest, HttpResponse};
|
use actix_web::{get, http::header::ContentType, web, HttpRequest, HttpResponse};
|
||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
use regex::Regex;
|
use regex::Regex;
|
||||||
use std::time::{SystemTime, UNIX_EPOCH};
|
use std::borrow::Cow;
|
||||||
use std::{borrow::Cow, time::Duration};
|
|
||||||
use tokio::{
|
use tokio::{
|
||||||
fs::File,
|
fs::File,
|
||||||
io::{AsyncBufReadExt, BufReader},
|
io::{AsyncBufReadExt, BufReader},
|
||||||
@ -84,13 +83,6 @@ pub async fn search(
|
|||||||
let previous_page = page.saturating_sub(1);
|
let previous_page = page.saturating_sub(1);
|
||||||
let next_page = page + 1;
|
let next_page = page + 1;
|
||||||
|
|
||||||
// Add a random delay before making the request.
|
|
||||||
if config.aggregator.random_delay || !config.debug {
|
|
||||||
let nanos = SystemTime::now().duration_since(UNIX_EPOCH)?.subsec_nanos() as f32;
|
|
||||||
let delay = ((nanos / 1_0000_0000 as f32).floor() as u64) + 1;
|
|
||||||
tokio::time::sleep(Duration::from_secs(delay)).await;
|
|
||||||
}
|
|
||||||
|
|
||||||
let results: (SearchResults, String, bool);
|
let results: (SearchResults, String, bool);
|
||||||
if page != previous_page {
|
if page != previous_page {
|
||||||
let (previous_results, current_results, next_results) = join!(
|
let (previous_results, current_results, next_results) = join!(
|
||||||
@ -104,7 +96,9 @@ pub async fn search(
|
|||||||
let (results_list, cache_keys): (Vec<SearchResults>, Vec<String>) =
|
let (results_list, cache_keys): (Vec<SearchResults>, Vec<String>) =
|
||||||
[previous_results?, results.clone(), next_results?]
|
[previous_results?, results.clone(), next_results?]
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.filter_map(|(result, cache_key, flag)| flag.then_some((result, cache_key)))
|
.filter_map(|(result, cache_key, flag)| {
|
||||||
|
dbg!(flag).then_some((result, cache_key))
|
||||||
|
})
|
||||||
.multiunzip();
|
.multiunzip();
|
||||||
|
|
||||||
tokio::spawn(async move { cache.cache_results(&results_list, &cache_keys).await });
|
tokio::spawn(async move { cache.cache_results(&results_list, &cache_keys).await });
|
||||||
|
Loading…
Reference in New Issue
Block a user