mirror of
https://github.com/neon-mmd/websurfx.git
synced 2024-11-22 05:58:21 -05:00
Merge pull request #146 from neon-mmd/improve-async-multithreading
✨ Provide a way to allow users to select different engines
This commit is contained in:
commit
221f38c705
66
Cargo.lock
generated
66
Cargo.lock
generated
@ -292,6 +292,17 @@ version = "0.10.3"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "619743e34b5ba4e9703bba34deac3427c72507c7159f5fd030aea8cac0cfe341"
|
checksum = "619743e34b5ba4e9703bba34deac3427c72507c7159f5fd030aea8cac0cfe341"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "async-trait"
|
||||||
|
version = "0.1.71"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "a564d521dd56509c4c47480d00b80ee55f7e385ae48db5744c67ad50c92d2ebf"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2 1.0.64",
|
||||||
|
"quote 1.0.29",
|
||||||
|
"syn 2.0.26",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "autocfg"
|
name = "autocfg"
|
||||||
version = "0.1.8"
|
version = "0.1.8"
|
||||||
@ -506,18 +517,18 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "clap"
|
name = "clap"
|
||||||
version = "4.3.11"
|
version = "4.3.12"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "1640e5cc7fb47dbb8338fd471b105e7ed6c3cb2aeb00c2e067127ffd3764a05d"
|
checksum = "3eab9e8ceb9afdade1ab3f0fd8dbce5b1b2f468ad653baf10e771781b2b67b73"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"clap_builder",
|
"clap_builder",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "clap_builder"
|
name = "clap_builder"
|
||||||
version = "4.3.11"
|
version = "4.3.12"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "98c59138d527eeaf9b53f35a77fcc1fad9d883116070c63d5de1c7dc7b00c72b"
|
checksum = "9f2763db829349bf00cfc06251268865ed4363b93a943174f638daf3ecdba2cd"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anstyle",
|
"anstyle",
|
||||||
"clap_lex",
|
"clap_lex",
|
||||||
@ -784,7 +795,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||||||
checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331"
|
checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"quote 1.0.29",
|
"quote 1.0.29",
|
||||||
"syn 2.0.25",
|
"syn 2.0.26",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -1457,7 +1468,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||||||
checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b"
|
checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"hermit-abi",
|
"hermit-abi",
|
||||||
"rustix 0.38.3",
|
"rustix 0.38.4",
|
||||||
"windows-sys",
|
"windows-sys",
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -1834,7 +1845,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2 1.0.64",
|
"proc-macro2 1.0.64",
|
||||||
"quote 1.0.29",
|
"quote 1.0.29",
|
||||||
"syn 2.0.25",
|
"syn 2.0.26",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -1952,7 +1963,7 @@ dependencies = [
|
|||||||
"pest_meta",
|
"pest_meta",
|
||||||
"proc-macro2 1.0.64",
|
"proc-macro2 1.0.64",
|
||||||
"quote 1.0.29",
|
"quote 1.0.29",
|
||||||
"syn 2.0.25",
|
"syn 2.0.26",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -2054,7 +2065,7 @@ dependencies = [
|
|||||||
"phf_shared 0.11.2",
|
"phf_shared 0.11.2",
|
||||||
"proc-macro2 1.0.64",
|
"proc-macro2 1.0.64",
|
||||||
"quote 1.0.29",
|
"quote 1.0.29",
|
||||||
"syn 2.0.25",
|
"syn 2.0.26",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -2398,9 +2409,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "regex-automata"
|
name = "regex-automata"
|
||||||
version = "0.3.2"
|
version = "0.3.3"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "83d3daa6976cffb758ec878f108ba0e062a45b2d6ca3a2cca965338855476caf"
|
checksum = "39354c10dd07468c2e73926b23bb9c2caca74c5501e38a35da70406f1d923310"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"aho-corasick",
|
"aho-corasick",
|
||||||
"memchr",
|
"memchr",
|
||||||
@ -2409,9 +2420,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "regex-syntax"
|
name = "regex-syntax"
|
||||||
version = "0.7.3"
|
version = "0.7.4"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "2ab07dc67230e4a4718e70fd5c20055a4334b121f1f9db8fe63ef39ce9b8c846"
|
checksum = "e5ea92a5b6195c6ef2a0295ea818b312502c6fc94dde986c5553242e18fd4ce2"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "reqwest"
|
name = "reqwest"
|
||||||
@ -2548,9 +2559,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "rustix"
|
name = "rustix"
|
||||||
version = "0.38.3"
|
version = "0.38.4"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "ac5ffa1efe7548069688cd7028f32591853cd7b5b756d41bcffd2353e4fc75b4"
|
checksum = "0a962918ea88d644592894bc6dc55acc6c0956488adcebbfb6e273506b7fd6e5"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bitflags 2.3.3",
|
"bitflags 2.3.3",
|
||||||
"errno",
|
"errno",
|
||||||
@ -2708,14 +2719,14 @@ checksum = "389894603bd18c46fa56231694f8d827779c0951a667087194cf9de94ed24682"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2 1.0.64",
|
"proc-macro2 1.0.64",
|
||||||
"quote 1.0.29",
|
"quote 1.0.29",
|
||||||
"syn 2.0.25",
|
"syn 2.0.26",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "serde_json"
|
name = "serde_json"
|
||||||
version = "1.0.100"
|
version = "1.0.102"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "0f1e14e89be7aa4c4b78bdbdc9eb5bf8517829a600ae8eaa39a6e1d960b5185c"
|
checksum = "b5062a995d481b2308b6064e9af76011f2921c35f97b0468811ed9f6cd91dfed"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"itoa 1.0.8",
|
"itoa 1.0.8",
|
||||||
"ryu",
|
"ryu",
|
||||||
@ -2937,9 +2948,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "syn"
|
name = "syn"
|
||||||
version = "2.0.25"
|
version = "2.0.26"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "15e3fc8c0c74267e2df136e5e5fb656a464158aa57624053375eb9c8c6e25ae2"
|
checksum = "45c3457aacde3c65315de5031ec191ce46604304d2446e803d71ade03308d970"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2 1.0.64",
|
"proc-macro2 1.0.64",
|
||||||
"quote 1.0.29",
|
"quote 1.0.29",
|
||||||
@ -3009,7 +3020,7 @@ checksum = "463fe12d7993d3b327787537ce8dd4dfa058de32fc2b195ef3cde03dc4771e8f"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2 1.0.64",
|
"proc-macro2 1.0.64",
|
||||||
"quote 1.0.29",
|
"quote 1.0.29",
|
||||||
"syn 2.0.25",
|
"syn 2.0.26",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -3164,7 +3175,7 @@ checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2 1.0.64",
|
"proc-macro2 1.0.64",
|
||||||
"quote 1.0.29",
|
"quote 1.0.29",
|
||||||
"syn 2.0.25",
|
"syn 2.0.26",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -3343,9 +3354,9 @@ checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "unicode-ident"
|
name = "unicode-ident"
|
||||||
version = "1.0.10"
|
version = "1.0.11"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "22049a19f4a68748a168c0fc439f9516686aa045927ff767eca0a85101fb6e73"
|
checksum = "301abaae475aa91687eb82514b328ab47a211a533026cb25fc3e519b86adfc3c"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "unicode-normalization"
|
name = "unicode-normalization"
|
||||||
@ -3486,7 +3497,7 @@ dependencies = [
|
|||||||
"once_cell",
|
"once_cell",
|
||||||
"proc-macro2 1.0.64",
|
"proc-macro2 1.0.64",
|
||||||
"quote 1.0.29",
|
"quote 1.0.29",
|
||||||
"syn 2.0.25",
|
"syn 2.0.26",
|
||||||
"wasm-bindgen-shared",
|
"wasm-bindgen-shared",
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -3520,7 +3531,7 @@ checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2 1.0.64",
|
"proc-macro2 1.0.64",
|
||||||
"quote 1.0.29",
|
"quote 1.0.29",
|
||||||
"syn 2.0.25",
|
"syn 2.0.26",
|
||||||
"wasm-bindgen-backend",
|
"wasm-bindgen-backend",
|
||||||
"wasm-bindgen-shared",
|
"wasm-bindgen-shared",
|
||||||
]
|
]
|
||||||
@ -3543,10 +3554,11 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "websurfx"
|
name = "websurfx"
|
||||||
version = "0.13.17"
|
version = "0.14.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"actix-files",
|
"actix-files",
|
||||||
"actix-web",
|
"actix-web",
|
||||||
|
"async-trait",
|
||||||
"criterion",
|
"criterion",
|
||||||
"env_logger",
|
"env_logger",
|
||||||
"error-stack",
|
"error-stack",
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "websurfx"
|
name = "websurfx"
|
||||||
version = "0.13.17"
|
version = "0.14.0"
|
||||||
edition = "2021"
|
edition = "2021"
|
||||||
description = "An open-source alternative to Searx that provides clean, ad-free, and organic results with incredible speed while keeping privacy and security in mind."
|
description = "An open-source alternative to Searx that provides clean, ad-free, and organic results with incredible speed while keeping privacy and security in mind."
|
||||||
repository = "https://github.com/neon-mmd/websurfx"
|
repository = "https://github.com/neon-mmd/websurfx"
|
||||||
@ -12,7 +12,7 @@ tokio = {version="*",features=["full"]}
|
|||||||
serde = {version="*",features=["derive"]}
|
serde = {version="*",features=["derive"]}
|
||||||
handlebars = { version = "4.3.6", features = ["dir_source"] }
|
handlebars = { version = "4.3.6", features = ["dir_source"] }
|
||||||
scraper = {version="*"}
|
scraper = {version="*"}
|
||||||
actix-web = {version="4.3.1"}
|
actix-web = {version="4.3.1", features = ["cookies"]}
|
||||||
actix-files = {version="0.6.2"}
|
actix-files = {version="0.6.2"}
|
||||||
serde_json = {version="*"}
|
serde_json = {version="*"}
|
||||||
fake-useragent = {version="*"}
|
fake-useragent = {version="*"}
|
||||||
@ -24,6 +24,7 @@ md5 = {version="*"}
|
|||||||
rand={version="*"}
|
rand={version="*"}
|
||||||
once_cell = {version="*"}
|
once_cell = {version="*"}
|
||||||
error-stack = {version="0.3.1"}
|
error-stack = {version="0.3.1"}
|
||||||
|
async-trait = {version="*"}
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
rusty-hook = "^0.11.2"
|
rusty-hook = "^0.11.2"
|
||||||
|
@ -3,7 +3,7 @@
|
|||||||
|
|
||||||
use super::parser_models::Style;
|
use super::parser_models::Style;
|
||||||
use rlua::Lua;
|
use rlua::Lua;
|
||||||
use std::{format, fs, path::Path};
|
use std::{collections::HashMap, format, fs, path::Path};
|
||||||
|
|
||||||
// ------- Constants --------
|
// ------- Constants --------
|
||||||
static COMMON_DIRECTORY_NAME: &str = "websurfx";
|
static COMMON_DIRECTORY_NAME: &str = "websurfx";
|
||||||
@ -18,6 +18,10 @@ static CONFIG_FILE_NAME: &str = "config.lua";
|
|||||||
/// * `style` - It stores the theming options for the website.
|
/// * `style` - It stores the theming options for the website.
|
||||||
/// * `redis_url` - It stores the redis connection url address on which the redis
|
/// * `redis_url` - It stores the redis connection url address on which the redis
|
||||||
/// client should connect.
|
/// client should connect.
|
||||||
|
/// * `aggregator` - It stores the configuration options for the aggregator (see `AggregatorConfig`).
|
||||||
|
/// * `logging` - It stores the option of whether to enable or disable logs.
|
||||||
|
/// * `debug` - It stores the option of whether to enable or disable debug mode.
|
||||||
|
/// * `upstream_search_engines` - It stores all the engine names that were enabled by the user.
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub struct Config {
|
pub struct Config {
|
||||||
pub port: u16,
|
pub port: u16,
|
||||||
@ -27,12 +31,17 @@ pub struct Config {
|
|||||||
pub aggregator: AggregatorConfig,
|
pub aggregator: AggregatorConfig,
|
||||||
pub logging: bool,
|
pub logging: bool,
|
||||||
pub debug: bool,
|
pub debug: bool,
|
||||||
|
pub upstream_search_engines: Vec<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Configuration options for the aggregator.
|
/// Configuration options for the aggregator.
|
||||||
|
///
|
||||||
|
/// # Fields
|
||||||
|
///
|
||||||
|
/// * `random_delay` - It stores the option of whether to enable or disable random delays between
|
||||||
|
/// requests.
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub struct AggregatorConfig {
|
pub struct AggregatorConfig {
|
||||||
/// Whether to introduce a random delay before sending the request to the search engine.
|
|
||||||
pub random_delay: bool,
|
pub random_delay: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -66,6 +75,11 @@ impl Config {
|
|||||||
},
|
},
|
||||||
logging: globals.get::<_, bool>("logging")?,
|
logging: globals.get::<_, bool>("logging")?,
|
||||||
debug: globals.get::<_, bool>("debug")?,
|
debug: globals.get::<_, bool>("debug")?,
|
||||||
|
upstream_search_engines: globals
|
||||||
|
.get::<_, HashMap<String, bool>>("upstream_search_engines")?
|
||||||
|
.into_iter()
|
||||||
|
.filter_map(|(key, value)| value.then_some(key))
|
||||||
|
.collect(),
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
@ -2,41 +2,48 @@
|
|||||||
//! by querying the upstream duckduckgo search engine with user provided query and with a page
|
//! by querying the upstream duckduckgo search engine with user provided query and with a page
|
||||||
//! number if provided.
|
//! number if provided.
|
||||||
|
|
||||||
use std::{collections::HashMap, time::Duration};
|
use std::collections::HashMap;
|
||||||
|
|
||||||
use reqwest::header::{HeaderMap, CONTENT_TYPE, COOKIE, REFERER, USER_AGENT};
|
use reqwest::header::{HeaderMap, CONTENT_TYPE, COOKIE, REFERER, USER_AGENT};
|
||||||
use scraper::{Html, Selector};
|
use scraper::{Html, Selector};
|
||||||
|
|
||||||
use crate::results::aggregation_models::RawSearchResult;
|
use crate::results::aggregation_models::RawSearchResult;
|
||||||
|
|
||||||
use super::engine_models::EngineError;
|
use super::engine_models::{EngineError, SearchEngine};
|
||||||
|
|
||||||
use error_stack::{IntoReport, Report, Result, ResultExt};
|
use error_stack::{IntoReport, Report, Result, ResultExt};
|
||||||
|
|
||||||
/// This function scrapes results from the upstream engine duckduckgo and puts all the scraped
|
/// A new DuckDuckGo engine type defined in order to implement the `SearchEngine` trait, which helps
|
||||||
/// results like title, visiting_url (href in html),engine (from which engine it was fetched from)
|
/// reduce code duplication and makes it easy to create a vector of different search engines.
|
||||||
/// and description in a RawSearchResult and then adds that to HashMap whose keys are url and
|
pub struct DuckDuckGo;
|
||||||
/// values are RawSearchResult struct and then returns it within a Result enum.
|
|
||||||
///
|
#[async_trait::async_trait]
|
||||||
/// # Arguments
|
impl SearchEngine for DuckDuckGo {
|
||||||
///
|
/// This function scrapes results from the upstream engine duckduckgo and puts all the scraped
|
||||||
/// * `query` - Takes the user provided query to query to the upstream search engine with.
|
/// results like title, visiting_url (href in html),engine (from which engine it was fetched from)
|
||||||
/// * `page` - Takes an u32 as an argument.
|
/// and description in a RawSearchResult and then adds that to HashMap whose keys are url and
|
||||||
/// * `user_agent` - Takes a random user agent string as an argument.
|
/// values are RawSearchResult struct and then returns it within a Result enum.
|
||||||
///
|
///
|
||||||
/// # Errors
|
/// # Arguments
|
||||||
///
|
///
|
||||||
/// Returns an `EngineError` if the user is not connected to the internet or if there is a failure to
|
/// * `query` - Takes the user provided query to query to the upstream search engine with.
|
||||||
/// reach the above `upstream search engine` page or if the `upstream search engine` is unable to
|
/// * `page` - Takes an u32 as an argument.
|
||||||
/// provide results for the requested search query and also returns error if the scraping selector
|
/// * `user_agent` - Takes a random user agent string as an argument.
|
||||||
/// or HeaderMap fails to initialize.
|
///
|
||||||
pub async fn results(
|
/// # Errors
|
||||||
query: &str,
|
///
|
||||||
|
/// Returns an `EngineError` if the user is not connected to the internet or if there is a failure to
|
||||||
|
/// reach the above `upstream search engine` page or if the `upstream search engine` is unable to
|
||||||
|
/// provide results for the requested search query and also returns error if the scraping selector
|
||||||
|
/// or HeaderMap fails to initialize.
|
||||||
|
async fn results(
|
||||||
|
&self,
|
||||||
|
query: String,
|
||||||
page: u32,
|
page: u32,
|
||||||
user_agent: &str,
|
user_agent: String,
|
||||||
) -> Result<HashMap<String, RawSearchResult>, EngineError> {
|
) -> Result<HashMap<String, RawSearchResult>, EngineError> {
|
||||||
// Page number can be missing or empty string and so appropriate handling is required
|
// Page number can be missing or empty string and so appropriate handling is required
|
||||||
// so that upstream server receives valid page number.
|
// so that upstream server receives valid page number.
|
||||||
let url: String = match page {
|
let url: String = match page {
|
||||||
1 => {
|
1 => {
|
||||||
format!("https://html.duckduckgo.com/html/?q={query}&s=&dc=&v=1&o=json&api=/d.js")
|
format!("https://html.duckduckgo.com/html/?q={query}&s=&dc=&v=1&o=json&api=/d.js")
|
||||||
@ -82,21 +89,9 @@ pub async fn results(
|
|||||||
.change_context(EngineError::UnexpectedError)?,
|
.change_context(EngineError::UnexpectedError)?,
|
||||||
);
|
);
|
||||||
|
|
||||||
// fetch the html from upstream duckduckgo engine
|
let document: Html = Html::parse_document(
|
||||||
let results: String = reqwest::Client::new()
|
&DuckDuckGo::fetch_html_from_upstream(self, url, header_map).await?,
|
||||||
.get(url)
|
);
|
||||||
.timeout(Duration::from_secs(5))
|
|
||||||
.headers(header_map) // add spoofed headers to emulate human behavior
|
|
||||||
.send()
|
|
||||||
.await
|
|
||||||
.into_report()
|
|
||||||
.change_context(EngineError::RequestError)?
|
|
||||||
.text()
|
|
||||||
.await
|
|
||||||
.into_report()
|
|
||||||
.change_context(EngineError::RequestError)?;
|
|
||||||
|
|
||||||
let document: Html = Html::parse_document(&results);
|
|
||||||
|
|
||||||
let no_result: Selector = Selector::parse(".no-results")
|
let no_result: Selector = Selector::parse(".no-results")
|
||||||
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
||||||
@ -152,4 +147,5 @@ pub async fn results(
|
|||||||
})
|
})
|
||||||
.map(|search_result| (search_result.visiting_url.clone(), search_result))
|
.map(|search_result| (search_result.visiting_url.clone(), search_result))
|
||||||
.collect())
|
.collect())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,8 +1,9 @@
|
|||||||
//! This module provides the error enum to handle different errors associated while requesting data from
|
//! This module provides the error enum to handle different errors associated while requesting data from
|
||||||
//! the upstream search engines with the search query provided by the user.
|
//! the upstream search engines with the search query provided by the user.
|
||||||
|
|
||||||
use error_stack::Context;
|
use crate::results::aggregation_models::RawSearchResult;
|
||||||
use std::fmt;
|
use error_stack::{IntoReport, Result, ResultExt};
|
||||||
|
use std::{collections::HashMap, fmt, time::Duration};
|
||||||
|
|
||||||
/// A custom error type used for handling engine-associated errors.
|
/// A custom error type used for handling engine-associated errors.
|
||||||
///
|
///
|
||||||
@ -40,4 +41,35 @@ impl fmt::Display for EngineError {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Context for EngineError {}
|
impl error_stack::Context for EngineError {}
|
||||||
|
|
||||||
|
/// A trait to define common behaviour for all search engines.
|
||||||
|
#[async_trait::async_trait]
|
||||||
|
pub trait SearchEngine {
|
||||||
|
async fn fetch_html_from_upstream(
|
||||||
|
&self,
|
||||||
|
url: String,
|
||||||
|
header_map: reqwest::header::HeaderMap,
|
||||||
|
) -> Result<String, EngineError> {
|
||||||
|
// fetch the html from upstream search engine
|
||||||
|
Ok(reqwest::Client::new()
|
||||||
|
.get(url)
|
||||||
|
.timeout(Duration::from_secs(30)) // Add timeout to request to avoid DDOSing the server
|
||||||
|
.headers(header_map) // add spoofed headers to emulate human behaviour
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.into_report()
|
||||||
|
.change_context(EngineError::RequestError)?
|
||||||
|
.text()
|
||||||
|
.await
|
||||||
|
.into_report()
|
||||||
|
.change_context(EngineError::RequestError)?)
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn results(
|
||||||
|
&self,
|
||||||
|
query: String,
|
||||||
|
page: u32,
|
||||||
|
user_agent: String,
|
||||||
|
) -> Result<HashMap<String, RawSearchResult>, EngineError>;
|
||||||
|
}
|
||||||
|
@ -8,31 +8,39 @@ use std::collections::HashMap;
|
|||||||
|
|
||||||
use crate::results::aggregation_models::RawSearchResult;
|
use crate::results::aggregation_models::RawSearchResult;
|
||||||
|
|
||||||
use super::engine_models::EngineError;
|
use super::engine_models::{EngineError, SearchEngine};
|
||||||
use error_stack::{IntoReport, Report, Result, ResultExt};
|
use error_stack::{IntoReport, Report, Result, ResultExt};
|
||||||
|
|
||||||
/// This function scrapes results from the upstream engine searx and puts all the scraped
|
/// A new Searx engine type defined in order to implement the `SearchEngine` trait, which helps
|
||||||
/// results like title, visiting_url (href in html),engine (from which engine it was fetched from)
|
/// reduce code duplication and makes it easy to create a vector of different search engines.
|
||||||
/// and description in a RawSearchResult and then adds that to HashMap whose keys are url and
|
pub struct Searx;
|
||||||
/// values are RawSearchResult struct and then returns it within a Result enum.
|
|
||||||
///
|
#[async_trait::async_trait]
|
||||||
/// # Arguments
|
impl SearchEngine for Searx {
|
||||||
///
|
/// This function scrapes results from the upstream engine searx and puts all the scraped
|
||||||
/// * `query` - Takes the user provided query to query to the upstream search engine with.
|
/// results like title, visiting_url (href in html),engine (from which engine it was fetched from)
|
||||||
/// * `page` - Takes an u32 as an argument.
|
/// and description in a RawSearchResult and then adds that to HashMap whose keys are url and
|
||||||
/// * `user_agent` - Takes a random user agent string as an argument.
|
/// values are RawSearchResult struct and then returns it within a Result enum.
|
||||||
///
|
///
|
||||||
/// # Errors
|
/// # Arguments
|
||||||
///
|
///
|
||||||
/// Returns an `EngineError` if the user is not connected to the internet or if there is a failure to
|
/// * `query` - Takes the user provided query to query to the upstream search engine with.
|
||||||
/// reach the above `upstream search engine` page or if the `upstream search engine` is unable to
|
/// * `page` - Takes an u32 as an argument.
|
||||||
/// provide results for the requested search query and also returns error if the scraping selector
|
/// * `user_agent` - Takes a random user agent string as an argument.
|
||||||
/// or HeaderMap fails to initialize.
|
///
|
||||||
pub async fn results(
|
/// # Errors
|
||||||
query: &str,
|
///
|
||||||
|
/// Returns an `EngineError` if the user is not connected to the internet or if there is a failure to
|
||||||
|
/// reach the above `upstream search engine` page or if the `upstream search engine` is unable to
|
||||||
|
/// provide results for the requested search query and also returns error if the scraping selector
|
||||||
|
/// or HeaderMap fails to initialize.
|
||||||
|
|
||||||
|
async fn results(
|
||||||
|
&self,
|
||||||
|
query: String,
|
||||||
page: u32,
|
page: u32,
|
||||||
user_agent: &str,
|
user_agent: String,
|
||||||
) -> Result<HashMap<String, RawSearchResult>, EngineError> {
|
) -> Result<HashMap<String, RawSearchResult>, EngineError> {
|
||||||
// Page number can be missing or empty string and so appropriate handling is required
|
// Page number can be missing or empty string and so appropriate handling is required
|
||||||
// so that upstream server receives valid page number.
|
// so that upstream server receives valid page number.
|
||||||
let url: String = format!("https://searx.work/search?q={query}&pageno={page}");
|
let url: String = format!("https://searx.work/search?q={query}&pageno={page}");
|
||||||
@ -62,24 +70,14 @@ pub async fn results(
|
|||||||
);
|
);
|
||||||
header_map.insert(COOKIE, "categories=general; language=auto; locale=en; autocomplete=duckduckgo; image_proxy=1; method=POST; safesearch=2; theme=simple; results_on_new_tab=1; doi_resolver=oadoi.org; simple_style=auto; center_alignment=1; query_in_title=1; infinite_scroll=0; disabled_engines=; enabled_engines=\"archive is__general\\054yep__general\\054curlie__general\\054currency__general\\054ddg definitions__general\\054wikidata__general\\054duckduckgo__general\\054tineye__general\\054lingva__general\\054startpage__general\\054yahoo__general\\054wiby__general\\054marginalia__general\\054alexandria__general\\054wikibooks__general\\054wikiquote__general\\054wikisource__general\\054wikiversity__general\\054wikivoyage__general\\054dictzone__general\\054seznam__general\\054mojeek__general\\054naver__general\\054wikimini__general\\054brave__general\\054petalsearch__general\\054goo__general\"; disabled_plugins=; enabled_plugins=\"searx.plugins.hostname_replace\\054searx.plugins.oa_doi_rewrite\\054searx.plugins.vim_hotkeys\"; tokens=; maintab=on; enginetab=on".parse().into_report().change_context(EngineError::UnexpectedError)?);
|
header_map.insert(COOKIE, "categories=general; language=auto; locale=en; autocomplete=duckduckgo; image_proxy=1; method=POST; safesearch=2; theme=simple; results_on_new_tab=1; doi_resolver=oadoi.org; simple_style=auto; center_alignment=1; query_in_title=1; infinite_scroll=0; disabled_engines=; enabled_engines=\"archive is__general\\054yep__general\\054curlie__general\\054currency__general\\054ddg definitions__general\\054wikidata__general\\054duckduckgo__general\\054tineye__general\\054lingva__general\\054startpage__general\\054yahoo__general\\054wiby__general\\054marginalia__general\\054alexandria__general\\054wikibooks__general\\054wikiquote__general\\054wikisource__general\\054wikiversity__general\\054wikivoyage__general\\054dictzone__general\\054seznam__general\\054mojeek__general\\054naver__general\\054wikimini__general\\054brave__general\\054petalsearch__general\\054goo__general\"; disabled_plugins=; enabled_plugins=\"searx.plugins.hostname_replace\\054searx.plugins.oa_doi_rewrite\\054searx.plugins.vim_hotkeys\"; tokens=; maintab=on; enginetab=on".parse().into_report().change_context(EngineError::UnexpectedError)?);
|
||||||
|
|
||||||
// fetch the html from upstream searx instance engine
|
let document: Html =
|
||||||
let results: String = reqwest::Client::new()
|
Html::parse_document(&Searx::fetch_html_from_upstream(self, url, header_map).await?);
|
||||||
.get(url)
|
|
||||||
.headers(header_map) // add spoofed headers to emulate human behaviours.
|
|
||||||
.send()
|
|
||||||
.await
|
|
||||||
.into_report()
|
|
||||||
.change_context(EngineError::RequestError)?
|
|
||||||
.text()
|
|
||||||
.await
|
|
||||||
.into_report()
|
|
||||||
.change_context(EngineError::RequestError)?;
|
|
||||||
|
|
||||||
let document: Html = Html::parse_document(&results);
|
|
||||||
|
|
||||||
let no_result: Selector = Selector::parse("#urls>.dialog-error>p")
|
let no_result: Selector = Selector::parse("#urls>.dialog-error>p")
|
||||||
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
||||||
.attach_printable_lazy(|| format!("invalid CSS selector: {}", "#urls>.dialog-error>p"))?;
|
.attach_printable_lazy(|| {
|
||||||
|
format!("invalid CSS selector: {}", "#urls>.dialog-error>p")
|
||||||
|
})?;
|
||||||
|
|
||||||
if let Some(no_result_msg) = document.select(&no_result).nth(1) {
|
if let Some(no_result_msg) = document.select(&no_result).nth(1) {
|
||||||
if no_result_msg.inner_html()
|
if no_result_msg.inner_html()
|
||||||
@ -135,4 +133,5 @@ pub async fn results(
|
|||||||
})
|
})
|
||||||
.map(|search_result| (search_result.visiting_url.clone(), search_result))
|
.map(|search_result| (search_result.visiting_url.clone(), search_result))
|
||||||
.collect())
|
.collect())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -3,7 +3,7 @@
|
|||||||
|
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
use crate::config::parser_models::Style;
|
use crate::{config::parser_models::Style, engines::engine_models::EngineError};
|
||||||
|
|
||||||
/// A named struct to store, serialize and deserialize the individual search result from all the
|
/// A named struct to store, serialize and deserialize the individual search result from all the
|
||||||
/// scraped and aggregated search results from the upstream search engines.
|
/// scraped and aggregated search results from the upstream search engines.
|
||||||
@ -16,7 +16,7 @@ use crate::config::parser_models::Style;
|
|||||||
/// * `url` - The url to be displayed below the search result title in html.
|
/// * `url` - The url to be displayed below the search result title in html.
|
||||||
/// * `description` - The description of the search result.
|
/// * `description` - The description of the search result.
|
||||||
/// * `engine` - The names of the upstream engines from which these results were provided.
|
/// * `engine` - The names of the upstream engines from which these results were provided.
|
||||||
#[derive(Debug, Serialize, Deserialize)]
|
#[derive(Serialize, Deserialize)]
|
||||||
#[serde(rename_all = "camelCase")]
|
#[serde(rename_all = "camelCase")]
|
||||||
pub struct SearchResult {
|
pub struct SearchResult {
|
||||||
pub title: String,
|
pub title: String,
|
||||||
@ -116,6 +116,25 @@ impl RawSearchResult {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize)]
|
||||||
|
pub struct EngineErrorInfo {
|
||||||
|
pub error: String,
|
||||||
|
pub engine: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl EngineErrorInfo {
|
||||||
|
pub fn new(error: &EngineError, engine: String) -> Self {
|
||||||
|
Self {
|
||||||
|
error: match error {
|
||||||
|
EngineError::RequestError => String::from("RequestError"),
|
||||||
|
EngineError::EmptyResultSet => String::from("EmptyResultSet"),
|
||||||
|
EngineError::UnexpectedError => String::from("UnexpectedError"),
|
||||||
|
},
|
||||||
|
engine,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// A named struct to store, serialize and deserialize all the search results scraped and
|
/// A named struct to store, serialize and deserialize all the search results scraped and
|
||||||
/// aggregated from the upstream search engines.
|
/// aggregated from the upstream search engines.
|
||||||
///
|
///
|
||||||
@ -124,12 +143,18 @@ impl RawSearchResult {
|
|||||||
/// * `results` - Stores the individual serializable `SearchResult` struct into a vector of
|
/// * `results` - Stores the individual serializable `SearchResult` struct into a vector of
|
||||||
/// `SearchResult` structs.
|
/// `SearchResult` structs.
|
||||||
/// * `page_query` - Stores the current pages search query `q` provided in the search url.
|
/// * `page_query` - Stores the current pages search query `q` provided in the search url.
|
||||||
|
/// * `style` - Stores the theming options for the website.
|
||||||
|
/// * `engine_errors_info` - Stores the information on which engines failed with their engine name
|
||||||
|
/// and the type of error that caused it.
|
||||||
|
/// * `empty_result_set` - Stores a boolean which indicates that no engines gave a result for the
|
||||||
|
/// given search query.
|
||||||
#[derive(Serialize, Deserialize)]
|
#[derive(Serialize, Deserialize)]
|
||||||
#[serde(rename_all = "camelCase")]
|
#[serde(rename_all = "camelCase")]
|
||||||
pub struct SearchResults {
|
pub struct SearchResults {
|
||||||
pub results: Vec<SearchResult>,
|
pub results: Vec<SearchResult>,
|
||||||
pub page_query: String,
|
pub page_query: String,
|
||||||
pub style: Style,
|
pub style: Style,
|
||||||
|
pub engine_errors_info: Vec<EngineErrorInfo>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl SearchResults {
|
impl SearchResults {
|
||||||
@ -141,14 +166,22 @@ impl SearchResults {
|
|||||||
/// and stores it into a vector of `SearchResult` structs.
|
/// and stores it into a vector of `SearchResult` structs.
|
||||||
/// * `page_query` - Takes an argument of current page's search query `q` provided in
|
/// * `page_query` - Takes an argument of current page's search query `q` provided in
|
||||||
/// the search url.
|
/// the search url.
|
||||||
pub fn new(results: Vec<SearchResult>, page_query: String) -> Self {
|
/// * `engine_errors_info` - Takes a vector of `EngineErrorInfo` structs, each holding the name of
|
||||||
|
/// an engine that failed and the type of error it returned.
|
||||||
|
pub fn new(
|
||||||
|
results: Vec<SearchResult>,
|
||||||
|
page_query: String,
|
||||||
|
engine_errors_info: Vec<EngineErrorInfo>,
|
||||||
|
) -> Self {
|
||||||
SearchResults {
|
SearchResults {
|
||||||
results,
|
results,
|
||||||
page_query,
|
page_query,
|
||||||
style: Style::new("".to_string(), "".to_string()),
|
style: Style::new("".to_string(), "".to_string()),
|
||||||
|
engine_errors_info,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// A setter function to add website style to the return search results.
|
||||||
pub fn add_style(&mut self, style: Style) {
|
pub fn add_style(&mut self, style: Style) {
|
||||||
self.style = style;
|
self.style = style;
|
||||||
}
|
}
|
||||||
|
@ -3,22 +3,41 @@
|
|||||||
|
|
||||||
use std::{collections::HashMap, time::Duration};
|
use std::{collections::HashMap, time::Duration};
|
||||||
|
|
||||||
|
use error_stack::Report;
|
||||||
use rand::Rng;
|
use rand::Rng;
|
||||||
use tokio::join;
|
use tokio::task::JoinHandle;
|
||||||
|
|
||||||
use super::{
|
use super::{
|
||||||
aggregation_models::{RawSearchResult, SearchResult, SearchResults},
|
aggregation_models::{EngineErrorInfo, RawSearchResult, SearchResult, SearchResults},
|
||||||
user_agent::random_user_agent,
|
user_agent::random_user_agent,
|
||||||
};
|
};
|
||||||
|
|
||||||
use crate::engines::{duckduckgo, searx};
|
use crate::engines::{
|
||||||
|
duckduckgo,
|
||||||
|
engine_models::{EngineError, SearchEngine},
|
||||||
|
searx,
|
||||||
|
};
|
||||||
|
|
||||||
/// A function that aggregates all the scraped results from the above upstream engines and
|
/// Aliases for long type annotations
|
||||||
/// then removes duplicate results and if two results are found to be from two or more engines
|
type FutureVec = Vec<JoinHandle<Result<HashMap<String, RawSearchResult>, Report<EngineError>>>>;
|
||||||
/// then puts their names together to show the results are fetched from these upstream engines
|
|
||||||
/// and then removes all data from the HashMap and puts into a struct of all results aggregated
|
/// The function aggregates the scraped results from the user-selected upstream search engines.
|
||||||
/// into a vector and also adds the query used into the struct this is necessary because
|
/// These engines can be chosen either from the user interface (UI) or from the configuration file.
|
||||||
/// otherwise the search bar in search remains empty if searched from the query url
|
/// The code handles this process by matching the selected search engines and adding them to a vector.
|
||||||
|
/// This vector is then used to create an asynchronous task vector using `tokio::spawn`, which returns
|
||||||
|
/// a future. This future is awaited in another loop. Once the results are collected, they are filtered
|
||||||
|
/// to remove any errors and ensure only proper results are included. If an error is encountered, it is
|
||||||
|
/// sent to the UI along with the name of the engine and the type of error. This information is finally
|
||||||
|
/// placed in the returned `SearchResults` struct.
|
||||||
|
///
|
||||||
|
/// Additionally, the function eliminates duplicate results. If two results are identified as coming from
|
||||||
|
/// multiple engines, their names are combined to indicate that the results were fetched from these upstream
|
||||||
|
/// engines. After this, all the data in the `HashMap` is removed and placed into a struct that contains all
|
||||||
|
/// the aggregated results in a vector. Furthermore, the query used is also added to the struct. This step is
|
||||||
|
/// necessary to ensure that the search bar in the search remains populated even when searched from the query URL.
|
||||||
|
///
|
||||||
|
/// Overall, this function serves to aggregate scraped results from user-selected search engines, handling errors,
|
||||||
|
/// removing duplicates, and organizing the data for display in the UI.
|
||||||
///
|
///
|
||||||
/// # Example:
|
/// # Example:
|
||||||
///
|
///
|
||||||
@ -30,6 +49,9 @@ use crate::engines::{duckduckgo, searx};
|
|||||||
/// * `query` - Accepts a string to query with the above upstream search engines.
|
/// * `query` - Accepts a string to query with the above upstream search engines.
|
||||||
/// * `page` - Accepts a u32 page number.
|
/// * `page` - Accepts a u32 page number.
|
||||||
/// * `random_delay` - Accepts a boolean value to add a random delay before making the request.
|
/// * `random_delay` - Accepts a boolean value to add a random delay before making the request.
|
||||||
|
/// * `debug` - Accepts a boolean value to enable or disable debug mode option.
|
||||||
|
/// * `upstream_search_engines` - Accepts a vector of search engine names which were selected by the
|
||||||
|
/// user through the UI or the config file.
|
||||||
///
|
///
|
||||||
/// # Error
|
/// # Error
|
||||||
///
|
///
|
||||||
@ -37,10 +59,11 @@ use crate::engines::{duckduckgo, searx};
|
|||||||
/// function in either `searx` or `duckduckgo` or both otherwise returns a `SearchResults struct`
|
/// function in either `searx` or `duckduckgo` or both otherwise returns a `SearchResults struct`
|
||||||
/// containing appropriate values.
|
/// containing appropriate values.
|
||||||
pub async fn aggregate(
|
pub async fn aggregate(
|
||||||
query: &str,
|
query: String,
|
||||||
page: u32,
|
page: u32,
|
||||||
random_delay: bool,
|
random_delay: bool,
|
||||||
debug: bool,
|
debug: bool,
|
||||||
|
upstream_search_engines: Vec<String>,
|
||||||
) -> Result<SearchResults, Box<dyn std::error::Error>> {
|
) -> Result<SearchResults, Box<dyn std::error::Error>> {
|
||||||
let user_agent: String = random_user_agent();
|
let user_agent: String = random_user_agent();
|
||||||
let mut result_map: HashMap<String, RawSearchResult> = HashMap::new();
|
let mut result_map: HashMap<String, RawSearchResult> = HashMap::new();
|
||||||
@ -53,28 +76,81 @@ pub async fn aggregate(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// fetch results from upstream search engines simultaneously/concurrently.
|
// fetch results from upstream search engines simultaneously/concurrently.
|
||||||
let (ddg_map_results, searx_map_results) = join!(
|
let search_engines: Vec<Box<dyn SearchEngine + Send + Sync>> = upstream_search_engines
|
||||||
duckduckgo::results(query, page, &user_agent),
|
.iter()
|
||||||
searx::results(query, page, &user_agent)
|
.map(|engine| match engine.to_lowercase().as_str() {
|
||||||
);
|
"duckduckgo" => Box::new(duckduckgo::DuckDuckGo) as Box<dyn SearchEngine + Send + Sync>,
|
||||||
|
"searx" => Box::new(searx::Searx) as Box<dyn SearchEngine + Send + Sync>,
|
||||||
|
&_ => panic!("Config Error: Incorrect config file option provided"),
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
let ddg_map_results = ddg_map_results.unwrap_or_else(|e| {
|
let task_capacity: usize = search_engines.len();
|
||||||
if debug {
|
|
||||||
log::error!("Error fetching results from DuckDuckGo: {:?}", e);
|
let tasks: FutureVec = search_engines
|
||||||
|
.into_iter()
|
||||||
|
.map(|search_engine| {
|
||||||
|
let query: String = query.clone();
|
||||||
|
let user_agent: String = user_agent.clone();
|
||||||
|
tokio::spawn(
|
||||||
|
async move { search_engine.results(query, page, user_agent.clone()).await },
|
||||||
|
)
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let mut outputs = Vec::with_capacity(task_capacity);
|
||||||
|
|
||||||
|
for task in tasks {
|
||||||
|
if let Ok(result) = task.await {
|
||||||
|
outputs.push(result)
|
||||||
}
|
}
|
||||||
HashMap::new()
|
|
||||||
});
|
|
||||||
|
|
||||||
let searx_map_results = searx_map_results.unwrap_or_else(|e| {
|
|
||||||
if debug {
|
|
||||||
log::error!("Error fetching results from Searx: {:?}", e);
|
|
||||||
}
|
}
|
||||||
HashMap::new()
|
|
||||||
});
|
|
||||||
|
|
||||||
result_map.extend(ddg_map_results);
|
let mut engine_errors_info: Vec<EngineErrorInfo> = Vec::new();
|
||||||
|
|
||||||
searx_map_results.into_iter().for_each(|(key, value)| {
|
// The code block `outputs.iter()` determines whether it is the first time the code is being run.
|
||||||
|
// It does this by checking the initial flag. If it is the first time, the code selects the first
|
||||||
|
// engine from which results are fetched and adds or extends them into the `result_map`. If the
|
||||||
|
// initially selected engine fails, the code automatically selects another engine to map or extend
|
||||||
|
// into the `result_map`. On the other hand, if an engine selected for the first time successfully
|
||||||
|
// fetches results and maps them into the `result_map`, the initial flag is set to false. Subsequently,
|
||||||
|
// the code iterates through the remaining engines one by one. It compares the fetched results from each
|
||||||
|
// engine with the results already present in the `result_map` to identify any duplicates. If duplicate
|
||||||
|
// results are found, the code groups them together with the name of the engine from which they were
|
||||||
|
// fetched, and automatically removes the duplicate results from the newly fetched data.
|
||||||
|
//
|
||||||
|
// Additionally, the code handles errors returned by the engines. It keeps track of which engines
|
||||||
|
// encountered errors and stores this information in a vector of structures called `EngineErrorInfo`.
|
||||||
|
// Each structure in this vector contains the name of the engine and the type of error it returned.
|
||||||
|
// These structures will later be added to the final `SearchResults` structure. The `SearchResults`
|
||||||
|
// structure is used to display an error box in the UI containing the relevant information from
|
||||||
|
// the `EngineErrorInfo` structure.
|
||||||
|
//
|
||||||
|
// In summary, this code block manages the selection of engines, handling of duplicate results, and tracking
|
||||||
|
// of errors in order to populate the `result_map` and provide informative feedback to the user through the
|
||||||
|
// `SearchResults` structure.
|
||||||
|
let mut initial: bool = true;
|
||||||
|
let mut counter: usize = 0;
|
||||||
|
outputs.iter().for_each(|results| {
|
||||||
|
if initial {
|
||||||
|
match results {
|
||||||
|
Ok(result) => {
|
||||||
|
result_map.extend(result.clone());
|
||||||
|
counter += 1;
|
||||||
|
initial = false
|
||||||
|
}
|
||||||
|
Err(error_type) => {
|
||||||
|
engine_errors_info.push(EngineErrorInfo::new(
|
||||||
|
error_type.downcast_ref::<EngineError>().unwrap(),
|
||||||
|
upstream_search_engines[counter].clone(),
|
||||||
|
));
|
||||||
|
counter += 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
match results {
|
||||||
|
Ok(result) => {
|
||||||
|
result.clone().into_iter().for_each(|(key, value)| {
|
||||||
result_map
|
result_map
|
||||||
.entry(key)
|
.entry(key)
|
||||||
.and_modify(|result| {
|
.and_modify(|result| {
|
||||||
@ -89,6 +165,18 @@ pub async fn aggregate(
|
|||||||
)
|
)
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
counter += 1
|
||||||
|
}
|
||||||
|
Err(error_type) => {
|
||||||
|
engine_errors_info.push(EngineErrorInfo::new(
|
||||||
|
error_type.downcast_ref::<EngineError>().unwrap(),
|
||||||
|
upstream_search_engines[counter].clone(),
|
||||||
|
));
|
||||||
|
counter += 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
Ok(SearchResults::new(
|
Ok(SearchResults::new(
|
||||||
result_map
|
result_map
|
||||||
@ -104,5 +192,6 @@ pub async fn aggregate(
|
|||||||
})
|
})
|
||||||
.collect(),
|
.collect(),
|
||||||
query.to_string(),
|
query.to_string(),
|
||||||
|
engine_errors_info,
|
||||||
))
|
))
|
||||||
}
|
}
|
||||||
|
@ -22,7 +22,7 @@ use serde::Deserialize;
|
|||||||
/// of the search url.
|
/// of the search url.
|
||||||
/// * `page` - It stores the search parameter `page` (or pageno in simple words)
|
/// * `page` - It stores the search parameter `page` (or pageno in simple words)
|
||||||
/// of the search url.
|
/// of the search url.
|
||||||
#[derive(Debug, Deserialize)]
|
#[derive(Deserialize)]
|
||||||
struct SearchParams {
|
struct SearchParams {
|
||||||
q: Option<String>,
|
q: Option<String>,
|
||||||
page: Option<u32>,
|
page: Option<u32>,
|
||||||
@ -51,6 +51,21 @@ pub async fn not_found(
|
|||||||
.body(page_content))
|
.body(page_content))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// A named struct which is used to deserialize the cookies fetched from the client side.
|
||||||
|
///
|
||||||
|
/// # Fields
|
||||||
|
///
|
||||||
|
/// * `theme` - It stores the theme name used in the website.
|
||||||
|
/// * `colorscheme` - It stores the colorscheme name used for the website theme.
|
||||||
|
/// * `engines` - It stores the upstream search engines selected by the user from the UI.
|
||||||
|
#[allow(dead_code)]
|
||||||
|
#[derive(Deserialize)]
|
||||||
|
struct Cookie {
|
||||||
|
theme: String,
|
||||||
|
colorscheme: String,
|
||||||
|
engines: Vec<String>,
|
||||||
|
}
|
||||||
|
|
||||||
/// Handles the route of search page of the `websurfx` meta search engine website and it takes
|
/// Handles the route of search page of the `websurfx` meta search engine website and it takes
|
||||||
/// two search url parameters `q` and `page` where `page` parameter is optional.
|
/// two search url parameters `q` and `page` where `page` parameter is optional.
|
||||||
///
|
///
|
||||||
@ -72,7 +87,6 @@ pub async fn search(
|
|||||||
config: web::Data<Config>,
|
config: web::Data<Config>,
|
||||||
) -> Result<HttpResponse, Box<dyn std::error::Error>> {
|
) -> Result<HttpResponse, Box<dyn std::error::Error>> {
|
||||||
let params = web::Query::<SearchParams>::from_query(req.query_string())?;
|
let params = web::Query::<SearchParams>::from_query(req.query_string())?;
|
||||||
|
|
||||||
match &params.q {
|
match &params.q {
|
||||||
Some(query) => {
|
Some(query) => {
|
||||||
if query.trim().is_empty() {
|
if query.trim().is_empty() {
|
||||||
@ -89,7 +103,7 @@ pub async fn search(
|
|||||||
"http://{}:{}/search?q={}&page={}",
|
"http://{}:{}/search?q={}&page={}",
|
||||||
config.binding_ip, config.port, query, page
|
config.binding_ip, config.port, query, page
|
||||||
);
|
);
|
||||||
let results_json = results(url, &config, query, page).await?;
|
let results_json = results(url, &config, query.to_string(), page, req).await?;
|
||||||
let page_content: String = hbs.render("search", &results_json)?;
|
let page_content: String = hbs.render("search", &results_json)?;
|
||||||
Ok(HttpResponse::Ok().body(page_content))
|
Ok(HttpResponse::Ok().body(page_content))
|
||||||
}
|
}
|
||||||
@ -104,23 +118,51 @@ pub async fn search(
|
|||||||
async fn results(
|
async fn results(
|
||||||
url: String,
|
url: String,
|
||||||
config: &Config,
|
config: &Config,
|
||||||
query: &str,
|
query: String,
|
||||||
page: u32,
|
page: u32,
|
||||||
|
req: HttpRequest,
|
||||||
) -> Result<SearchResults, Box<dyn std::error::Error>> {
|
) -> Result<SearchResults, Box<dyn std::error::Error>> {
|
||||||
//Initialize redis cache connection struct
|
//Initialize redis cache connection struct
|
||||||
let mut redis_cache = RedisCache::new(config.redis_url.clone())?;
|
let mut redis_cache = RedisCache::new(config.redis_url.clone())?;
|
||||||
// fetch the cached results json.
|
// fetch the cached results json.
|
||||||
let cached_results_json = redis_cache.cached_json(&url);
|
let cached_results_json = redis_cache.cached_json(&url);
|
||||||
// check if fetched results was indeed fetched or it was an error and if so
|
// check if fetched cache results was indeed fetched or it was an error and if so
|
||||||
// handle the data accordingly.
|
// handle the data accordingly.
|
||||||
match cached_results_json {
|
match cached_results_json {
|
||||||
Ok(results_json) => Ok(serde_json::from_str::<SearchResults>(&results_json).unwrap()),
|
Ok(results) => Ok(serde_json::from_str::<SearchResults>(&results).unwrap()),
|
||||||
Err(_) => {
|
Err(_) => {
|
||||||
let mut results_json: crate::results::aggregation_models::SearchResults =
|
// check if the cookie value is empty or not if it is empty then use the
|
||||||
aggregate(query, page, config.aggregator.random_delay, config.debug).await?;
|
// default selected upstream search engines from the config file otherwise
|
||||||
results_json.add_style(config.style.clone());
|
// parse the non-empty cookie and grab the user selected engines from the
|
||||||
redis_cache.cache_results(serde_json::to_string(&results_json)?, &url)?;
|
// UI and use that.
|
||||||
Ok(results_json)
|
let mut results: crate::results::aggregation_models::SearchResults = match req
|
||||||
|
.cookie("appCookie")
|
||||||
|
{
|
||||||
|
Some(cookie_value) => {
|
||||||
|
let cookie_value: Cookie = serde_json::from_str(cookie_value.name_value().1)?;
|
||||||
|
aggregate(
|
||||||
|
query,
|
||||||
|
page,
|
||||||
|
config.aggregator.random_delay,
|
||||||
|
config.debug,
|
||||||
|
cookie_value.engines,
|
||||||
|
)
|
||||||
|
.await?
|
||||||
|
}
|
||||||
|
None => {
|
||||||
|
aggregate(
|
||||||
|
query,
|
||||||
|
page,
|
||||||
|
config.aggregator.random_delay,
|
||||||
|
config.debug,
|
||||||
|
config.upstream_search_engines.clone(),
|
||||||
|
)
|
||||||
|
.await?
|
||||||
|
}
|
||||||
|
};
|
||||||
|
results.add_style(config.style.clone());
|
||||||
|
redis_cache.cache_results(serde_json::to_string(&results)?, &url)?;
|
||||||
|
Ok(results)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -5,7 +5,7 @@ debug = false -- an option to enable or disable debug mode.
|
|||||||
-- ### Server ###
|
-- ### Server ###
|
||||||
port = "8080" -- port on which server should be launched
|
port = "8080" -- port on which server should be launched
|
||||||
binding_ip = "127.0.0.1" -- ip address on which the server should be launched.
|
binding_ip = "127.0.0.1" -- ip address on which the server should be launched.
|
||||||
production_use = false -- whether to use production mode or not (in other words this option should be used if it is to be used to host it on the server to provide a service to a large number of users)
|
production_use = false -- whether to use production mode or not (in other words, enable this option when the server is hosted to provide a service to more than one user)
|
||||||
-- if production_use is set to true
|
-- if production_use is set to true
|
||||||
-- There will be a random delay before sending the request to the search engines, this is to prevent DDoSing the upstream search engines from a large number of simultaneous requests.
|
-- There will be a random delay before sending the request to the search engines, this is to prevent DDoSing the upstream search engines from a large number of simultaneous requests.
|
||||||
|
|
||||||
@ -26,3 +26,6 @@ theme = "simple" -- the theme name which should be used for the website
|
|||||||
|
|
||||||
-- ### Caching ###
|
-- ### Caching ###
|
||||||
redis_url = "redis://127.0.0.1:8082" -- redis connection url address to which the client should connect.
|
redis_url = "redis://127.0.0.1:8082" -- redis connection url address to which the client should connect.
|
||||||
|
|
||||||
|
-- ### Search Engines ###
|
||||||
|
upstream_search_engines = { DuckDuckGo = true, Searx = false } -- select the upstream search engines from which the results should be fetched.
|
||||||
|
Loading…
Reference in New Issue
Block a user