mirror of
https://github.com/neon-mmd/websurfx.git
synced 2024-11-23 14:38:21 -05:00
991f3f59de
* ♻️ refactor: initialize & store the config & cache structs as a constant (#486) - initializes & stores the config & cache structs as a static constant. - Pass the config & cache structs as a static reference to all the functions handling their respective route. * ⚡ perf: replace hashmaps with vectors for fetching & aggregating results (#486) - replace hashmaps with vectors for fetching, collecting & aggregating results as it tends to be contigous & cache efficient data structure. - refactor & redesign algorithms for fetching & aggregating results centered around vectors in aggregate function. * ➕ build: add the future crate (#486) * ⚡ perf: use `futureunordered` for collecting results fetched from the tokio spawn tasks (#486) - using the `futureunordered` instead of vector for collecting results reduces the time it takes to fetch the results as the results do not need to come in specific order so any result that gets fetched first gets collected in the `futureunordered` type. Co-authored-by: Spencerjibz <spencernajib2@gmail.com> * ⚡ perf: initialize new async connections parallely using tokio spawn tasks (#486) * ⚡ perf: initialize redis pipeline struct once with the default size of 3 (#486) * ⚡ perf: reduce branch predictions by reducing conditional code branches (#486) * ✅ test(unit): provide unit test for the `get_safesearch_level` function (#486) * ⚡ perf: reduce clones & use index based loop to improve search results filtering performance (#486) * 🚨 fix(clippy): make clippy/format checks happy (#486) * 🚨 fix(build): make the cargo build check happy (#486) * ⚡ perf: reduce the amount of clones, to_owneds & to_strings (#486) * ⚡ perf: use async crates & methods & make functions async (#486) * 🔖 chore(release): bump the app version (#486) --------- Co-authored-by: Spencerjibz <spencernajib2@gmail.com>
228 lines
8.9 KiB
Rust
228 lines
8.9 KiB
Rust
//! This module provides the error enum used to handle the different errors that can occur while
//! requesting data from the upstream search engines with the search query provided by the user.
|
|
|
|
use super::aggregation_models::SearchResult;
|
|
use error_stack::{Report, Result, ResultExt};
|
|
use reqwest::Client;
|
|
use std::fmt;
|
|
|
|
/// A custom error type used for handling engine associated errors.
#[derive(Debug)]
pub enum EngineError {
    /// No engine matches the requested name. The payload is the name that was
    /// asked for.
    NoSuchEngineFound(String),
    /// This variant handles the no results found error, i.e. the upstream
    /// search engine returned an empty result set.
    EmptyResultSet,
    /// This variant handles request related errors, i.e. failures while
    /// requesting data from the upstream search engine (sending the request
    /// or reading the response body).
    RequestError,
    /// This variant handles all the errors which are unexpected or occur rarely
    /// and are errors mostly related to failure in initialization of HeaderMap,
    /// Selector errors and all other errors occurring within the code handling
    /// the `upstream search engines`.
    UnexpectedError,
}
|
|
|
|
impl fmt::Display for EngineError {
|
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
match self {
|
|
EngineError::NoSuchEngineFound(engine) => {
|
|
write!(f, "No such engine with the name '{engine}' found")
|
|
}
|
|
EngineError::EmptyResultSet => {
|
|
write!(f, "The upstream search engine returned an empty result set")
|
|
}
|
|
EngineError::RequestError => {
|
|
write!(
|
|
f,
|
|
"Error occurred while requesting data from upstream search engine"
|
|
)
|
|
}
|
|
EngineError::UnexpectedError => {
|
|
write!(f, "An unexpected error occurred while processing the data")
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Empty `error_stack::Context` impl (nothing is overridden): it lets `EngineError` act as the
// context of a `Report`, which this file relies on through `change_context(EngineError::..)`
// and `Report::from(EngineError::..)`.
impl error_stack::Context for EngineError {}
|
|
|
|
/// A trait to define common behavior for all search engines.
|
|
#[async_trait::async_trait]
|
|
pub trait SearchEngine: Sync + Send {
|
|
/// This helper function fetches/requests the search results from the upstream search engine in
|
|
/// an html form.
|
|
///
|
|
/// # Arguments
|
|
///
|
|
/// * `url` - It takes the url of the upstream search engine with the user requested search
|
|
/// query appended in the search parameters.
|
|
/// * `header_map` - It takes the http request headers to be sent to the upstream engine in
|
|
/// order to prevent being detected as a bot. It takes the header as a HeaderMap type.
|
|
/// * `request_timeout` - It takes the request timeout value as seconds which is used to limit
|
|
/// the amount of time for each request to remain connected when until the results can be provided
|
|
/// by the upstream engine.
|
|
///
|
|
/// # Error
|
|
///
|
|
/// It returns the html data as a string if the upstream engine provides the data as expected
|
|
/// otherwise it returns a custom `EngineError`.
|
|
async fn fetch_html_from_upstream(
|
|
&self,
|
|
url: &str,
|
|
header_map: reqwest::header::HeaderMap,
|
|
client: &Client,
|
|
) -> Result<String, EngineError> {
|
|
// fetch the html from upstream search engine
|
|
Ok(client
|
|
.get(url)
|
|
.headers(header_map) // add spoofed headers to emulate human behavior
|
|
.send()
|
|
.await
|
|
.change_context(EngineError::RequestError)?
|
|
.text()
|
|
.await
|
|
.change_context(EngineError::RequestError)?)
|
|
}
|
|
|
|
/// This helper function fetches/requests the json search results from the upstream search engine as a vector of bytes.
|
|
///
|
|
/// # Arguments
|
|
///
|
|
/// * `url` - It takes the url of the upstream search engine with the user requested search
|
|
/// query appended in the search parameters.
|
|
/// * `header_map` - It takes the http request headers to be sent to the upstream engine in
|
|
/// order to prevent being detected as a bot. It takes the header as a HeaderMap type.
|
|
/// * `request_timeout` - It takes the request timeout value as seconds which is used to limit
|
|
/// the amount of time for each request to remain connected when until the results can be provided
|
|
/// by the upstream engine.
|
|
///
|
|
/// # Error
|
|
///
|
|
/// It returns the html data as a vector of bytes if the upstream engine provides the data as expected
|
|
/// otherwise it returns a custom `EngineError`.
|
|
async fn fetch_json_as_bytes_from_upstream(
|
|
&self,
|
|
url: &str,
|
|
header_map: reqwest::header::HeaderMap,
|
|
client: &Client,
|
|
) -> Result<Vec<u8>, EngineError> {
|
|
// fetch the json response from upstream search engine
|
|
|
|
Ok(client
|
|
.get(url)
|
|
.headers(header_map) // add spoofed headers to emulate human behavior
|
|
.send()
|
|
.await
|
|
.change_context(EngineError::RequestError)?
|
|
.bytes()
|
|
.await
|
|
.change_context(EngineError::RequestError)?
|
|
.to_vec())
|
|
}
|
|
|
|
/// This function scrapes results from the upstream engine and puts all the scraped results like
|
|
/// title, visiting_url (href in html),engine (from which engine it was fetched from) and description
|
|
/// in a RawSearchResult and then adds that to HashMap whose keys are url and values are RawSearchResult
|
|
/// struct and then returns it within a Result enum.
|
|
///
|
|
/// # Arguments
|
|
///
|
|
/// * `query` - Takes the user provided query to query to the upstream search engine with.
|
|
/// * `page` - Takes an u32 as an argument.
|
|
/// * `user_agent` - Takes a random user agent string as an argument.
|
|
/// * `request_timeout` - Takes a time (secs) as a value which controls the server request timeout.
|
|
///
|
|
/// # Errors
|
|
///
|
|
/// Returns an `EngineErrorKind` if the user is not connected to the internet or if their is failure to
|
|
/// reach the above `upstream search engine` page or if the `upstream search engine` is unable to
|
|
/// provide results for the requested search query and also returns error if the scraping selector
|
|
/// or HeaderMap fails to initialize.
|
|
async fn results(
|
|
&self,
|
|
query: &str,
|
|
page: u32,
|
|
user_agent: &str,
|
|
client: &Client,
|
|
safe_search: u8,
|
|
) -> Result<Vec<(String, SearchResult)>, EngineError>;
|
|
}
|
|
|
|
/// A named struct which stores the engine struct with the name of the associated engine.
|
|
pub struct EngineHandler {
|
|
/// It stores the engine struct wrapped in a box smart pointer as the engine struct implements
|
|
/// the `SearchEngine` trait.
|
|
engine: Box<dyn SearchEngine>,
|
|
/// It stores the name of the engine to which the struct is associated to.
|
|
name: &'static str,
|
|
}
|
|
|
|
impl Clone for EngineHandler {
|
|
fn clone(&self) -> Self {
|
|
Self::new(self.name).unwrap()
|
|
}
|
|
}
|
|
|
|
impl EngineHandler {
|
|
/// Parses an engine name into an engine handler.
|
|
///
|
|
/// # Arguments
|
|
///
|
|
/// * `engine_name` - It takes the name of the engine to which the struct was associated to.
|
|
///
|
|
/// # Returns
|
|
///
|
|
/// It returns an option either containing the value or a none if the engine is unknown
|
|
pub fn new(engine_name: &str) -> Result<Self, EngineError> {
|
|
let engine: (&'static str, Box<dyn SearchEngine>) =
|
|
match engine_name.to_lowercase().as_str() {
|
|
"duckduckgo" => {
|
|
let engine = crate::engines::duckduckgo::DuckDuckGo::new()?;
|
|
("duckduckgo", Box::new(engine))
|
|
}
|
|
"searx" => {
|
|
let engine = crate::engines::searx::Searx::new()?;
|
|
("searx", Box::new(engine))
|
|
}
|
|
"brave" => {
|
|
let engine = crate::engines::brave::Brave::new()?;
|
|
("brave", Box::new(engine))
|
|
}
|
|
"startpage" => {
|
|
let engine = crate::engines::startpage::Startpage::new()?;
|
|
("startpage", Box::new(engine))
|
|
}
|
|
"librex" => {
|
|
let engine = crate::engines::librex::LibreX::new()?;
|
|
("librex", Box::new(engine))
|
|
}
|
|
"mojeek" => {
|
|
let engine = crate::engines::mojeek::Mojeek::new()?;
|
|
("mojeek", Box::new(engine))
|
|
}
|
|
"bing" => {
|
|
let engine = crate::engines::bing::Bing::new()?;
|
|
("bing", Box::new(engine))
|
|
}
|
|
_ => {
|
|
return Err(Report::from(EngineError::NoSuchEngineFound(
|
|
engine_name.to_string(),
|
|
)))
|
|
}
|
|
};
|
|
|
|
Ok(Self {
|
|
engine: engine.1,
|
|
name: engine.0,
|
|
})
|
|
}
|
|
|
|
/// This function converts the EngineHandler type into a tuple containing the engine name and
|
|
/// the associated engine struct.
|
|
pub fn into_name_engine(self) -> (&'static str, Box<dyn SearchEngine>) {
|
|
(self.name, self.engine)
|
|
}
|
|
}
|