2023-05-31 12:54:51 -04:00
|
|
|
//! This module provides the error enum used to handle the different errors that can occur while
//! requesting data from the upstream search engines with the search query provided by the user.
|
|
|
|
|
2023-09-03 13:50:50 -04:00
|
|
|
use super::aggregation_models::SearchResult;
|
2023-09-24 07:54:08 -04:00
|
|
|
use error_stack::{Report, Result, ResultExt};
|
2023-11-20 07:27:49 -05:00
|
|
|
use reqwest::Client;
|
2024-03-11 05:01:30 -04:00
|
|
|
use std::fmt;
|
2023-05-31 12:54:51 -04:00
|
|
|
|
|
|
|
/// A custom error type used for handling engine associated errors.
#[derive(Debug)]
pub enum EngineError {
    /// No matching engine found. The payload is the engine name that was requested.
    NoSuchEngineFound(String),
    /// This variant handles the no results found error provided by the upstream
    /// search engines.
    EmptyResultSet,
    /// This variant handles all request related errors like forbidden, not found,
    /// etc.
    RequestError,
    /// This variant handles all the errors which are unexpected or occur rarely
    /// and are errors mostly related to failure in initialization of HeaderMap,
    /// Selector errors and all other errors occurring within the code handling
    /// the `upstream search engines`.
    UnexpectedError,
}
|
|
|
|
|
2023-06-14 08:42:30 -04:00
|
|
|
impl fmt::Display for EngineError {
|
|
|
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
2023-05-31 12:54:51 -04:00
|
|
|
match self {
|
2023-10-08 16:30:31 -04:00
|
|
|
EngineError::NoSuchEngineFound(engine) => {
|
|
|
|
write!(f, "No such engine with the name '{engine}' found")
|
2023-09-24 07:54:08 -04:00
|
|
|
}
|
2023-06-14 08:42:30 -04:00
|
|
|
EngineError::EmptyResultSet => {
|
2023-05-31 12:54:51 -04:00
|
|
|
write!(f, "The upstream search engine returned an empty result set")
|
|
|
|
}
|
2023-06-14 08:42:30 -04:00
|
|
|
EngineError::RequestError => {
|
2023-06-14 18:48:37 -04:00
|
|
|
write!(
|
|
|
|
f,
|
|
|
|
"Error occurred while requesting data from upstream search engine"
|
|
|
|
)
|
|
|
|
}
|
|
|
|
EngineError::UnexpectedError => {
|
|
|
|
write!(f, "An unexpected error occurred while processing the data")
|
2023-06-04 04:56:07 -04:00
|
|
|
}
|
2023-05-31 12:54:51 -04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-07-11 12:41:34 -04:00
|
|
|
// Marker implementation: lets `EngineError` be used as the context type of an
// `error_stack::Report` (see the `change_context`/`Report::from` calls in this file).
impl error_stack::Context for EngineError {}
|
|
|
|
|
2023-08-17 16:48:20 -04:00
|
|
|
/// A trait to define common behavior for all search engines.
|
2023-07-11 12:41:34 -04:00
|
|
|
#[async_trait::async_trait]
|
2023-08-18 04:43:53 -04:00
|
|
|
pub trait SearchEngine: Sync + Send {
|
2023-09-03 12:23:34 -04:00
|
|
|
/// This helper function fetches/requests the search results from the upstream search engine in
|
|
|
|
/// an html form.
|
|
|
|
///
|
|
|
|
/// # Arguments
|
|
|
|
///
|
|
|
|
/// * `url` - It takes the url of the upstream search engine with the user requested search
|
|
|
|
/// query appended in the search parameters.
|
|
|
|
/// * `header_map` - It takes the http request headers to be sent to the upstream engine in
|
|
|
|
/// order to prevent being detected as a bot. It takes the header as a HeaderMap type.
|
|
|
|
/// * `request_timeout` - It takes the request timeout value as seconds which is used to limit
|
|
|
|
/// the amount of time for each request to remain connected when until the results can be provided
|
|
|
|
/// by the upstream engine.
|
|
|
|
///
|
|
|
|
/// # Error
|
|
|
|
///
|
|
|
|
/// It returns the html data as a string if the upstream engine provides the data as expected
|
|
|
|
/// otherwise it returns a custom `EngineError`.
|
2023-07-11 12:41:34 -04:00
|
|
|
async fn fetch_html_from_upstream(
|
|
|
|
&self,
|
2023-08-27 14:00:22 -04:00
|
|
|
url: &str,
|
2023-07-11 12:41:34 -04:00
|
|
|
header_map: reqwest::header::HeaderMap,
|
2023-11-20 07:27:49 -05:00
|
|
|
client: &Client,
|
2023-07-11 12:41:34 -04:00
|
|
|
) -> Result<String, EngineError> {
|
|
|
|
// fetch the html from upstream search engine
|
2023-11-20 07:27:49 -05:00
|
|
|
Ok(client
|
2023-07-11 12:41:34 -04:00
|
|
|
.get(url)
|
2023-08-17 16:48:20 -04:00
|
|
|
.headers(header_map) // add spoofed headers to emulate human behavior
|
2023-07-11 12:41:34 -04:00
|
|
|
.send()
|
|
|
|
.await
|
|
|
|
.change_context(EngineError::RequestError)?
|
|
|
|
.text()
|
|
|
|
.await
|
|
|
|
.change_context(EngineError::RequestError)?)
|
|
|
|
}
|
|
|
|
|
2024-01-30 08:37:50 -05:00
|
|
|
/// This helper function fetches/requests the json search results from the upstream search engine as a vector of bytes.
|
|
|
|
///
|
|
|
|
/// # Arguments
|
|
|
|
///
|
|
|
|
/// * `url` - It takes the url of the upstream search engine with the user requested search
|
|
|
|
/// query appended in the search parameters.
|
|
|
|
/// * `header_map` - It takes the http request headers to be sent to the upstream engine in
|
|
|
|
/// order to prevent being detected as a bot. It takes the header as a HeaderMap type.
|
|
|
|
/// * `request_timeout` - It takes the request timeout value as seconds which is used to limit
|
|
|
|
/// the amount of time for each request to remain connected when until the results can be provided
|
|
|
|
/// by the upstream engine.
|
|
|
|
///
|
|
|
|
/// # Error
|
|
|
|
///
|
|
|
|
/// It returns the html data as a vector of bytes if the upstream engine provides the data as expected
|
|
|
|
/// otherwise it returns a custom `EngineError`.
|
|
|
|
async fn fetch_json_as_bytes_from_upstream(
|
|
|
|
&self,
|
|
|
|
url: &str,
|
|
|
|
header_map: reqwest::header::HeaderMap,
|
|
|
|
client: &Client,
|
|
|
|
) -> Result<Vec<u8>, EngineError> {
|
|
|
|
// fetch the json response from upstream search engine
|
|
|
|
|
|
|
|
Ok(client
|
|
|
|
.get(url)
|
|
|
|
.headers(header_map) // add spoofed headers to emulate human behavior
|
|
|
|
.send()
|
|
|
|
.await
|
|
|
|
.change_context(EngineError::RequestError)?
|
|
|
|
.bytes()
|
|
|
|
.await
|
|
|
|
.change_context(EngineError::RequestError)?
|
|
|
|
.to_vec())
|
|
|
|
}
|
|
|
|
|
2023-09-03 12:23:34 -04:00
|
|
|
/// This function scrapes results from the upstream engine and puts all the scraped results like
|
|
|
|
/// title, visiting_url (href in html),engine (from which engine it was fetched from) and description
|
|
|
|
/// in a RawSearchResult and then adds that to HashMap whose keys are url and values are RawSearchResult
|
|
|
|
/// struct and then returns it within a Result enum.
|
|
|
|
///
|
|
|
|
/// # Arguments
|
|
|
|
///
|
|
|
|
/// * `query` - Takes the user provided query to query to the upstream search engine with.
|
|
|
|
/// * `page` - Takes an u32 as an argument.
|
|
|
|
/// * `user_agent` - Takes a random user agent string as an argument.
|
|
|
|
/// * `request_timeout` - Takes a time (secs) as a value which controls the server request timeout.
|
|
|
|
///
|
|
|
|
/// # Errors
|
|
|
|
///
|
|
|
|
/// Returns an `EngineErrorKind` if the user is not connected to the internet or if their is failure to
|
|
|
|
/// reach the above `upstream search engine` page or if the `upstream search engine` is unable to
|
|
|
|
/// provide results for the requested search query and also returns error if the scraping selector
|
|
|
|
/// or HeaderMap fails to initialize.
|
2023-07-11 12:41:34 -04:00
|
|
|
async fn results(
|
|
|
|
&self,
|
2023-08-27 14:00:22 -04:00
|
|
|
query: &str,
|
2023-07-11 12:41:34 -04:00
|
|
|
page: u32,
|
2023-08-27 14:00:22 -04:00
|
|
|
user_agent: &str,
|
2023-11-20 07:27:49 -05:00
|
|
|
client: &Client,
|
2023-09-02 10:45:17 -04:00
|
|
|
safe_search: u8,
|
2024-03-11 05:01:30 -04:00
|
|
|
) -> Result<Vec<(String, SearchResult)>, EngineError>;
|
2023-08-18 04:43:53 -04:00
|
|
|
}
|
|
|
|
|
2023-09-03 12:23:34 -04:00
|
|
|
/// A named struct which stores the engine struct with the name of the associated engine.
|
2023-08-18 04:43:53 -04:00
|
|
|
pub struct EngineHandler {
|
2023-09-03 12:23:34 -04:00
|
|
|
/// It stores the engine struct wrapped in a box smart pointer as the engine struct implements
|
|
|
|
/// the `SearchEngine` trait.
|
2023-08-18 04:43:53 -04:00
|
|
|
engine: Box<dyn SearchEngine>,
|
2023-09-03 12:23:34 -04:00
|
|
|
/// It stores the name of the engine to which the struct is associated to.
|
2023-08-18 04:43:53 -04:00
|
|
|
name: &'static str,
|
|
|
|
}
|
|
|
|
|
|
|
|
impl Clone for EngineHandler {
|
|
|
|
fn clone(&self) -> Self {
|
|
|
|
Self::new(self.name).unwrap()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
impl EngineHandler {
|
2023-09-03 12:23:34 -04:00
|
|
|
/// Parses an engine name into an engine handler.
|
|
|
|
///
|
|
|
|
/// # Arguments
|
|
|
|
///
|
|
|
|
/// * `engine_name` - It takes the name of the engine to which the struct was associated to.
|
|
|
|
///
|
|
|
|
/// # Returns
|
|
|
|
///
|
|
|
|
/// It returns an option either containing the value or a none if the engine is unknown
|
2023-09-24 07:54:08 -04:00
|
|
|
pub fn new(engine_name: &str) -> Result<Self, EngineError> {
|
2023-08-18 04:43:53 -04:00
|
|
|
let engine: (&'static str, Box<dyn SearchEngine>) =
|
|
|
|
match engine_name.to_lowercase().as_str() {
|
2023-09-24 07:54:08 -04:00
|
|
|
"duckduckgo" => {
|
|
|
|
let engine = crate::engines::duckduckgo::DuckDuckGo::new()?;
|
|
|
|
("duckduckgo", Box::new(engine))
|
|
|
|
}
|
|
|
|
"searx" => {
|
|
|
|
let engine = crate::engines::searx::Searx::new()?;
|
|
|
|
("searx", Box::new(engine))
|
|
|
|
}
|
2023-10-16 13:25:15 -04:00
|
|
|
"brave" => {
|
|
|
|
let engine = crate::engines::brave::Brave::new()?;
|
|
|
|
("brave", Box::new(engine))
|
|
|
|
}
|
2023-12-05 12:47:28 -05:00
|
|
|
"startpage" => {
|
|
|
|
let engine = crate::engines::startpage::Startpage::new()?;
|
|
|
|
("startpage", Box::new(engine))
|
|
|
|
}
|
2023-12-09 06:25:28 -05:00
|
|
|
"librex" => {
|
|
|
|
let engine = crate::engines::librex::LibreX::new()?;
|
|
|
|
("librex", Box::new(engine))
|
|
|
|
}
|
2023-12-29 11:21:06 -05:00
|
|
|
"mojeek" => {
|
|
|
|
let engine = crate::engines::mojeek::Mojeek::new()?;
|
|
|
|
("mojeek", Box::new(engine))
|
|
|
|
}
|
2024-01-01 06:57:31 -05:00
|
|
|
"bing" => {
|
|
|
|
let engine = crate::engines::bing::Bing::new()?;
|
|
|
|
("bing", Box::new(engine))
|
|
|
|
}
|
2023-10-08 16:30:31 -04:00
|
|
|
_ => {
|
|
|
|
return Err(Report::from(EngineError::NoSuchEngineFound(
|
|
|
|
engine_name.to_string(),
|
|
|
|
)))
|
|
|
|
}
|
2023-08-18 04:43:53 -04:00
|
|
|
};
|
|
|
|
|
2023-09-24 07:54:08 -04:00
|
|
|
Ok(Self {
|
2023-08-18 04:43:53 -04:00
|
|
|
engine: engine.1,
|
|
|
|
name: engine.0,
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
2023-09-03 12:23:34 -04:00
|
|
|
/// This function converts the EngineHandler type into a tuple containing the engine name and
|
|
|
|
/// the associated engine struct.
|
2023-08-18 04:43:53 -04:00
|
|
|
pub fn into_name_engine(self) -> (&'static str, Box<dyn SearchEngine>) {
|
|
|
|
(self.name, self.engine)
|
|
|
|
}
|
2023-07-11 12:41:34 -04:00
|
|
|
}
|