0
0
mirror of https://github.com/neon-mmd/websurfx.git synced 2024-11-22 05:58:21 -05:00

Create separate search_result_parser

This commit is contained in:
Zsombor Gegesy 2023-09-24 13:54:08 +02:00
parent 769d870803
commit 75a77d25f0
8 changed files with 110 additions and 62 deletions

View File

@ -111,7 +111,7 @@ impl Config {
.get::<_, HashMap<String, bool>>("upstream_search_engines")? .get::<_, HashMap<String, bool>>("upstream_search_engines")?
.into_iter() .into_iter()
.filter_map(|(key, value)| value.then_some(key)) .filter_map(|(key, value)| value.then_some(key))
.filter_map(|engine| crate::models::engine_models::EngineHandler::new(&engine)) .filter_map(|engine| crate::models::engine_models::EngineHandler::new(&engine).ok())
.collect(), .collect(),
request_timeout: globals.get::<_, u8>("request_timeout")?, request_timeout: globals.get::<_, u8>("request_timeout")?,
threads, threads,

View File

@ -5,7 +5,7 @@
use std::collections::HashMap; use std::collections::HashMap;
use reqwest::header::HeaderMap; use reqwest::header::HeaderMap;
use scraper::{Html, Selector}; use scraper::Html;
use crate::models::aggregation_models::SearchResult; use crate::models::aggregation_models::SearchResult;
@ -13,9 +13,27 @@ use crate::models::engine_models::{EngineError, SearchEngine};
use error_stack::{Report, Result, ResultExt}; use error_stack::{Report, Result, ResultExt};
use super::search_result_parser::SearchResultParser;
/// A new DuckDuckGo engine type defined in-order to implement the `SearchEngine` trait which allows to /// A new DuckDuckGo engine type defined in-order to implement the `SearchEngine` trait which allows to
/// reduce code duplication as well as allows to create vector of different search engines easily. /// reduce code duplication as well as allows to create vector of different search engines easily.
pub struct DuckDuckGo; pub struct DuckDuckGo {
parser: SearchResultParser,
}
impl DuckDuckGo {
    /// Creates a `DuckDuckGo` engine together with the selector parser it uses for scraping.
    ///
    /// # Errors
    ///
    /// Forwards the error produced by `SearchResultParser::new` when it rejects one of the
    /// CSS selectors listed below.
    pub fn new() -> Result<Self, EngineError> {
        // Arguments follow the parameter order of `SearchResultParser::new`.
        let parser = SearchResultParser::new(
            ".no-results",      // no-result marker
            ".result",          // one search result
            ".result__a",       // result title
            ".result__url",     // result url
            ".result__snippet", // result description
        )?;

        Ok(Self { parser })
    }
}
#[async_trait::async_trait] #[async_trait::async_trait]
impl SearchEngine for DuckDuckGo { impl SearchEngine for DuckDuckGo {
@ -59,34 +77,17 @@ impl SearchEngine for DuckDuckGo {
&DuckDuckGo::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?, &DuckDuckGo::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
); );
let no_result: Selector = Selector::parse(".no-results") if document.select(&self.parser.no_result).next().is_some() {
.map_err(|_| Report::new(EngineError::UnexpectedError))
.attach_printable_lazy(|| format!("invalid CSS selector: {}", ".no-results"))?;
if document.select(&no_result).next().is_some() {
return Err(Report::new(EngineError::EmptyResultSet)); return Err(Report::new(EngineError::EmptyResultSet));
} }
let results: Selector = Selector::parse(".result")
.map_err(|_| Report::new(EngineError::UnexpectedError))
.attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result"))?;
let result_title: Selector = Selector::parse(".result__a")
.map_err(|_| Report::new(EngineError::UnexpectedError))
.attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__a"))?;
let result_url: Selector = Selector::parse(".result__url")
.map_err(|_| Report::new(EngineError::UnexpectedError))
.attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__url"))?;
let result_desc: Selector = Selector::parse(".result__snippet")
.map_err(|_| Report::new(EngineError::UnexpectedError))
.attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__snippet"))?;
// scrape all the results from the html // scrape all the results from the html
Ok(document Ok(document
.select(&results) .select(&self.parser.results)
.map(|result| { .map(|result| {
SearchResult::new( SearchResult::new(
result result
.select(&result_title) .select(&self.parser.result_title)
.next() .next()
.unwrap() .unwrap()
.inner_html() .inner_html()
@ -94,7 +95,7 @@ impl SearchEngine for DuckDuckGo {
format!( format!(
"https://{}", "https://{}",
result result
.select(&result_url) .select(&self.parser.result_url)
.next() .next()
.unwrap() .unwrap()
.inner_html() .inner_html()
@ -102,7 +103,7 @@ impl SearchEngine for DuckDuckGo {
) )
.as_str(), .as_str(),
result result
.select(&result_desc) .select(&self.parser.result_desc)
.next() .next()
.unwrap() .unwrap()
.inner_html() .inner_html()

View File

@ -4,4 +4,5 @@
//! code. Moreover, it also provides a custom error for the upstream search engine handling code. //! code. Moreover, it also provides a custom error for the upstream search engine handling code.
pub mod duckduckgo; pub mod duckduckgo;
pub mod search_result_parser;
pub mod searx; pub mod searx;

View File

@ -0,0 +1,38 @@
use crate::models::engine_models::EngineError;
use error_stack::{Report, Result, ResultExt};
use scraper::{Html, Selector};
/// A bundle of pre-parsed CSS selectors used to pick search results out of an HTML page.
pub struct SearchResultParser {
    /// Selector built from the `no_result_selector` argument.
    pub no_result: Selector,
    /// Selector built from the `results_selector` argument.
    pub results: Selector,
    /// Selector built from the `result_title_selector` argument.
    pub result_title: Selector,
    /// Selector built from the `result_url_selector` argument.
    pub result_url: Selector,
    /// Selector built from the `result_desc_selector` argument.
    pub result_desc: Selector,
}

impl SearchResultParser {
    /// Parses the five given CSS selector strings and stores the resulting selectors.
    ///
    /// # Arguments
    ///
    /// * `no_result_selector` - selector text stored as `no_result`.
    /// * `results_selector` - selector text stored as `results`.
    /// * `result_title_selector` - selector text stored as `result_title`.
    /// * `result_url_selector` - selector text stored as `result_url`.
    /// * `result_desc_selector` - selector text stored as `result_desc`.
    ///
    /// # Errors
    ///
    /// Returns the error from `new_selector` for the first selector string (in the order
    /// listed above) that cannot be parsed.
    pub fn new(
        no_result_selector: &str,
        results_selector: &str,
        result_title_selector: &str,
        result_url_selector: &str,
        result_desc_selector: &str,
    ) -> Result<SearchResultParser, EngineError> {
        // Parse in declaration order so the first invalid selector is the one reported.
        let no_result = new_selector(no_result_selector)?;
        let results = new_selector(results_selector)?;
        let result_title = new_selector(result_title_selector)?;
        let result_url = new_selector(result_url_selector)?;
        let result_desc = new_selector(result_desc_selector)?;

        Ok(Self {
            no_result,
            results,
            result_title,
            result_url,
            result_desc,
        })
    }
}
/// Parses `selector` into a `Selector`.
///
/// # Errors
///
/// When parsing fails, returns a report of `EngineError::UnexpectedError` with a printable
/// attachment that names the offending selector text and the debug form of the parse error.
fn new_selector(selector: &str) -> Result<Selector, EngineError> {
    match Selector::parse(selector) {
        Ok(parsed) => Ok(parsed),
        Err(err) => {
            // The message is only built on the failure path.
            let details = format!("invalid CSS selector: {}, err: {:?}", selector, err);
            Err(Report::new(EngineError::UnexpectedError).attach_printable(details))
        }
    }
}

View File

@ -3,16 +3,34 @@
//! number if provided. //! number if provided.
use reqwest::header::HeaderMap; use reqwest::header::HeaderMap;
use scraper::{Html, Selector}; use scraper::Html;
use std::collections::HashMap; use std::collections::HashMap;
use super::search_result_parser::SearchResultParser;
use crate::models::aggregation_models::SearchResult; use crate::models::aggregation_models::SearchResult;
use crate::models::engine_models::{EngineError, SearchEngine}; use crate::models::engine_models::{EngineError, SearchEngine};
use error_stack::{Report, Result, ResultExt}; use error_stack::{Report, Result, ResultExt};
/// A new Searx engine type defined in-order to implement the `SearchEngine` trait which allows to /// A new Searx engine type defined in-order to implement the `SearchEngine` trait which allows to
/// reduce code duplication as well as allows to create vector of different search engines easily. /// reduce code duplication as well as allows to create vector of different search engines easily.
pub struct Searx; pub struct Searx {
parser: SearchResultParser,
}
impl Searx {
    /// Creates a `Searx` engine together with the selector parser it uses for scraping.
    ///
    /// # Errors
    ///
    /// Forwards the error produced by `SearchResultParser::new` when it rejects one of the
    /// CSS selectors listed below.
    pub fn new() -> Result<Searx, EngineError> {
        // Arguments follow the parameter order of `SearchResultParser::new`; the title and
        // the url are both taken from the same `h3>a` element.
        let parser = SearchResultParser::new(
            "#urls>.dialog-error>p", // no-result marker
            ".result",               // one search result
            "h3>a",                  // result title
            "h3>a",                  // result url
            ".content",              // result description
        )?;

        Ok(Self { parser })
    }
}
#[async_trait::async_trait] #[async_trait::async_trait]
impl SearchEngine for Searx { impl SearchEngine for Searx {
@ -52,13 +70,7 @@ impl SearchEngine for Searx {
&Searx::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?, &Searx::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
); );
let no_result: Selector = Selector::parse("#urls>.dialog-error>p") if let Some(no_result_msg) = document.select(&self.parser.no_result).nth(1) {
.map_err(|_| Report::new(EngineError::UnexpectedError))
.attach_printable_lazy(|| {
format!("invalid CSS selector: {}", "#urls>.dialog-error>p")
})?;
if let Some(no_result_msg) = document.select(&no_result).nth(1) {
if no_result_msg.inner_html() if no_result_msg.inner_html()
== "we didn't find any results. Please use another query or search in more categories" == "we didn't find any results. Please use another query or search in more categories"
{ {
@ -66,40 +78,26 @@ impl SearchEngine for Searx {
} }
} }
let results: Selector = Selector::parse(".result")
.map_err(|_| Report::new(EngineError::UnexpectedError))
.attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result"))?;
let result_title: Selector = Selector::parse("h3>a")
.map_err(|_| Report::new(EngineError::UnexpectedError))
.attach_printable_lazy(|| format!("invalid CSS selector: {}", "h3>a"))?;
let result_url: Selector = Selector::parse("h3>a")
.map_err(|_| Report::new(EngineError::UnexpectedError))
.attach_printable_lazy(|| format!("invalid CSS selector: {}", "h3>a"))?;
let result_desc: Selector = Selector::parse(".content")
.map_err(|_| Report::new(EngineError::UnexpectedError))
.attach_printable_lazy(|| format!("invalid CSS selector: {}", ".content"))?;
// scrape all the results from the html // scrape all the results from the html
Ok(document Ok(document
.select(&results) .select(&self.parser.results)
.map(|result| { .map(|result| {
SearchResult::new( SearchResult::new(
result result
.select(&result_title) .select(&self.parser.result_title)
.next() .next()
.unwrap() .unwrap()
.inner_html() .inner_html()
.trim(), .trim(),
result result
.select(&result_url) .select(&self.parser.result_url)
.next() .next()
.unwrap() .unwrap()
.value() .value()
.attr("href") .attr("href")
.unwrap(), .unwrap(),
result result
.select(&result_desc) .select(&self.parser.result_desc)
.next() .next()
.unwrap() .unwrap()
.inner_html() .inner_html()

View File

@ -85,12 +85,14 @@ impl EngineErrorInfo {
pub fn new(error: &EngineError, engine: &str) -> Self { pub fn new(error: &EngineError, engine: &str) -> Self {
Self { Self {
error: match error { error: match error {
EngineError::EngineNotFound => "EngineNotFound".to_owned(),
EngineError::RequestError => "RequestError".to_owned(), EngineError::RequestError => "RequestError".to_owned(),
EngineError::EmptyResultSet => "EmptyResultSet".to_owned(), EngineError::EmptyResultSet => "EmptyResultSet".to_owned(),
EngineError::UnexpectedError => "UnexpectedError".to_owned(), EngineError::UnexpectedError => "UnexpectedError".to_owned(),
}, },
engine: engine.to_owned(), engine: engine.to_owned(),
severity_color: match error { severity_color: match error {
EngineError::EngineNotFound => "red".to_owned(),
EngineError::RequestError => "green".to_owned(), EngineError::RequestError => "green".to_owned(),
EngineError::EmptyResultSet => "blue".to_owned(), EngineError::EmptyResultSet => "blue".to_owned(),
EngineError::UnexpectedError => "red".to_owned(), EngineError::UnexpectedError => "red".to_owned(),

View File

@ -2,12 +2,14 @@
//! the upstream search engines with the search query provided by the user. //! the upstream search engines with the search query provided by the user.
use super::aggregation_models::SearchResult; use super::aggregation_models::SearchResult;
use error_stack::{Result, ResultExt}; use error_stack::{Report, Result, ResultExt};
use std::{collections::HashMap, fmt, time::Duration}; use std::{collections::HashMap, fmt, time::Duration};
/// A custom error type used for handle engine associated errors. /// A custom error type used for handle engine associated errors.
#[derive(Debug)] #[derive(Debug)]
pub enum EngineError { pub enum EngineError {
/// No search engine matches the requested engine name.
EngineNotFound,
/// This variant handles all request related errors like forbidden, not found, /// This variant handles all request related errors like forbidden, not found,
/// etc. /// etc.
EmptyResultSet, EmptyResultSet,
@ -24,6 +26,9 @@ pub enum EngineError {
impl fmt::Display for EngineError { impl fmt::Display for EngineError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self { match self {
EngineError::EngineNotFound => {
write!(f, "Search engine not found")
}
EngineError::EmptyResultSet => { EngineError::EmptyResultSet => {
write!(f, "The upstream search engine returned an empty result set") write!(f, "The upstream search engine returned an empty result set")
} }
@ -134,18 +139,21 @@ impl EngineHandler {
/// # Returns /// # Returns
/// ///
/// It returns an option either containing the value or a none if the engine is unknown /// It returns an option either containing the value or a none if the engine is unknown
pub fn new(engine_name: &str) -> Option<Self> { pub fn new(engine_name: &str) -> Result<Self, EngineError> {
let engine: (&'static str, Box<dyn SearchEngine>) = let engine: (&'static str, Box<dyn SearchEngine>) =
match engine_name.to_lowercase().as_str() { match engine_name.to_lowercase().as_str() {
"duckduckgo" => ( "duckduckgo" => {
"duckduckgo", let engine = crate::engines::duckduckgo::DuckDuckGo::new()?;
Box::new(crate::engines::duckduckgo::DuckDuckGo), ("duckduckgo", Box::new(engine))
), }
"searx" => ("searx", Box::new(crate::engines::searx::Searx)), "searx" => {
_ => return None, let engine = crate::engines::searx::Searx::new()?;
("searx", Box::new(engine))
}
_ => return Err(Report::from(EngineError::EngineNotFound)),
}; };
Some(Self { Ok(Self {
engine: engine.1, engine: engine.1,
name: engine.0, name: engine.0,
}) })

View File

@ -191,7 +191,7 @@ async fn results(
let engines: Vec<EngineHandler> = cookie_value let engines: Vec<EngineHandler> = cookie_value
.engines .engines
.iter() .iter()
.filter_map(|name| EngineHandler::new(name)) .filter_map(|name| EngineHandler::new(name).ok())
.collect(); .collect();
safe_search_level = match config.safe_search { safe_search_level = match config.safe_search {