mirror of
https://github.com/neon-mmd/websurfx.git
synced 2024-12-22 04:18:21 -05:00
Merge pull request #273 from gzsombor/search-result-parse-refactor
⚙️ Rewrite the search result parser code to make the engine code more concise
This commit is contained in:
commit
40b7e6d2ea
@ -3,6 +3,7 @@
|
||||
|
||||
use crate::handler::paths::{file_path, FileType};
|
||||
|
||||
use crate::models::engine_models::{EngineError, EngineHandler};
|
||||
use crate::models::parser_models::{AggregatorConfig, RateLimiter, Style};
|
||||
use log::LevelFilter;
|
||||
use mlua::Lua;
|
||||
@ -28,7 +29,7 @@ pub struct Config {
|
||||
/// It stores the option of whether to enable or disable debug mode.
|
||||
pub debug: bool,
|
||||
/// It stores all the engine names that were enabled by the user.
|
||||
pub upstream_search_engines: Vec<crate::models::engine_models::EngineHandler>,
|
||||
pub upstream_search_engines: Vec<EngineHandler>,
|
||||
/// It stores the time (secs) which controls the server request timeout.
|
||||
pub request_timeout: u8,
|
||||
/// It stores the number of threads that the app will use to run.
|
||||
@ -111,8 +112,8 @@ impl Config {
|
||||
.get::<_, HashMap<String, bool>>("upstream_search_engines")?
|
||||
.into_iter()
|
||||
.filter_map(|(key, value)| value.then_some(key))
|
||||
.filter_map(|engine| crate::models::engine_models::EngineHandler::new(&engine))
|
||||
.collect(),
|
||||
.map(|engine| EngineHandler::new(&engine))
|
||||
.collect::<Result<Vec<EngineHandler>, error_stack::Report<EngineError>>>()?,
|
||||
request_timeout: globals.get::<_, u8>("request_timeout")?,
|
||||
threads,
|
||||
rate_limiter: RateLimiter {
|
||||
|
@ -5,7 +5,7 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use reqwest::header::HeaderMap;
|
||||
use scraper::{Html, Selector};
|
||||
use scraper::Html;
|
||||
|
||||
use crate::models::aggregation_models::SearchResult;
|
||||
|
||||
@ -13,9 +13,29 @@ use crate::models::engine_models::{EngineError, SearchEngine};
|
||||
|
||||
use error_stack::{Report, Result, ResultExt};
|
||||
|
||||
use super::search_result_parser::SearchResultParser;
|
||||
|
||||
/// A new DuckDuckGo engine type defined in-order to implement the `SearchEngine` trait which allows to
|
||||
/// reduce code duplication as well as allows to create vector of different search engines easily.
|
||||
pub struct DuckDuckGo;
|
||||
pub struct DuckDuckGo {
|
||||
/// The parser, used to interpret the search result.
|
||||
parser: SearchResultParser,
|
||||
}
|
||||
|
||||
impl DuckDuckGo {
|
||||
/// Creates the DuckDuckGo parser.
|
||||
pub fn new() -> Result<Self, EngineError> {
|
||||
Ok(Self {
|
||||
parser: SearchResultParser::new(
|
||||
".no-results",
|
||||
".result",
|
||||
".result__a",
|
||||
".result__url",
|
||||
".result__snippet",
|
||||
)?,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl SearchEngine for DuckDuckGo {
|
||||
@ -59,58 +79,19 @@ impl SearchEngine for DuckDuckGo {
|
||||
&DuckDuckGo::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
|
||||
);
|
||||
|
||||
let no_result: Selector = Selector::parse(".no-results")
|
||||
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
||||
.attach_printable_lazy(|| format!("invalid CSS selector: {}", ".no-results"))?;
|
||||
|
||||
if document.select(&no_result).next().is_some() {
|
||||
if self.parser.parse_for_no_results(&document).next().is_some() {
|
||||
return Err(Report::new(EngineError::EmptyResultSet));
|
||||
}
|
||||
|
||||
let results: Selector = Selector::parse(".result")
|
||||
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
||||
.attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result"))?;
|
||||
let result_title: Selector = Selector::parse(".result__a")
|
||||
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
||||
.attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__a"))?;
|
||||
let result_url: Selector = Selector::parse(".result__url")
|
||||
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
||||
.attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__url"))?;
|
||||
let result_desc: Selector = Selector::parse(".result__snippet")
|
||||
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
||||
.attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__snippet"))?;
|
||||
|
||||
// scrape all the results from the html
|
||||
Ok(document
|
||||
.select(&results)
|
||||
.map(|result| {
|
||||
SearchResult::new(
|
||||
result
|
||||
.select(&result_title)
|
||||
.next()
|
||||
.unwrap()
|
||||
.inner_html()
|
||||
.trim(),
|
||||
format!(
|
||||
"https://{}",
|
||||
result
|
||||
.select(&result_url)
|
||||
.next()
|
||||
.unwrap()
|
||||
.inner_html()
|
||||
.trim()
|
||||
)
|
||||
.as_str(),
|
||||
result
|
||||
.select(&result_desc)
|
||||
.next()
|
||||
.unwrap()
|
||||
.inner_html()
|
||||
.trim(),
|
||||
self.parser
|
||||
.parse_for_results(&document, |title, url, desc| {
|
||||
Some(SearchResult::new(
|
||||
title.inner_html().trim(),
|
||||
&format!("https://{}", url.inner_html().trim()),
|
||||
desc.inner_html().trim(),
|
||||
&["duckduckgo"],
|
||||
)
|
||||
))
|
||||
})
|
||||
.map(|search_result| (search_result.url.clone(), search_result))
|
||||
.collect())
|
||||
}
|
||||
}
|
||||
|
@ -4,4 +4,5 @@
|
||||
//! code. Moreover, it also provides a custom error for the upstream search engine handling code.
|
||||
|
||||
pub mod duckduckgo;
|
||||
pub mod search_result_parser;
|
||||
pub mod searx;
|
||||
|
76
src/engines/search_result_parser.rs
Normal file
76
src/engines/search_result_parser.rs
Normal file
@ -0,0 +1,76 @@
|
||||
//! This module provides helper functionality for parsing an HTML document into internal `SearchResult`s.
|
||||
use std::collections::HashMap;
|
||||
|
||||
use crate::models::{aggregation_models::SearchResult, engine_models::EngineError};
|
||||
use error_stack::{Report, Result};
|
||||
use scraper::{html::Select, ElementRef, Html, Selector};
|
||||
|
||||
/// A html search result parser, based on a predefined CSS selectors.
|
||||
pub struct SearchResultParser {
|
||||
/// selector to locate the element which is displayed, if there were nothing found.
|
||||
no_result: Selector,
|
||||
/// selector to locate the element which contains one item from the search result.
|
||||
results: Selector,
|
||||
/// selector to locate the title relative to the search result item.
|
||||
result_title: Selector,
|
||||
/// selector to locate the url relative to the search result item.
|
||||
result_url: Selector,
|
||||
/// selector to locate the description relative to the search result item.
|
||||
result_desc: Selector,
|
||||
}
|
||||
|
||||
impl SearchResultParser {
|
||||
/// Creates a new parser, if all the selectors are valid, otherwise it returns an EngineError
|
||||
pub fn new(
|
||||
no_result_selector: &str,
|
||||
results_selector: &str,
|
||||
result_title_selector: &str,
|
||||
result_url_selector: &str,
|
||||
result_desc_selector: &str,
|
||||
) -> Result<SearchResultParser, EngineError> {
|
||||
Ok(SearchResultParser {
|
||||
no_result: new_selector(no_result_selector)?,
|
||||
results: new_selector(results_selector)?,
|
||||
result_title: new_selector(result_title_selector)?,
|
||||
result_url: new_selector(result_url_selector)?,
|
||||
result_desc: new_selector(result_desc_selector)?,
|
||||
})
|
||||
}
|
||||
|
||||
/// Parse the html and returns element representing the 'no result found' response.
|
||||
pub fn parse_for_no_results<'a>(&'a self, document: &'a Html) -> Select<'a, 'a> {
|
||||
document.select(&self.no_result)
|
||||
}
|
||||
|
||||
/// Parse the html, and convert the results to SearchResult with the help of the builder function
|
||||
pub fn parse_for_results(
|
||||
&self,
|
||||
document: &Html,
|
||||
builder: impl Fn(&ElementRef<'_>, &ElementRef<'_>, &ElementRef<'_>) -> Option<SearchResult>,
|
||||
) -> Result<HashMap<String, SearchResult>, EngineError> {
|
||||
let res = document
|
||||
.select(&self.results)
|
||||
.filter_map(|result| {
|
||||
let title = result.select(&self.result_title).next();
|
||||
let url = result.select(&self.result_url).next();
|
||||
let desc = result.select(&self.result_desc).next();
|
||||
match (title, url, desc) {
|
||||
(Some(ref t), Some(ref u), Some(ref d)) => builder(t, u, d),
|
||||
_ => None,
|
||||
}
|
||||
})
|
||||
.map(|search_result| (search_result.url.clone(), search_result))
|
||||
.collect();
|
||||
Ok(res)
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a Selector struct, if the given parameter is a valid css expression, otherwise convert it into an EngineError.
|
||||
fn new_selector(selector: &str) -> Result<Selector, EngineError> {
|
||||
Selector::parse(selector).map_err(|err| {
|
||||
Report::new(EngineError::UnexpectedError).attach_printable(format!(
|
||||
"invalid CSS selector: {}, err: {:?}",
|
||||
selector, err
|
||||
))
|
||||
})
|
||||
}
|
@ -3,16 +3,35 @@
|
||||
//! number if provided.
|
||||
|
||||
use reqwest::header::HeaderMap;
|
||||
use scraper::{Html, Selector};
|
||||
use scraper::Html;
|
||||
use std::collections::HashMap;
|
||||
|
||||
use super::search_result_parser::SearchResultParser;
|
||||
use crate::models::aggregation_models::SearchResult;
|
||||
use crate::models::engine_models::{EngineError, SearchEngine};
|
||||
use error_stack::{Report, Result, ResultExt};
|
||||
|
||||
/// A new Searx engine type defined in-order to implement the `SearchEngine` trait which allows to
|
||||
/// reduce code duplication as well as allows to create vector of different search engines easily.
|
||||
pub struct Searx;
|
||||
pub struct Searx {
|
||||
/// The parser, used to interpret the search result.
|
||||
parser: SearchResultParser,
|
||||
}
|
||||
|
||||
impl Searx {
|
||||
/// creates a Searx parser
|
||||
pub fn new() -> Result<Searx, EngineError> {
|
||||
Ok(Self {
|
||||
parser: SearchResultParser::new(
|
||||
"#urls>.dialog-error>p",
|
||||
".result",
|
||||
"h3>a",
|
||||
"h3>a",
|
||||
".content",
|
||||
)?,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl SearchEngine for Searx {
|
||||
@ -52,13 +71,7 @@ impl SearchEngine for Searx {
|
||||
&Searx::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
|
||||
);
|
||||
|
||||
let no_result: Selector = Selector::parse("#urls>.dialog-error>p")
|
||||
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
||||
.attach_printable_lazy(|| {
|
||||
format!("invalid CSS selector: {}", "#urls>.dialog-error>p")
|
||||
})?;
|
||||
|
||||
if let Some(no_result_msg) = document.select(&no_result).nth(1) {
|
||||
if let Some(no_result_msg) = self.parser.parse_for_no_results(&document).nth(1) {
|
||||
if no_result_msg.inner_html()
|
||||
== "we didn't find any results. Please use another query or search in more categories"
|
||||
{
|
||||
@ -66,48 +79,17 @@ impl SearchEngine for Searx {
|
||||
}
|
||||
}
|
||||
|
||||
let results: Selector = Selector::parse(".result")
|
||||
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
||||
.attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result"))?;
|
||||
let result_title: Selector = Selector::parse("h3>a")
|
||||
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
||||
.attach_printable_lazy(|| format!("invalid CSS selector: {}", "h3>a"))?;
|
||||
let result_url: Selector = Selector::parse("h3>a")
|
||||
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
||||
.attach_printable_lazy(|| format!("invalid CSS selector: {}", "h3>a"))?;
|
||||
|
||||
let result_desc: Selector = Selector::parse(".content")
|
||||
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
||||
.attach_printable_lazy(|| format!("invalid CSS selector: {}", ".content"))?;
|
||||
|
||||
// scrape all the results from the html
|
||||
Ok(document
|
||||
.select(&results)
|
||||
.map(|result| {
|
||||
SearchResult::new(
|
||||
result
|
||||
.select(&result_title)
|
||||
.next()
|
||||
.unwrap()
|
||||
.inner_html()
|
||||
.trim(),
|
||||
result
|
||||
.select(&result_url)
|
||||
.next()
|
||||
.unwrap()
|
||||
.value()
|
||||
.attr("href")
|
||||
.unwrap(),
|
||||
result
|
||||
.select(&result_desc)
|
||||
.next()
|
||||
.unwrap()
|
||||
.inner_html()
|
||||
.trim(),
|
||||
&["searx"],
|
||||
)
|
||||
self.parser
|
||||
.parse_for_results(&document, |title, url, desc| {
|
||||
url.value().attr("href").map(|url| {
|
||||
SearchResult::new(
|
||||
title.inner_html().trim(),
|
||||
url,
|
||||
desc.inner_html().trim(),
|
||||
&["searx"],
|
||||
)
|
||||
})
|
||||
})
|
||||
.map(|search_result| (search_result.url.clone(), search_result))
|
||||
.collect())
|
||||
}
|
||||
}
|
||||
|
@ -85,12 +85,14 @@ impl EngineErrorInfo {
|
||||
pub fn new(error: &EngineError, engine: &str) -> Self {
|
||||
Self {
|
||||
error: match error {
|
||||
EngineError::NoSuchEngineFound(_) => "EngineNotFound".to_owned(),
|
||||
EngineError::RequestError => "RequestError".to_owned(),
|
||||
EngineError::EmptyResultSet => "EmptyResultSet".to_owned(),
|
||||
EngineError::UnexpectedError => "UnexpectedError".to_owned(),
|
||||
},
|
||||
engine: engine.to_owned(),
|
||||
severity_color: match error {
|
||||
EngineError::NoSuchEngineFound(_) => "red".to_owned(),
|
||||
EngineError::RequestError => "green".to_owned(),
|
||||
EngineError::EmptyResultSet => "blue".to_owned(),
|
||||
EngineError::UnexpectedError => "red".to_owned(),
|
||||
|
@ -2,12 +2,14 @@
|
||||
//! the upstream search engines with the search query provided by the user.
|
||||
|
||||
use super::aggregation_models::SearchResult;
|
||||
use error_stack::{Result, ResultExt};
|
||||
use error_stack::{Report, Result, ResultExt};
|
||||
use std::{collections::HashMap, fmt, time::Duration};
|
||||
|
||||
/// A custom error type used for handle engine associated errors.
|
||||
#[derive(Debug)]
|
||||
pub enum EngineError {
|
||||
/// No matching engine found
|
||||
NoSuchEngineFound(String),
|
||||
/// This variant is returned when the upstream search engine reports an empty
|
||||
/// result set for the query.
|
||||
EmptyResultSet,
|
||||
@ -24,6 +26,9 @@ pub enum EngineError {
|
||||
impl fmt::Display for EngineError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
EngineError::NoSuchEngineFound(engine) => {
|
||||
write!(f, "No such engine with the name '{engine}' found")
|
||||
}
|
||||
EngineError::EmptyResultSet => {
|
||||
write!(f, "The upstream search engine returned an empty result set")
|
||||
}
|
||||
@ -134,18 +139,25 @@ impl EngineHandler {
|
||||
/// # Returns
|
||||
///
|
||||
/// It returns the engine handler, or an `EngineError` report if the engine is unknown or its parser could not be created.
|
||||
pub fn new(engine_name: &str) -> Option<Self> {
|
||||
pub fn new(engine_name: &str) -> Result<Self, EngineError> {
|
||||
let engine: (&'static str, Box<dyn SearchEngine>) =
|
||||
match engine_name.to_lowercase().as_str() {
|
||||
"duckduckgo" => (
|
||||
"duckduckgo",
|
||||
Box::new(crate::engines::duckduckgo::DuckDuckGo),
|
||||
),
|
||||
"searx" => ("searx", Box::new(crate::engines::searx::Searx)),
|
||||
_ => return None,
|
||||
"duckduckgo" => {
|
||||
let engine = crate::engines::duckduckgo::DuckDuckGo::new()?;
|
||||
("duckduckgo", Box::new(engine))
|
||||
}
|
||||
"searx" => {
|
||||
let engine = crate::engines::searx::Searx::new()?;
|
||||
("searx", Box::new(engine))
|
||||
}
|
||||
_ => {
|
||||
return Err(Report::from(EngineError::NoSuchEngineFound(
|
||||
engine_name.to_string(),
|
||||
)))
|
||||
}
|
||||
};
|
||||
|
||||
Some(Self {
|
||||
Ok(Self {
|
||||
engine: engine.1,
|
||||
name: engine.0,
|
||||
})
|
||||
|
@ -191,7 +191,7 @@ async fn results(
|
||||
let engines: Vec<EngineHandler> = cookie_value
|
||||
.engines
|
||||
.iter()
|
||||
.filter_map(|name| EngineHandler::new(name))
|
||||
.filter_map(|name| EngineHandler::new(name).ok())
|
||||
.collect();
|
||||
|
||||
safe_search_level = match config.safe_search {
|
||||
|
Loading…
Reference in New Issue
Block a user