mirror of
https://github.com/neon-mmd/websurfx.git
synced 2024-11-22 05:58:21 -05:00
Create separate search_result_parser
This commit is contained in:
parent
769d870803
commit
75a77d25f0
@ -111,7 +111,7 @@ impl Config {
|
|||||||
.get::<_, HashMap<String, bool>>("upstream_search_engines")?
|
.get::<_, HashMap<String, bool>>("upstream_search_engines")?
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.filter_map(|(key, value)| value.then_some(key))
|
.filter_map(|(key, value)| value.then_some(key))
|
||||||
.filter_map(|engine| crate::models::engine_models::EngineHandler::new(&engine))
|
.filter_map(|engine| crate::models::engine_models::EngineHandler::new(&engine).ok())
|
||||||
.collect(),
|
.collect(),
|
||||||
request_timeout: globals.get::<_, u8>("request_timeout")?,
|
request_timeout: globals.get::<_, u8>("request_timeout")?,
|
||||||
threads,
|
threads,
|
||||||
|
@ -5,7 +5,7 @@
|
|||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
|
|
||||||
use reqwest::header::HeaderMap;
|
use reqwest::header::HeaderMap;
|
||||||
use scraper::{Html, Selector};
|
use scraper::Html;
|
||||||
|
|
||||||
use crate::models::aggregation_models::SearchResult;
|
use crate::models::aggregation_models::SearchResult;
|
||||||
|
|
||||||
@ -13,9 +13,27 @@ use crate::models::engine_models::{EngineError, SearchEngine};
|
|||||||
|
|
||||||
use error_stack::{Report, Result, ResultExt};
|
use error_stack::{Report, Result, ResultExt};
|
||||||
|
|
||||||
|
use super::search_result_parser::SearchResultParser;
|
||||||
|
|
||||||
/// A new DuckDuckGo engine type defined in-order to implement the `SearchEngine` trait which allows to
|
/// A new DuckDuckGo engine type defined in-order to implement the `SearchEngine` trait which allows to
|
||||||
/// reduce code duplication as well as allows to create vector of different search engines easily.
|
/// reduce code duplication as well as allows to create vector of different search engines easily.
|
||||||
pub struct DuckDuckGo;
|
pub struct DuckDuckGo {
|
||||||
|
parser: SearchResultParser,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DuckDuckGo {
|
||||||
|
/// Creates a new DuckDuckGo engine whose CSS selectors are parsed at construction time.
///
/// # Errors
///
/// Propagates the `EngineError` report returned by `SearchResultParser::new` when any of
/// the selectors below fails to parse.
pub fn new() -> Result<Self, EngineError> {
|
||||||
|
Ok(Self {
|
||||||
|
parser: SearchResultParser::new(
|
||||||
|
// element whose presence signals that the upstream returned no results
".no-results",
|
||||||
|
// container of a single search result
".result",
|
||||||
|
// title element inside a result
".result__a",
|
||||||
|
// url element inside a result
".result__url",
|
||||||
|
// description element inside a result
".result__snippet",
|
||||||
|
)?,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
#[async_trait::async_trait]
|
||||||
impl SearchEngine for DuckDuckGo {
|
impl SearchEngine for DuckDuckGo {
|
||||||
@ -59,34 +77,17 @@ impl SearchEngine for DuckDuckGo {
|
|||||||
&DuckDuckGo::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
|
&DuckDuckGo::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
|
||||||
);
|
);
|
||||||
|
|
||||||
let no_result: Selector = Selector::parse(".no-results")
|
if document.select(&self.parser.no_result).next().is_some() {
|
||||||
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
|
||||||
.attach_printable_lazy(|| format!("invalid CSS selector: {}", ".no-results"))?;
|
|
||||||
|
|
||||||
if document.select(&no_result).next().is_some() {
|
|
||||||
return Err(Report::new(EngineError::EmptyResultSet));
|
return Err(Report::new(EngineError::EmptyResultSet));
|
||||||
}
|
}
|
||||||
|
|
||||||
let results: Selector = Selector::parse(".result")
|
|
||||||
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
|
||||||
.attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result"))?;
|
|
||||||
let result_title: Selector = Selector::parse(".result__a")
|
|
||||||
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
|
||||||
.attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__a"))?;
|
|
||||||
let result_url: Selector = Selector::parse(".result__url")
|
|
||||||
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
|
||||||
.attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__url"))?;
|
|
||||||
let result_desc: Selector = Selector::parse(".result__snippet")
|
|
||||||
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
|
||||||
.attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__snippet"))?;
|
|
||||||
|
|
||||||
// scrape all the results from the html
|
// scrape all the results from the html
|
||||||
Ok(document
|
Ok(document
|
||||||
.select(&results)
|
.select(&self.parser.results)
|
||||||
.map(|result| {
|
.map(|result| {
|
||||||
SearchResult::new(
|
SearchResult::new(
|
||||||
result
|
result
|
||||||
.select(&result_title)
|
.select(&self.parser.result_title)
|
||||||
.next()
|
.next()
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.inner_html()
|
.inner_html()
|
||||||
@ -94,7 +95,7 @@ impl SearchEngine for DuckDuckGo {
|
|||||||
format!(
|
format!(
|
||||||
"https://{}",
|
"https://{}",
|
||||||
result
|
result
|
||||||
.select(&result_url)
|
.select(&self.parser.result_url)
|
||||||
.next()
|
.next()
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.inner_html()
|
.inner_html()
|
||||||
@ -102,7 +103,7 @@ impl SearchEngine for DuckDuckGo {
|
|||||||
)
|
)
|
||||||
.as_str(),
|
.as_str(),
|
||||||
result
|
result
|
||||||
.select(&result_desc)
|
.select(&self.parser.result_desc)
|
||||||
.next()
|
.next()
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.inner_html()
|
.inner_html()
|
||||||
|
@ -4,4 +4,5 @@
|
|||||||
//! code. Moreover, it also provides a custom error for the upstream search engine handling code.
|
//! code. Moreover, it also provides a custom error for the upstream search engine handling code.
|
||||||
|
|
||||||
pub mod duckduckgo;
|
pub mod duckduckgo;
|
||||||
|
pub mod search_result_parser;
|
||||||
pub mod searx;
|
pub mod searx;
|
||||||
|
38
src/engines/search_result_parser.rs
Normal file
38
src/engines/search_result_parser.rs
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
use crate::models::engine_models::EngineError;
|
||||||
|
use error_stack::{Report, Result, ResultExt};
|
||||||
|
use scraper::{Html, Selector};
|
||||||
|
|
||||||
|
/// A bundle of pre-parsed CSS selectors that an engine uses to scrape an upstream results page.
pub struct SearchResultParser {
|
||||||
|
/// Selector the engines use to detect that the upstream returned no results.
pub no_result: Selector,
|
||||||
|
/// Selector matching each individual search result in the document.
pub results: Selector,
|
||||||
|
/// Selector for the title element, applied within a single result.
pub result_title: Selector,
|
||||||
|
/// Selector for the url element, applied within a single result.
pub result_url: Selector,
|
||||||
|
/// Selector for the description element, applied within a single result.
pub result_desc: Selector,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SearchResultParser {
|
||||||
|
/// Builds a parser by compiling each of the given CSS selector strings.
///
/// # Arguments
///
/// * `no_result_selector` - selector stored in the `no_result` field.
/// * `results_selector` - selector stored in the `results` field.
/// * `result_title_selector` - selector stored in the `result_title` field.
/// * `result_url_selector` - selector stored in the `result_url` field.
/// * `result_desc_selector` - selector stored in the `result_desc` field.
///
/// # Errors
///
/// Returns the error report produced by `new_selector` for the first selector string
/// that fails to parse.
pub fn new(
|
||||||
|
no_result_selector: &str,
|
||||||
|
results_selector: &str,
|
||||||
|
result_title_selector: &str,
|
||||||
|
result_url_selector: &str,
|
||||||
|
result_desc_selector: &str,
|
||||||
|
) -> Result<SearchResultParser, EngineError> {
|
||||||
|
Ok(SearchResultParser {
|
||||||
|
no_result: new_selector(no_result_selector)?,
|
||||||
|
results: new_selector(results_selector)?,
|
||||||
|
result_title: new_selector(result_title_selector)?,
|
||||||
|
result_url: new_selector(result_url_selector)?,
|
||||||
|
result_desc: new_selector(result_desc_selector)?,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parses a single CSS selector string into a `Selector`.
///
/// # Errors
///
/// On a parse failure, returns a report of `EngineError::UnexpectedError` with a printable
/// attachment that contains both the offending selector string and the debug-formatted
/// parse error.
fn new_selector(selector: &str) -> Result<Selector, EngineError> {
|
||||||
|
Selector::parse(selector).map_err(|err| {
|
||||||
|
Report::new(EngineError::UnexpectedError).attach_printable(format!(
|
||||||
|
"invalid CSS selector: {}, err: {:?}",
|
||||||
|
selector, err
|
||||||
|
))
|
||||||
|
})
|
||||||
|
}
|
@ -3,16 +3,34 @@
|
|||||||
//! number if provided.
|
//! number if provided.
|
||||||
|
|
||||||
use reqwest::header::HeaderMap;
|
use reqwest::header::HeaderMap;
|
||||||
use scraper::{Html, Selector};
|
use scraper::Html;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
|
|
||||||
|
use super::search_result_parser::SearchResultParser;
|
||||||
use crate::models::aggregation_models::SearchResult;
|
use crate::models::aggregation_models::SearchResult;
|
||||||
use crate::models::engine_models::{EngineError, SearchEngine};
|
use crate::models::engine_models::{EngineError, SearchEngine};
|
||||||
use error_stack::{Report, Result, ResultExt};
|
use error_stack::{Report, Result, ResultExt};
|
||||||
|
|
||||||
/// A new Searx engine type defined in-order to implement the `SearchEngine` trait which allows to
|
/// A new Searx engine type defined in-order to implement the `SearchEngine` trait which allows to
|
||||||
/// reduce code duplication as well as allows to create vector of different search engines easily.
|
/// reduce code duplication as well as allows to create vector of different search engines easily.
|
||||||
pub struct Searx;
|
pub struct Searx {
|
||||||
|
parser: SearchResultParser,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Searx {
|
||||||
|
/// Creates a new Searx engine whose CSS selectors are parsed at construction time.
///
/// # Errors
///
/// Propagates the `EngineError` report returned by `SearchResultParser::new` when any of
/// the selectors below fails to parse.
|
||||||
|
pub fn new() -> Result<Searx, EngineError> {
|
||||||
|
Ok(Self {
|
||||||
|
parser: SearchResultParser::new(
|
||||||
|
// element inspected to detect the "no results" message
"#urls>.dialog-error>p",
|
||||||
|
// container of a single search result
".result",
|
||||||
|
// title selector: the anchor inside the result heading
"h3>a",
|
||||||
|
// url selector: same anchor as the title (the url is read from its `href`)
"h3>a",
|
||||||
|
// description element inside a result
".content",
|
||||||
|
)?,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
#[async_trait::async_trait]
|
||||||
impl SearchEngine for Searx {
|
impl SearchEngine for Searx {
|
||||||
@ -52,13 +70,7 @@ impl SearchEngine for Searx {
|
|||||||
&Searx::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
|
&Searx::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
|
||||||
);
|
);
|
||||||
|
|
||||||
let no_result: Selector = Selector::parse("#urls>.dialog-error>p")
|
if let Some(no_result_msg) = document.select(&self.parser.no_result).nth(1) {
|
||||||
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
|
||||||
.attach_printable_lazy(|| {
|
|
||||||
format!("invalid CSS selector: {}", "#urls>.dialog-error>p")
|
|
||||||
})?;
|
|
||||||
|
|
||||||
if let Some(no_result_msg) = document.select(&no_result).nth(1) {
|
|
||||||
if no_result_msg.inner_html()
|
if no_result_msg.inner_html()
|
||||||
== "we didn't find any results. Please use another query or search in more categories"
|
== "we didn't find any results. Please use another query or search in more categories"
|
||||||
{
|
{
|
||||||
@ -66,40 +78,26 @@ impl SearchEngine for Searx {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let results: Selector = Selector::parse(".result")
|
|
||||||
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
|
||||||
.attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result"))?;
|
|
||||||
let result_title: Selector = Selector::parse("h3>a")
|
|
||||||
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
|
||||||
.attach_printable_lazy(|| format!("invalid CSS selector: {}", "h3>a"))?;
|
|
||||||
let result_url: Selector = Selector::parse("h3>a")
|
|
||||||
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
|
||||||
.attach_printable_lazy(|| format!("invalid CSS selector: {}", "h3>a"))?;
|
|
||||||
|
|
||||||
let result_desc: Selector = Selector::parse(".content")
|
|
||||||
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
|
||||||
.attach_printable_lazy(|| format!("invalid CSS selector: {}", ".content"))?;
|
|
||||||
|
|
||||||
// scrape all the results from the html
|
// scrape all the results from the html
|
||||||
Ok(document
|
Ok(document
|
||||||
.select(&results)
|
.select(&self.parser.results)
|
||||||
.map(|result| {
|
.map(|result| {
|
||||||
SearchResult::new(
|
SearchResult::new(
|
||||||
result
|
result
|
||||||
.select(&result_title)
|
.select(&self.parser.result_title)
|
||||||
.next()
|
.next()
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.inner_html()
|
.inner_html()
|
||||||
.trim(),
|
.trim(),
|
||||||
result
|
result
|
||||||
.select(&result_url)
|
.select(&self.parser.result_url)
|
||||||
.next()
|
.next()
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.value()
|
.value()
|
||||||
.attr("href")
|
.attr("href")
|
||||||
.unwrap(),
|
.unwrap(),
|
||||||
result
|
result
|
||||||
.select(&result_desc)
|
.select(&self.parser.result_desc)
|
||||||
.next()
|
.next()
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.inner_html()
|
.inner_html()
|
||||||
|
@ -85,12 +85,14 @@ impl EngineErrorInfo {
|
|||||||
pub fn new(error: &EngineError, engine: &str) -> Self {
|
pub fn new(error: &EngineError, engine: &str) -> Self {
|
||||||
Self {
|
Self {
|
||||||
error: match error {
|
error: match error {
|
||||||
|
EngineError::EngineNotFound => "EngineNotFound".to_owned(),
|
||||||
EngineError::RequestError => "RequestError".to_owned(),
|
EngineError::RequestError => "RequestError".to_owned(),
|
||||||
EngineError::EmptyResultSet => "EmptyResultSet".to_owned(),
|
EngineError::EmptyResultSet => "EmptyResultSet".to_owned(),
|
||||||
EngineError::UnexpectedError => "UnexpectedError".to_owned(),
|
EngineError::UnexpectedError => "UnexpectedError".to_owned(),
|
||||||
},
|
},
|
||||||
engine: engine.to_owned(),
|
engine: engine.to_owned(),
|
||||||
severity_color: match error {
|
severity_color: match error {
|
||||||
|
EngineError::EngineNotFound => "red".to_owned(),
|
||||||
EngineError::RequestError => "green".to_owned(),
|
EngineError::RequestError => "green".to_owned(),
|
||||||
EngineError::EmptyResultSet => "blue".to_owned(),
|
EngineError::EmptyResultSet => "blue".to_owned(),
|
||||||
EngineError::UnexpectedError => "red".to_owned(),
|
EngineError::UnexpectedError => "red".to_owned(),
|
||||||
|
@ -2,12 +2,14 @@
|
|||||||
//! the upstream search engines with the search query provided by the user.
|
//! the upstream search engines with the search query provided by the user.
|
||||||
|
|
||||||
use super::aggregation_models::SearchResult;
|
use super::aggregation_models::SearchResult;
|
||||||
use error_stack::{Result, ResultExt};
|
use error_stack::{Report, Result, ResultExt};
|
||||||
use std::{collections::HashMap, fmt, time::Duration};
|
use std::{collections::HashMap, fmt, time::Duration};
|
||||||
|
|
||||||
/// A custom error type used to handle engine-associated errors.
|
/// A custom error type used to handle engine-associated errors.
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub enum EngineError {
|
pub enum EngineError {
|
||||||
|
/// No matching engine was found for the requested name.
|
||||||
|
EngineNotFound,
|
||||||
/// This variant handles all request related errors like forbidden, not found,
|
/// This variant handles all request related errors like forbidden, not found,
|
||||||
/// etc.
|
/// etc.
|
||||||
EmptyResultSet,
|
EmptyResultSet,
|
||||||
@ -24,6 +26,9 @@ pub enum EngineError {
|
|||||||
impl fmt::Display for EngineError {
|
impl fmt::Display for EngineError {
|
||||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||||
match self {
|
match self {
|
||||||
|
EngineError::EngineNotFound => {
|
||||||
|
write!(f, "Search engine not found")
|
||||||
|
}
|
||||||
EngineError::EmptyResultSet => {
|
EngineError::EmptyResultSet => {
|
||||||
write!(f, "The upstream search engine returned an empty result set")
|
write!(f, "The upstream search engine returned an empty result set")
|
||||||
}
|
}
|
||||||
@ -134,18 +139,21 @@ impl EngineHandler {
|
|||||||
/// # Returns
|
/// # Returns
|
||||||
///
|
///
|
||||||
/// It returns an option either containing the value or a none if the engine is unknown
|
/// It returns the engine handler on success, or an `EngineError` report if the engine is unknown (`EngineNotFound`) or its construction fails
|
||||||
pub fn new(engine_name: &str) -> Option<Self> {
|
pub fn new(engine_name: &str) -> Result<Self, EngineError> {
|
||||||
let engine: (&'static str, Box<dyn SearchEngine>) =
|
let engine: (&'static str, Box<dyn SearchEngine>) =
|
||||||
match engine_name.to_lowercase().as_str() {
|
match engine_name.to_lowercase().as_str() {
|
||||||
"duckduckgo" => (
|
"duckduckgo" => {
|
||||||
"duckduckgo",
|
let engine = crate::engines::duckduckgo::DuckDuckGo::new()?;
|
||||||
Box::new(crate::engines::duckduckgo::DuckDuckGo),
|
("duckduckgo", Box::new(engine))
|
||||||
),
|
}
|
||||||
"searx" => ("searx", Box::new(crate::engines::searx::Searx)),
|
"searx" => {
|
||||||
_ => return None,
|
let engine = crate::engines::searx::Searx::new()?;
|
||||||
|
("searx", Box::new(engine))
|
||||||
|
}
|
||||||
|
_ => return Err(Report::from(EngineError::EngineNotFound)),
|
||||||
};
|
};
|
||||||
|
|
||||||
Some(Self {
|
Ok(Self {
|
||||||
engine: engine.1,
|
engine: engine.1,
|
||||||
name: engine.0,
|
name: engine.0,
|
||||||
})
|
})
|
||||||
|
@ -191,7 +191,7 @@ async fn results(
|
|||||||
let engines: Vec<EngineHandler> = cookie_value
|
let engines: Vec<EngineHandler> = cookie_value
|
||||||
.engines
|
.engines
|
||||||
.iter()
|
.iter()
|
||||||
.filter_map(|name| EngineHandler::new(name))
|
.filter_map(|name| EngineHandler::new(name).ok())
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
safe_search_level = match config.safe_search {
|
safe_search_level = match config.safe_search {
|
||||||
|
Loading…
Reference in New Issue
Block a user