0
0
mirror of https://github.com/neon-mmd/websurfx.git synced 2024-11-24 23:18:22 -05:00

Refactor the search result parsing

This commit is contained in:
Zsombor Gegesy 2023-09-24 15:09:03 +02:00
parent 75a77d25f0
commit 57c73d38c8
4 changed files with 65 additions and 68 deletions

View File

@ -18,10 +18,12 @@ use super::search_result_parser::SearchResultParser;
/// A new DuckDuckGo engine type defined in-order to implement the `SearchEngine` trait which allows to /// A new DuckDuckGo engine type defined in-order to implement the `SearchEngine` trait which allows to
/// reduce code duplication as well as allows to create vector of different search engines easily. /// reduce code duplication as well as allows to create vector of different search engines easily.
pub struct DuckDuckGo { pub struct DuckDuckGo {
// The parser, used to interpret the search result.
parser: SearchResultParser, parser: SearchResultParser,
} }
impl DuckDuckGo { impl DuckDuckGo {
/// Creates the DuckDuckGo parser.
pub fn new() -> Result<Self, EngineError> { pub fn new() -> Result<Self, EngineError> {
Ok(Self { Ok(Self {
parser: SearchResultParser::new( parser: SearchResultParser::new(
@ -77,41 +79,19 @@ impl SearchEngine for DuckDuckGo {
&DuckDuckGo::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?, &DuckDuckGo::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
); );
if document.select(&self.parser.no_result).next().is_some() { if self.parser.parse_for_no_results(&document).next().is_some() {
return Err(Report::new(EngineError::EmptyResultSet)); return Err(Report::new(EngineError::EmptyResultSet));
} }
// scrape all the results from the html // scrape all the results from the html
Ok(document self.parser
.select(&self.parser.results) .parse_for_results(&document, |title, url, desc| {
.map(|result| { Some(SearchResult::new(
SearchResult::new( title.inner_html().trim(),
result &format!("https://{}", url.inner_html().trim()),
.select(&self.parser.result_title) desc.inner_html().trim(),
.next()
.unwrap()
.inner_html()
.trim(),
format!(
"https://{}",
result
.select(&self.parser.result_url)
.next()
.unwrap()
.inner_html()
.trim()
)
.as_str(),
result
.select(&self.parser.result_desc)
.next()
.unwrap()
.inner_html()
.trim(),
&["duckduckgo"], &["duckduckgo"],
) ))
}) })
.map(|search_result| (search_result.url.clone(), search_result))
.collect())
} }
} }

View File

@ -1,16 +1,21 @@
use crate::models::engine_models::EngineError; //! This module provides helper functionality for parsing an HTML document into the internal SearchResult.
use error_stack::{Report, Result, ResultExt}; use std::collections::HashMap;
use scraper::{Html, Selector};
use crate::models::{aggregation_models::SearchResult, engine_models::EngineError};
use error_stack::{Report, Result};
use scraper::{html::Select, ElementRef, Html, Selector};
/// An HTML search result parser, based on predefined CSS selectors.
pub struct SearchResultParser { pub struct SearchResultParser {
pub no_result: Selector, no_result: Selector,
pub results: Selector, results: Selector,
pub result_title: Selector, result_title: Selector,
pub result_url: Selector, result_url: Selector,
pub result_desc: Selector, result_desc: Selector,
} }
impl SearchResultParser { impl SearchResultParser {
/// Creates a new parser, if all the selectors are valid, otherwise it returns an EngineError
pub fn new( pub fn new(
no_result_selector: &str, no_result_selector: &str,
results_selector: &str, results_selector: &str,
@ -26,8 +31,36 @@ impl SearchResultParser {
result_desc: new_selector(result_desc_selector)?, result_desc: new_selector(result_desc_selector)?,
}) })
} }
/// Applies the `no_result` selector to `document`.
///
/// Returns the iterator over every element matched by that selector; it is up
/// to the caller to decide which match (if any) signals an empty result set.
pub fn parse_for_no_results<'a>(&'a self, document: &'a Html) -> Select<'a, 'a> {
    let no_result_selector = &self.no_result;
    document.select(no_result_selector)
}
/// Converts every result container found in `document` into a `SearchResult`.
///
/// For each element matched by the `results` selector, the first element matching
/// the title, url and description selectors is looked up inside it and the three
/// are passed to `builder`. A container is skipped when any of the three elements
/// is missing or when `builder` returns `None`.
///
/// The returned map is keyed by the `url` field of each built `SearchResult`, so a
/// later result with the same url replaces an earlier one.
pub fn parse_for_results(
    &self,
    document: &Html,
    builder: impl Fn(&ElementRef<'_>, &ElementRef<'_>, &ElementRef<'_>) -> Option<SearchResult>,
) -> Result<HashMap<String, SearchResult>, EngineError> {
    let mut parsed = HashMap::new();

    for container in document.select(&self.results) {
        let title_element = container.select(&self.result_title).next();
        let url_element = container.select(&self.result_url).next();
        let desc_element = container.select(&self.result_desc).next();

        // Only containers providing all three parts are handed to the builder.
        if let (Some(title), Some(url), Some(desc)) = (title_element, url_element, desc_element) {
            if let Some(search_result) = builder(&title, &url, &desc) {
                parsed.insert(search_result.url.clone(), search_result);
            }
        }
    }

    Ok(parsed)
}
} }
/// Create a Selector struct, if the given parameter is a valid css expression, otherwise convert it into an EngineError.
fn new_selector(selector: &str) -> Result<Selector, EngineError> { fn new_selector(selector: &str) -> Result<Selector, EngineError> {
Selector::parse(selector).map_err(|err| { Selector::parse(selector).map_err(|err| {
Report::new(EngineError::UnexpectedError).attach_printable(format!( Report::new(EngineError::UnexpectedError).attach_printable(format!(

View File

@ -14,11 +14,12 @@ use error_stack::{Report, Result, ResultExt};
/// A new Searx engine type defined in-order to implement the `SearchEngine` trait which allows to /// A new Searx engine type defined in-order to implement the `SearchEngine` trait which allows to
/// reduce code duplication as well as allows to create vector of different search engines easily. /// reduce code duplication as well as allows to create vector of different search engines easily.
pub struct Searx { pub struct Searx {
// The parser, used to interpret the search result.
parser: SearchResultParser, parser: SearchResultParser,
} }
impl Searx { impl Searx {
// new Searchx engine /// creates a Searx parser
pub fn new() -> Result<Searx, EngineError> { pub fn new() -> Result<Searx, EngineError> {
Ok(Self { Ok(Self {
parser: SearchResultParser::new( parser: SearchResultParser::new(
@ -70,7 +71,7 @@ impl SearchEngine for Searx {
&Searx::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?, &Searx::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
); );
if let Some(no_result_msg) = document.select(&self.parser.no_result).nth(1) { if let Some(no_result_msg) = self.parser.parse_for_no_results(&document).nth(1) {
if no_result_msg.inner_html() if no_result_msg.inner_html()
== "we didn't find any results. Please use another query or search in more categories" == "we didn't find any results. Please use another query or search in more categories"
{ {
@ -79,33 +80,16 @@ impl SearchEngine for Searx {
} }
// scrape all the results from the html // scrape all the results from the html
Ok(document self.parser
.select(&self.parser.results) .parse_for_results(&document, |title, url, desc| {
.map(|result| { url.value().attr("href").map(|url| {
SearchResult::new( SearchResult::new(
result title.inner_html().trim(),
.select(&self.parser.result_title) url,
.next() desc.inner_html().trim(),
.unwrap()
.inner_html()
.trim(),
result
.select(&self.parser.result_url)
.next()
.unwrap()
.value()
.attr("href")
.unwrap(),
result
.select(&self.parser.result_desc)
.next()
.unwrap()
.inner_html()
.trim(),
&["searx"], &["searx"],
) )
}) })
.map(|search_result| (search_result.url.clone(), search_result)) })
.collect())
} }
} }

View File

@ -8,7 +8,7 @@ use std::{collections::HashMap, fmt, time::Duration};
/// A custom error type used to handle engine-associated errors. /// A custom error type used to handle engine-associated errors.
#[derive(Debug)] #[derive(Debug)]
pub enum EngineError { pub enum EngineError {
// No matching engine found /// No matching engine found
EngineNotFound, EngineNotFound,
/// This variant handles all request related errors like forbidden, not found, /// This variant handles all request related errors like forbidden, not found,
/// etc. /// etc.