mirror of
https://github.com/neon-mmd/websurfx.git
synced 2024-11-24 23:18:22 -05:00
Refactor the search result parsing
This commit is contained in:
parent
75a77d25f0
commit
57c73d38c8
@ -18,10 +18,12 @@ use super::search_result_parser::SearchResultParser;
|
|||||||
/// A new DuckDuckGo engine type defined in-order to implement the `SearchEngine` trait which allows to
|
/// A new DuckDuckGo engine type defined in-order to implement the `SearchEngine` trait which allows to
|
||||||
/// reduce code duplication as well as allows to create vector of different search engines easily.
|
/// reduce code duplication as well as allows to create vector of different search engines easily.
|
||||||
pub struct DuckDuckGo {
|
pub struct DuckDuckGo {
|
||||||
|
/// The parser, used to interpret the search result.
|
||||||
parser: SearchResultParser,
|
parser: SearchResultParser,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl DuckDuckGo {
|
impl DuckDuckGo {
|
||||||
|
/// Creates the DuckDuckGo parser.
|
||||||
pub fn new() -> Result<Self, EngineError> {
|
pub fn new() -> Result<Self, EngineError> {
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
parser: SearchResultParser::new(
|
parser: SearchResultParser::new(
|
||||||
@ -77,41 +79,19 @@ impl SearchEngine for DuckDuckGo {
|
|||||||
&DuckDuckGo::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
|
&DuckDuckGo::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
|
||||||
);
|
);
|
||||||
|
|
||||||
if document.select(&self.parser.no_result).next().is_some() {
|
if self.parser.parse_for_no_results(&document).next().is_some() {
|
||||||
return Err(Report::new(EngineError::EmptyResultSet));
|
return Err(Report::new(EngineError::EmptyResultSet));
|
||||||
}
|
}
|
||||||
|
|
||||||
// scrape all the results from the html
|
// scrape all the results from the html
|
||||||
Ok(document
|
self.parser
|
||||||
.select(&self.parser.results)
|
.parse_for_results(&document, |title, url, desc| {
|
||||||
.map(|result| {
|
Some(SearchResult::new(
|
||||||
SearchResult::new(
|
title.inner_html().trim(),
|
||||||
result
|
&format!("https://{}", url.inner_html().trim()),
|
||||||
.select(&self.parser.result_title)
|
desc.inner_html().trim(),
|
||||||
.next()
|
|
||||||
.unwrap()
|
|
||||||
.inner_html()
|
|
||||||
.trim(),
|
|
||||||
format!(
|
|
||||||
"https://{}",
|
|
||||||
result
|
|
||||||
.select(&self.parser.result_url)
|
|
||||||
.next()
|
|
||||||
.unwrap()
|
|
||||||
.inner_html()
|
|
||||||
.trim()
|
|
||||||
)
|
|
||||||
.as_str(),
|
|
||||||
result
|
|
||||||
.select(&self.parser.result_desc)
|
|
||||||
.next()
|
|
||||||
.unwrap()
|
|
||||||
.inner_html()
|
|
||||||
.trim(),
|
|
||||||
&["duckduckgo"],
|
&["duckduckgo"],
|
||||||
)
|
))
|
||||||
})
|
})
|
||||||
.map(|search_result| (search_result.url.clone(), search_result))
|
|
||||||
.collect())
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,16 +1,21 @@
|
|||||||
use crate::models::engine_models::EngineError;
|
//! This module provides helper functionality for parsing an HTML document into internal SearchResult values.
|
||||||
use error_stack::{Report, Result, ResultExt};
|
use std::collections::HashMap;
|
||||||
use scraper::{Html, Selector};
|
|
||||||
|
|
||||||
|
use crate::models::{aggregation_models::SearchResult, engine_models::EngineError};
|
||||||
|
use error_stack::{Report, Result};
|
||||||
|
use scraper::{html::Select, ElementRef, Html, Selector};
|
||||||
|
|
||||||
|
/// An HTML search result parser, based on predefined CSS selectors.
|
||||||
pub struct SearchResultParser {
|
pub struct SearchResultParser {
|
||||||
pub no_result: Selector,
|
no_result: Selector,
|
||||||
pub results: Selector,
|
results: Selector,
|
||||||
pub result_title: Selector,
|
result_title: Selector,
|
||||||
pub result_url: Selector,
|
result_url: Selector,
|
||||||
pub result_desc: Selector,
|
result_desc: Selector,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl SearchResultParser {
|
impl SearchResultParser {
|
||||||
|
/// Creates a new parser if all the selectors are valid; otherwise it returns an EngineError.
|
||||||
pub fn new(
|
pub fn new(
|
||||||
no_result_selector: &str,
|
no_result_selector: &str,
|
||||||
results_selector: &str,
|
results_selector: &str,
|
||||||
@ -26,8 +31,36 @@ impl SearchResultParser {
|
|||||||
result_desc: new_selector(result_desc_selector)?,
|
result_desc: new_selector(result_desc_selector)?,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Parses the HTML and returns an iterator over the elements representing the 'no result found' response.
|
||||||
|
pub fn parse_for_no_results<'a>(&'a self, document: &'a Html) -> Select<'a, 'a> {
|
||||||
|
document.select(&self.no_result)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parses the HTML and converts the results to SearchResult values with the help of the builder function.
|
||||||
|
pub fn parse_for_results(
|
||||||
|
&self,
|
||||||
|
document: &Html,
|
||||||
|
builder: impl Fn(&ElementRef<'_>, &ElementRef<'_>, &ElementRef<'_>) -> Option<SearchResult>,
|
||||||
|
) -> Result<HashMap<String, SearchResult>, EngineError> {
|
||||||
|
let res = document
|
||||||
|
.select(&self.results)
|
||||||
|
.filter_map(|result| {
|
||||||
|
let title = result.select(&self.result_title).next();
|
||||||
|
let url = result.select(&self.result_url).next();
|
||||||
|
let desc = result.select(&self.result_desc).next();
|
||||||
|
match (title, url, desc) {
|
||||||
|
(Some(ref t), Some(ref u), Some(ref d)) => builder(t, u, d),
|
||||||
|
_ => None,
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.map(|search_result| (search_result.url.clone(), search_result))
|
||||||
|
.collect();
|
||||||
|
Ok(res)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Creates a Selector if the given parameter is a valid CSS expression; otherwise converts the parse error into an EngineError.
|
||||||
fn new_selector(selector: &str) -> Result<Selector, EngineError> {
|
fn new_selector(selector: &str) -> Result<Selector, EngineError> {
|
||||||
Selector::parse(selector).map_err(|err| {
|
Selector::parse(selector).map_err(|err| {
|
||||||
Report::new(EngineError::UnexpectedError).attach_printable(format!(
|
Report::new(EngineError::UnexpectedError).attach_printable(format!(
|
||||||
|
@ -14,11 +14,12 @@ use error_stack::{Report, Result, ResultExt};
|
|||||||
/// A new Searx engine type defined in-order to implement the `SearchEngine` trait which allows to
|
/// A new Searx engine type defined in-order to implement the `SearchEngine` trait which allows to
|
||||||
/// reduce code duplication as well as allows to create vector of different search engines easily.
|
/// reduce code duplication as well as allows to create vector of different search engines easily.
|
||||||
pub struct Searx {
|
pub struct Searx {
|
||||||
|
/// The parser, used to interpret the search result.
|
||||||
parser: SearchResultParser,
|
parser: SearchResultParser,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Searx {
|
impl Searx {
|
||||||
// new Searchx engine
|
/// Creates the Searx parser.
|
||||||
pub fn new() -> Result<Searx, EngineError> {
|
pub fn new() -> Result<Searx, EngineError> {
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
parser: SearchResultParser::new(
|
parser: SearchResultParser::new(
|
||||||
@ -70,7 +71,7 @@ impl SearchEngine for Searx {
|
|||||||
&Searx::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
|
&Searx::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
|
||||||
);
|
);
|
||||||
|
|
||||||
if let Some(no_result_msg) = document.select(&self.parser.no_result).nth(1) {
|
if let Some(no_result_msg) = self.parser.parse_for_no_results(&document).nth(1) {
|
||||||
if no_result_msg.inner_html()
|
if no_result_msg.inner_html()
|
||||||
== "we didn't find any results. Please use another query or search in more categories"
|
== "we didn't find any results. Please use another query or search in more categories"
|
||||||
{
|
{
|
||||||
@ -79,33 +80,16 @@ impl SearchEngine for Searx {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// scrape all the results from the html
|
// scrape all the results from the html
|
||||||
Ok(document
|
self.parser
|
||||||
.select(&self.parser.results)
|
.parse_for_results(&document, |title, url, desc| {
|
||||||
.map(|result| {
|
url.value().attr("href").map(|url| {
|
||||||
SearchResult::new(
|
SearchResult::new(
|
||||||
result
|
title.inner_html().trim(),
|
||||||
.select(&self.parser.result_title)
|
url,
|
||||||
.next()
|
desc.inner_html().trim(),
|
||||||
.unwrap()
|
&["searx"],
|
||||||
.inner_html()
|
)
|
||||||
.trim(),
|
})
|
||||||
result
|
|
||||||
.select(&self.parser.result_url)
|
|
||||||
.next()
|
|
||||||
.unwrap()
|
|
||||||
.value()
|
|
||||||
.attr("href")
|
|
||||||
.unwrap(),
|
|
||||||
result
|
|
||||||
.select(&self.parser.result_desc)
|
|
||||||
.next()
|
|
||||||
.unwrap()
|
|
||||||
.inner_html()
|
|
||||||
.trim(),
|
|
||||||
&["searx"],
|
|
||||||
)
|
|
||||||
})
|
})
|
||||||
.map(|search_result| (search_result.url.clone(), search_result))
|
|
||||||
.collect())
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -8,7 +8,7 @@ use std::{collections::HashMap, fmt, time::Duration};
|
|||||||
/// A custom error type used for handle engine associated errors.
|
/// A custom error type used for handle engine associated errors.
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub enum EngineError {
|
pub enum EngineError {
|
||||||
// No matching engine found
|
/// No matching engine found
|
||||||
EngineNotFound,
|
EngineNotFound,
|
||||||
/// This variant handles all request related errors like forbidden, not found,
|
/// This variant handles all request related errors like forbidden, not found,
|
||||||
/// etc.
|
/// etc.
|
||||||
|
Loading…
Reference in New Issue
Block a user