From 57c73d38c80024bcf9daf19ed9b7e2af6b1190fd Mon Sep 17 00:00:00 2001
From: Zsombor Gegesy
Date: Sun, 24 Sep 2023 15:09:03 +0200
Subject: [PATCH] Refactor the search result parsing

---
 src/engines/duckduckgo.rs           | 40 ++++++-----------------
 src/engines/search_result_parser.rs | 49 ++++++++++++++++++++++++-----
 src/engines/searx.rs                | 42 ++++++++-----------------
 src/models/engine_models.rs         |  2 +-
 4 files changed, 65 insertions(+), 68 deletions(-)

diff --git a/src/engines/duckduckgo.rs b/src/engines/duckduckgo.rs
index 318e764..8a28dda 100644
--- a/src/engines/duckduckgo.rs
+++ b/src/engines/duckduckgo.rs
@@ -18,10 +18,12 @@ use super::search_result_parser::SearchResultParser;
 /// A new DuckDuckGo engine type defined in-order to implement the `SearchEngine` trait which allows to
 /// reduce code duplication as well as allows to create vector of different search engines easily.
 pub struct DuckDuckGo {
+    /// The parser, used to interpret the search results.
     parser: SearchResultParser,
 }
 
 impl DuckDuckGo {
+    /// Creates the DuckDuckGo parser.
     pub fn new() -> Result<Self, EngineError> {
         Ok(Self {
             parser: SearchResultParser::new(
@@ -77,41 +79,19 @@ impl SearchEngine for DuckDuckGo {
             &DuckDuckGo::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
         );
 
-        if document.select(&self.parser.no_result).next().is_some() {
+        if self.parser.parse_for_no_results(&document).next().is_some() {
             return Err(Report::new(EngineError::EmptyResultSet));
         }
 
         // scrape all the results from the html
-        Ok(document
-            .select(&self.parser.results)
-            .map(|result| {
-                SearchResult::new(
-                    result
-                        .select(&self.parser.result_title)
-                        .next()
-                        .unwrap()
-                        .inner_html()
-                        .trim(),
-                    format!(
-                        "https://{}",
-                        result
-                            .select(&self.parser.result_url)
-                            .next()
-                            .unwrap()
-                            .inner_html()
-                            .trim()
-                    )
-                    .as_str(),
-                    result
-                        .select(&self.parser.result_desc)
-                        .next()
-                        .unwrap()
-                        .inner_html()
-                        .trim(),
+        self.parser
+            .parse_for_results(&document, |title, url, desc| {
+                Some(SearchResult::new(
+                    title.inner_html().trim(),
+                    &format!("https://{}", url.inner_html().trim()),
+                    desc.inner_html().trim(),
                     &["duckduckgo"],
-                )
+                ))
             })
-            .map(|search_result| (search_result.url.clone(), search_result))
-            .collect())
     }
 }
diff --git a/src/engines/search_result_parser.rs b/src/engines/search_result_parser.rs
index 6918917..94fe0e8 100644
--- a/src/engines/search_result_parser.rs
+++ b/src/engines/search_result_parser.rs
@@ -1,16 +1,21 @@
-use crate::models::engine_models::EngineError;
-use error_stack::{Report, Result, ResultExt};
-use scraper::{Html, Selector};
+//! This module provides helper functionality for parsing an HTML document into the internal `SearchResult`.
+use std::collections::HashMap;
 
+use crate::models::{aggregation_models::SearchResult, engine_models::EngineError};
+use error_stack::{Report, Result};
+use scraper::{html::Select, ElementRef, Html, Selector};
+
+/// An HTML search result parser, based on predefined CSS selectors.
 pub struct SearchResultParser {
-    pub no_result: Selector,
-    pub results: Selector,
-    pub result_title: Selector,
-    pub result_url: Selector,
-    pub result_desc: Selector,
+    no_result: Selector,
+    results: Selector,
+    result_title: Selector,
+    result_url: Selector,
+    result_desc: Selector,
 }
 
 impl SearchResultParser {
+    /// Creates a new parser if all the selectors are valid; otherwise returns an `EngineError`.
     pub fn new(
         no_result_selector: &str,
         results_selector: &str,
@@ -26,8 +31,36 @@
             result_desc: new_selector(result_desc_selector)?,
         })
     }
+
+    /// Parses the HTML and returns the elements representing the 'no results found' response.
+    pub fn parse_for_no_results<'a>(&'a self, document: &'a Html) -> Select<'a, 'a> {
+        document.select(&self.no_result)
+    }
+
+    /// Parses the HTML and converts the results to `SearchResult`s with the help of the builder function.
+    pub fn parse_for_results(
+        &self,
+        document: &Html,
+        builder: impl Fn(&ElementRef<'_>, &ElementRef<'_>, &ElementRef<'_>) -> Option<SearchResult>,
+    ) -> Result<HashMap<String, SearchResult>, EngineError> {
+        let res = document
+            .select(&self.results)
+            .filter_map(|result| {
+                let title = result.select(&self.result_title).next();
+                let url = result.select(&self.result_url).next();
+                let desc = result.select(&self.result_desc).next();
+                match (title, url, desc) {
+                    (Some(ref t), Some(ref u), Some(ref d)) => builder(t, u, d),
+                    _ => None,
+                }
+            })
+            .map(|search_result| (search_result.url.clone(), search_result))
+            .collect();
+        Ok(res)
+    }
 }
 
+/// Creates a `Selector` if the given parameter is a valid CSS expression; otherwise converts the failure into an `EngineError`.
 fn new_selector(selector: &str) -> Result<Selector, EngineError> {
     Selector::parse(selector).map_err(|err| {
         Report::new(EngineError::UnexpectedError).attach_printable(format!(
diff --git a/src/engines/searx.rs b/src/engines/searx.rs
index 32f286e..ca08b98 100644
--- a/src/engines/searx.rs
+++ b/src/engines/searx.rs
@@ -14,11 +14,12 @@ use error_stack::{Report, Result, ResultExt};
 /// A new Searx engine type defined in-order to implement the `SearchEngine` trait which allows to
 /// reduce code duplication as well as allows to create vector of different search engines easily.
 pub struct Searx {
+    /// The parser, used to interpret the search results.
     parser: SearchResultParser,
 }
 
 impl Searx {
-    // new Searchx engine
+    /// Creates a Searx parser.
     pub fn new() -> Result<Self, EngineError> {
         Ok(Self {
             parser: SearchResultParser::new(
@@ -70,7 +71,7 @@ impl SearchEngine for Searx {
             &Searx::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
         );
 
-        if let Some(no_result_msg) = document.select(&self.parser.no_result).nth(1) {
+        if let Some(no_result_msg) = self.parser.parse_for_no_results(&document).nth(1) {
             if no_result_msg.inner_html()
                 == "we didn't find any results. Please use another query or search in more categories"
             {
@@ -79,33 +80,16 @@ impl SearchEngine for Searx {
                 return Err(Report::new(EngineError::EmptyResultSet));
             }
         }
 
         // scrape all the results from the html
-        Ok(document
-            .select(&self.parser.results)
-            .map(|result| {
-                SearchResult::new(
-                    result
-                        .select(&self.parser.result_title)
-                        .next()
-                        .unwrap()
-                        .inner_html()
-                        .trim(),
-                    result
-                        .select(&self.parser.result_url)
-                        .next()
-                        .unwrap()
-                        .value()
-                        .attr("href")
-                        .unwrap(),
-                    result
-                        .select(&self.parser.result_desc)
-                        .next()
-                        .unwrap()
-                        .inner_html()
-                        .trim(),
-                    &["searx"],
-                )
+        self.parser
+            .parse_for_results(&document, |title, url, desc| {
+                url.value().attr("href").map(|url| {
+                    SearchResult::new(
+                        title.inner_html().trim(),
+                        url,
+                        desc.inner_html().trim(),
+                        &["searx"],
+                    )
+                })
             })
-            .map(|search_result| (search_result.url.clone(), search_result))
-            .collect())
     }
 }
diff --git a/src/models/engine_models.rs b/src/models/engine_models.rs
index 77ec4c4..d53fc0c 100644
--- a/src/models/engine_models.rs
+++ b/src/models/engine_models.rs
@@ -8,7 +8,7 @@ use std::{collections::HashMap, fmt, time::Duration};
 /// A custom error type used for handle engine associated errors.
 #[derive(Debug)]
 pub enum EngineError {
-    // No matching engine found
+    /// No matching engine found
     EngineNotFound,
     /// This variant handles all request related errors like forbidden, not found,
     /// etc.
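
Reviewer note (not part of the patch): the sketch below shows how a future engine could plug into the refactored `SearchResultParser` API, based only on the signatures introduced above. The `Example` engine, its CSS selectors, and the `scrape` helper are hypothetical, and the `SearchEngine` trait implementation (URL construction, fetching, headers) is omitted.

```rust
use std::collections::HashMap;

use error_stack::Result;
use scraper::Html;

use super::search_result_parser::SearchResultParser;
use crate::models::{aggregation_models::SearchResult, engine_models::EngineError};

/// Hypothetical engine demonstrating the new parser API.
pub struct Example {
    /// The parser, used to interpret the search results.
    parser: SearchResultParser,
}

impl Example {
    /// Creates the engine; fails early if any selector is invalid.
    pub fn new() -> Result<Self, EngineError> {
        Ok(Self {
            // Illustrative selectors; a real engine supplies its own.
            parser: SearchResultParser::new(
                ".no-results",      // 'no results found' marker
                ".result",          // one node per search result
                ".result__title",   // title inside a result node
                ".result__url",     // URL inside a result node
                ".result__snippet", // description inside a result node
            )?,
        })
    }

    /// Hypothetical helper: turns a fetched document into results keyed by URL.
    fn scrape(&self, document: &Html) -> Result<HashMap<String, SearchResult>, EngineError> {
        // Returning `None` from the builder skips a malformed result
        // instead of panicking, which is what the old `unwrap()` chains did.
        self.parser.parse_for_results(document, |title, url, desc| {
            Some(SearchResult::new(
                title.inner_html().trim(),
                url.inner_html().trim(),
                desc.inner_html().trim(),
                &["example"],
            ))
        })
    }
}
```

Because the builder closure returns `Option<SearchResult>`, a result missing a title, URL, or description is silently dropped rather than crashing the whole scrape; this is the main behavioral change of the refactor, alongside the removal of the duplicated selector plumbing in each engine.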