From ca6b271bc95d2fea81552a302a7b97373926e056 Mon Sep 17 00:00:00 2001 From: Zsombor Date: Tue, 24 Dec 2024 21:49:27 +0100 Subject: [PATCH] Add Wikipedia as a search engine (#633) Currently, it only searches the English Wikipedia, but it can be customized to use different ones. Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> --- src/engines/bing.rs | 6 +-- src/engines/common.rs | 23 ++++++++ src/engines/mod.rs | 2 + src/engines/mojeek.rs | 11 ++-- src/engines/wikipedia.rs | 101 ++++++++++++++++++++++++++++++++++++ src/models/engine_models.rs | 4 ++ websurfx/config.lua | 1 + 7 files changed, 136 insertions(+), 12 deletions(-) create mode 100644 src/engines/common.rs create mode 100644 src/engines/wikipedia.rs diff --git a/src/engines/bing.rs b/src/engines/bing.rs index 50cf048..8cff6d4 100644 --- a/src/engines/bing.rs +++ b/src/engines/bing.rs @@ -15,6 +15,7 @@ use crate::models::engine_models::{EngineError, SearchEngine}; use error_stack::{Report, Result, ResultExt}; +use super::common::build_cookie; use super::search_result_parser::SearchResultParser; /// A new Bing engine type defined in-order to implement the `SearchEngine` trait which allows to @@ -73,10 +74,7 @@ impl SearchEngine for Bing { ("_UR=QS=0&TQS", "0"), ]; - let mut cookie_string = String::new(); - for (k, v) in &query_params { - cookie_string.push_str(&format!("{k}={v}; ")); - } + let cookie_string = build_cookie(&query_params); let header_map = HeaderMap::try_from(&HashMap::from([ ("User-Agent".to_string(), user_agent.to_string()), diff --git a/src/engines/common.rs b/src/engines/common.rs new file mode 100644 index 0000000..5a4e923 --- /dev/null +++ b/src/engines/common.rs @@ -0,0 +1,23 @@ +//! This module provides common functionalities for engines + +/** + * Build a query from a list of key value pairs. 
+ */ +pub fn build_query(query_params: &[(&str, &str)]) -> String { + let mut query_params_string = String::new(); + for (k, v) in query_params { + query_params_string.push_str(&format!("&{k}={v}")); + } + query_params_string +} + +/** + * Build a cookie string from a list of key-value pairs: each pair is appended as `k=v; ` (trailing separator kept, no escaping applied). + */ +pub fn build_cookie(cookie_params: &[(&str, &str)]) -> String { + let mut cookie_string = String::new(); + for (k, v) in cookie_params { + cookie_string.push_str(&format!("{k}={v}; ")); + } + cookie_string +} diff --git a/src/engines/mod.rs b/src/engines/mod.rs index a93c9c2..7ae558a 100644 --- a/src/engines/mod.rs +++ b/src/engines/mod.rs @@ -5,9 +5,11 @@ pub mod bing; pub mod brave; +pub mod common; pub mod duckduckgo; pub mod librex; pub mod mojeek; pub mod search_result_parser; pub mod searx; pub mod startpage; +pub mod wikipedia; diff --git a/src/engines/mojeek.rs b/src/engines/mojeek.rs index 60422d0..8108d5a 100644 --- a/src/engines/mojeek.rs +++ b/src/engines/mojeek.rs @@ -14,6 +14,7 @@ use crate::models::engine_models::{EngineError, SearchEngine}; use error_stack::{Report, Result, ResultExt}; +use super::common::{build_cookie, build_query}; use super::search_result_parser::SearchResultParser; /// A new Mojeek engine type defined in-order to implement the `SearchEngine` trait which allows to @@ -107,10 +108,7 @@ impl SearchEngine for Mojeek { ("safe", &safe), ]; - let mut query_params_string = String::new(); - for (k, v) in &query_params { - query_params_string.push_str(&format!("&{k}={v}")); - } + let query_params_string = build_query(&query_params); let url: String = match page { 0 => { @@ -123,10 +121,7 @@ impl SearchEngine for Mojeek { } }; - let mut cookie_string = String::new(); - for (k, v) in &query_params { - cookie_string.push_str(&format!("{k}={v}; ")); - } + let cookie_string = build_cookie(&query_params); let header_map = HeaderMap::try_from(&HashMap::from([ ("User-Agent".to_string(), user_agent.to_string()), diff --git a/src/engines/wikipedia.rs 
b/src/engines/wikipedia.rs new file mode 100644 index 0000000..587e114 --- /dev/null +++ b/src/engines/wikipedia.rs @@ -0,0 +1,101 @@ +//! The `wikipedia` module handles the scraping of results from wikipedia +//! with user provided query and with a page number if provided. + +use std::collections::HashMap; + +use reqwest::header::HeaderMap; +use reqwest::Client; +use scraper::Html; + +use crate::models::aggregation_models::SearchResult; + +use crate::models::engine_models::{EngineError, SearchEngine}; + +use error_stack::{Report, Result, ResultExt}; + +use super::common::build_query; +use super::search_result_parser::SearchResultParser; + +/// A new Wikipedia engine type defined in-order to implement the `SearchEngine` trait which allows to +/// reduce code duplication as well as allows to create vector of different search engines easily. +pub struct Wikipedia { + /// The parser, used to interpret the search result. + parser: SearchResultParser, + /// The id of the engine: `wikipedia-` followed by the language code given to `new`. + id: String, + /// The base URL of the Wikipedia edition that is searched, built as `https://{language}.wikipedia.org`. + host: String, +} + +impl Wikipedia { + /// Creates a Wikipedia engine for the given language code (e.g. `en`), which selects the host and is appended to the engine id. 
+ pub fn new(language: &str) -> Result { + let host = format!("https://{}.wikipedia.org", &language); + let id = format!("wikipedia-{}", &language); + Ok(Self { + parser: SearchResultParser::new( + "p.mw-search-nonefound", + ".mw-search-results li.mw-search-result", + ".mw-search-result-heading a", + ".mw-search-result-heading a", + ".searchresult", + )?, + id, + host, + }) + } +} + +#[async_trait::async_trait] +impl SearchEngine for Wikipedia { + async fn results( + &self, + query: &str, + page: u32, + user_agent: &str, + client: &Client, + _safe_search: u8, + ) -> Result, EngineError> { + let header_map = HeaderMap::try_from(&HashMap::from([ + ("User-Agent".to_string(), user_agent.to_string()), + ("Referer".to_string(), self.host.to_string()), + ])) + .change_context(EngineError::UnexpectedError)?; + + let offset = (page * 20).to_string(); + let query_params: Vec<(&str, &str)> = vec![ + ("limit", "20"), + ("offset", &offset), + ("profile", "default"), + ("search", query), + ("title", "Special:Search"), + ("ns0", "1"), + ]; + + let query_params_string = build_query(&query_params); + + let url: String = format!("{}/w/index.php?{}", self.host, query_params_string); + + let document: Html = Html::parse_document( + &Wikipedia::fetch_html_from_upstream(self, &url, header_map, client).await?, + ); + + if self.parser.parse_for_no_results(&document).next().is_some() { + return Err(Report::new(EngineError::EmptyResultSet)); + } + + // Scrape every result from the HTML; each `href` is prefixed with `self.host` to form the result URL. + self.parser + .parse_for_results(&document, |title, url, desc| { + let found_url = url.attr("href"); + found_url.map(|relative_url| { + SearchResult::new( + title.inner_html().trim(), + &format!("{}{relative_url}", self.host), + desc.inner_html().trim(), + &[&self.id], + ) + }) + }) + } +} diff --git a/src/models/engine_models.rs b/src/models/engine_models.rs index 932afce..72966c5 100644 --- a/src/models/engine_models.rs +++ b/src/models/engine_models.rs @@ -206,6 +206,10 @@ impl EngineHandler { let 
engine = crate::engines::bing::Bing::new()?; ("bing", Box::new(engine)) } + "wikipedia" => { + let engine = crate::engines::wikipedia::Wikipedia::new("en")?; + ("wikipedia", Box::new(engine)) + } _ => { return Err(Report::from(EngineError::NoSuchEngineFound( engine_name.to_string(), diff --git a/websurfx/config.lua b/websurfx/config.lua index 632474c..5b1220e 100644 --- a/websurfx/config.lua +++ b/websurfx/config.lua @@ -76,6 +76,7 @@ upstream_search_engines = { LibreX = false, Mojeek = false, Bing = false, + Wikipedia = true, } -- select the upstream search engines from which the results should be fetched. proxy = nil -- Proxy to send outgoing requests through. Set to nil to disable. \ No newline at end of file