From 709425f60d5ddd26846adf0d46cccfeb1f20b952 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nikodem=20Rabuli=C5=84ski?=
Date: Sat, 7 Sep 2024 16:53:56 +0200
Subject: [PATCH] Implement Qwant search engine

---
 src/engines/mod.rs          |   1 +
 src/engines/qwant.rs        | 163 ++++++++++++++++++++++++++++++++++++
 src/models/engine_models.rs |   4 +
 websurfx/config.lua         |   1 +
 4 files changed, 169 insertions(+)
 create mode 100644 src/engines/qwant.rs

diff --git a/src/engines/mod.rs b/src/engines/mod.rs
index a93c9c2..4cf4bad 100644
--- a/src/engines/mod.rs
+++ b/src/engines/mod.rs
@@ -8,6 +8,7 @@ pub mod brave;
 pub mod duckduckgo;
 pub mod librex;
 pub mod mojeek;
+pub mod qwant;
 pub mod search_result_parser;
 pub mod searx;
 pub mod startpage;
diff --git a/src/engines/qwant.rs b/src/engines/qwant.rs
new file mode 100644
index 0000000..fb79974
--- /dev/null
+++ b/src/engines/qwant.rs
@@ -0,0 +1,163 @@
+//! The `qwant` module handles the scraping of results from the qwant search engine
+//! by querying the upstream qwant search engine with the user-provided query and with a
+//! page number if provided.
+
+use std::collections::HashMap;
+
+use reqwest::header::HeaderMap;
+use reqwest::Client;
+use serde::Deserialize;
+
+use crate::models::aggregation_models::SearchResult;
+
+use crate::models::engine_models::{EngineError, SearchEngine};
+
+use error_stack::{Report, Result, ResultExt};
+
+/// A new Qwant engine type defined in order to implement the `SearchEngine` trait, which
+/// reduces code duplication and makes it easy to create a vector of different search engines.
+pub struct Qwant;
+
+#[derive(Deserialize, Debug)]
+#[serde(rename_all = "camelCase")]
+/// Web page search result
+struct QwantSearchResult {
+    // NOTE: This object also contains `favicon`, `url_ping_suffix`, `thumbnail_url`,
+    // `source`, and `is_family_friendly` attributes,
+    // which we currently don't care about.
+    /// Title of the result
+    title: String,
+    /// URL of the result
+    url: String,
+    /// Description of the result
+    desc: String,
+}
+
+impl From<&QwantSearchResult> for SearchResult {
+    fn from(value: &QwantSearchResult) -> Self {
+        SearchResult::new(&value.title, &value.url, &value.desc, &["qwant"])
+    }
+}
+
+#[derive(Deserialize, Debug)]
+#[serde(rename_all = "snake_case")]
+#[serde(tag = "type")]
+/// A result which should be shown to the user
+enum QwantItem {
+    /// Results containing web pages relevant to the query
+    Web {
+        // NOTE: This object also contains `count` and `serpContextId` attributes,
+        // which we currently don't care about.
+        /// List of web page search results
+        items: Vec<QwantSearchResult>,
+    },
+    #[serde(other)]
+    /// Other item types like "related_searches", which aren't relevant.
+    Other,
+}
+
+#[derive(Deserialize, Debug)]
+struct QwantItems {
+    // NOTE: This object also contains `headline`, `sidebar`, and `bottomline` attributes,
+    // which we currently don't care about.
+    /// Results which should be shown in the main section of the page
+    mainline: Vec<QwantItem>,
+}
+
+#[derive(Deserialize, Debug)]
+struct QwantResult {
+    // NOTE: This object also contains `denied`, `total`, `items`, `filters`, `lastPage`,
+    // `instrumentation`, `onlyProductAds`, and `topClassification` attributes,
+    // which we currently don't care about.
+    /// Entries that should be shown to the user
+    items: QwantItems,
+}
+
+#[derive(Deserialize, Debug)]
+#[serde(rename_all = "snake_case")]
+#[serde(tag = "status", content = "data")]
+enum QwantApiResponse {
+    /// Success response
+    Success {
+        // NOTE: This object also contains `query` and `cache` attributes,
+        // which we currently don't care about.
+        /// Actual results the search produced
+        result: QwantResult,
+    },
+    // TODO: Use the reported error messages
+    #[allow(unused)]
+    /// Error response
+    Error {
+        /// Machine-readable error code
+        error_code: i32,
+        #[serde(default)]
+        /// List of human-readable error messages
+        message: Vec<String>,
+    },
+}
+
+impl From<QwantApiResponse> for Result<QwantResult, EngineError> {
+    fn from(value: QwantApiResponse) -> Self {
+        match value {
+            QwantApiResponse::Success { result } => Ok(result),
+            QwantApiResponse::Error { .. } => Err(Report::new(EngineError::RequestError)),
+        }
+    }
+}
+
+#[async_trait::async_trait]
+impl SearchEngine for Qwant {
+    async fn results(
+        &self,
+        query: &str,
+        page: u32,
+        user_agent: &str,
+        client: &Client,
+        safe_search: u8,
+    ) -> Result<Vec<(String, SearchResult)>, EngineError> {
+        let results_per_page = 10;
+        let start_result = results_per_page * page;
+
+        let url: String = format!("https://api.qwant.com/v3/search/web?q={query}&count={results_per_page}&locale=en_US&offset={start_result}&safesearch={safe_search}&device=desktop&tgp=2&displayed=true");
+
+        let header_map = HeaderMap::try_from(&HashMap::from([
+            ("User-Agent".to_string(), user_agent.to_string()),
+            ("Referer".to_string(), "https://qwant.com/".to_string()),
+            ("Origin".to_string(), "https://qwant.com".to_string()),
+        ]))
+        .change_context(EngineError::UnexpectedError)?;
+
+        let result: QwantApiResponse = client
+            .get(url)
+            .headers(header_map)
+            .send()
+            .await
+            .change_context(EngineError::RequestError)?
+            .json()
+            .await
+            .change_context(EngineError::RequestError)?;
+
+        let result = Result::from(result)?;
+
+        let results: Vec<_> = result
+            .items
+            .mainline
+            .into_iter()
+            .filter_map(|item| match item {
+                QwantItem::Web { items } => Some(items),
+                _ => None,
+            })
+            .flatten()
+            .map(|result| {
+                let search_result = SearchResult::from(&result);
+                (result.url, search_result)
+            })
+            .collect();
+
+        if results.is_empty() {
+            Err(Report::new(EngineError::EmptyResultSet))
+        } else {
+            Ok(results)
+        }
+    }
+}
diff --git a/src/models/engine_models.rs b/src/models/engine_models.rs
index 932afce..bdd8604 100644
--- a/src/models/engine_models.rs
+++ b/src/models/engine_models.rs
@@ -206,6 +206,10 @@ impl EngineHandler {
                 let engine = crate::engines::bing::Bing::new()?;
                 ("bing", Box::new(engine))
             }
+            "qwant" => {
+                let engine = crate::engines::qwant::Qwant;
+                ("qwant", Box::new(engine))
+            }
             _ => {
                 return Err(Report::from(EngineError::NoSuchEngineFound(
                     engine_name.to_string(),
diff --git a/websurfx/config.lua b/websurfx/config.lua
index f346c1f..a6ccf67 100644
--- a/websurfx/config.lua
+++ b/websurfx/config.lua
@@ -72,4 +72,5 @@ upstream_search_engines = {
     LibreX = false,
     Mojeek = false,
     Bing = false,
+    Qwant = false,
 } -- select the upstream search engines from which the results should be fetched.