0
0
mirror of https://github.com/neon-mmd/websurfx.git synced 2024-11-21 13:38:21 -05:00

Compare commits

...

7 Commits

Author SHA1 Message Date
Nikodem Rabuliński
52398820fc
Merge 4857dcb976 into 913ca1b075 2024-10-02 08:25:17 +00:00
mergify[bot]
4857dcb976
Merge branch 'rolling' into qwant 2024-10-02 08:25:14 +00:00
dependabot[bot]
913ca1b075
build(deps): bump tempfile from 3.12.0 to 3.13.0 (#612) 2024-10-02 08:24:27 +00:00
mergify[bot]
8323f49133
Merge branch 'rolling' into qwant 2024-09-10 16:03:40 +00:00
Nikodem Rabuliński
afefd023e9
engines/qwant: Parse url instead of using format
This makes sure that if a user's query contains `&` or any other symbol with
special meaning in a URL, the query is encoded properly and doesn't get broken
2024-09-07 17:14:00 +02:00
Nikodem Rabuliński
709425f60d
Implement Qwant search engine 2024-09-07 17:14:00 +02:00
Nikodem Rabuliński
fb0c2db08e
Enable json feature of reqwest 2024-09-07 17:14:00 +02:00
6 changed files with 190 additions and 6 deletions

8
Cargo.lock generated
View File

@ -3355,9 +3355,9 @@ dependencies = [
[[package]]
name = "rustix"
version = "0.38.36"
version = "0.38.37"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f55e80d50763938498dd5ebb18647174e0c76dc38c5505294bb224624f30f36"
checksum = "8acb788b847c24f28525660c4d7758620a7210875711f79e7f663cc152726811"
dependencies = [
"bitflags 2.6.0",
"errno",
@ -3851,9 +3851,9 @@ checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417"
[[package]]
name = "tempfile"
version = "3.12.0"
version = "3.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04cbcdd0c794ebb0d4cf35e88edd2f7d2c4c3e9a5a6dab322839b321c6a87a64"
checksum = "f0f2c9fc62d0beef6951ccffd757e241266a2c833136efbe35af6cd2567dca5b"
dependencies = [
"cfg-if 1.0.0",
"fastrand",

View File

@ -17,7 +17,8 @@ reqwest = { version = "0.12.5", default-features = false, features = [
"rustls-tls",
"brotli",
"gzip",
"http2"
"http2",
"json"
] }
tokio = { version = "1.32.0", features = [
"rt-multi-thread",
@ -91,7 +92,7 @@ itertools = {version = "0.13.0", default-features = false}
[dev-dependencies]
rusty-hook = { version = "^0.11.2", default-features = false }
criterion = { version = "0.5.1", default-features = false }
tempfile = { version = "3.10.1", default-features = false }
tempfile = { version = "3.13.0", default-features = false }
[build-dependencies]
lightningcss = { version = "1.0.0-alpha.57", default-features = false, features = [

View File

@ -8,6 +8,7 @@ pub mod brave;
pub mod duckduckgo;
pub mod librex;
pub mod mojeek;
pub mod qwant;
pub mod search_result_parser;
pub mod searx;
pub mod startpage;

177
src/engines/qwant.rs Normal file
View File

@ -0,0 +1,177 @@
//! The `qwant` module handles fetching results from the Qwant search engine
//! by querying the upstream Qwant API with the user-provided query and the
//! requested page number.
use std::borrow::Cow;
use std::collections::HashMap;
use reqwest::header::HeaderMap;
use reqwest::{Client, Url};
use serde::Deserialize;
use crate::models::aggregation_models::SearchResult;
use crate::models::engine_models::{EngineError, SearchEngine};
use error_stack::{Report, Result, ResultExt};
/// Unit type representing the Qwant search engine.
///
/// It exists so that the `SearchEngine` trait can be implemented for Qwant, which reduces
/// code duplication and makes it easy to build a vector of different search engines.
pub struct Qwant;
/// A single web page search result, deserialized from the Qwant API response.
#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct QwantSearchResult {
    // NOTE: This object also contains `favicon`, `url_ping_suffix`, `thumbnail_url`,
    // `source`, and `is_family_friendly` attributes,
    // which we currently don't care about.
    /// Title of the result page
    title: String,
    /// Address of the result page
    url: String,
    /// Short description (snippet) of the result page
    desc: String,
}
impl From<&QwantSearchResult> for SearchResult {
fn from(value: &QwantSearchResult) -> Self {
SearchResult::new(&value.title, &value.url, &value.desc, &["qwant"])
}
}
/// An entry of the result page, discriminated by its `type` field.
#[derive(Debug, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
enum QwantItem {
    /// Results containing web pages relevant to the query
    Web {
        // NOTE: This object also contains `count` and `serpContextId` attributes,
        // which we currently don't care about.
        /// List of web page search results
        items: Vec<QwantSearchResult>,
    },
    /// Any other item type, like "related_searches", which isn't relevant.
    #[serde(other)]
    Other,
}
/// Container for the groups of items that make up a result page.
#[derive(Debug, Deserialize)]
struct QwantItems {
    // NOTE: This object also contains `headline`, `sidebar`, and `bottomline` attributes,
    // which we currently don't care about.
    /// Results which should be shown in the main section of the page
    mainline: Vec<QwantItem>,
}
/// The `result` object carried by a successful API response.
#[derive(Debug, Deserialize)]
struct QwantResult {
    // NOTE: This object also contains `denied`, `total`, `items`, `filters`, `lastPage`,
    // `instrumentation`, `onlyProductAds`, and `topClassification` attributes,
    // which we currently don't care about.
    /// Entries that should be shown to the user
    items: QwantItems,
}
/// Top-level envelope of a Qwant API response.
///
/// The `status` field selects the variant and the `data` field holds its payload.
#[derive(Debug, Deserialize)]
#[serde(tag = "status", content = "data", rename_all = "snake_case")]
enum QwantApiResponse {
    /// Success response
    Success {
        // NOTE: This object also contains `query` and `cache` attributes,
        // which we currently don't care about.
        /// Actual results the search produced
        result: QwantResult,
    },
    // TODO: Use the reported error messages
    /// Error response
    #[allow(unused)]
    Error {
        /// Machine-readable error code
        error_code: i32,
        /// List of human-readable error messages (empty when the field is absent)
        #[serde(default)]
        message: Vec<String>,
    },
}
impl From<QwantApiResponse> for Result<QwantResult, EngineError> {
fn from(value: QwantApiResponse) -> Self {
match value {
QwantApiResponse::Success { result } => Ok(result),
QwantApiResponse::Error { .. } => Err(Report::new(EngineError::RequestError)),
}
}
}
#[async_trait::async_trait]
impl SearchEngine for Qwant {
    /// Fetches one page of web results for `query` from the Qwant JSON API.
    ///
    /// # Arguments
    ///
    /// * `query` - The user's search query; it is passed as a URL query parameter via
    ///   `Url::parse_with_params`, so it is not spliced into the URL by hand.
    /// * `page` - Page number; it is multiplied by the page size to get the result offset,
    ///   so page `0` maps to offset `0`.
    /// * `user_agent` - Value sent in the `User-Agent` header.
    /// * `client` - The `reqwest` client used to send the request.
    /// * `safe_search` - Safe search level; values above `2` are sent upstream as `2`.
    ///
    /// # Errors
    ///
    /// * `EngineError::UnexpectedError` if the URL or the header map cannot be built.
    /// * `EngineError::RequestError` if the request fails, the body cannot be decoded as
    ///   the expected JSON, or the API reports an error.
    /// * `EngineError::EmptyResultSet` if the response contains no web results.
    async fn results(
        &self,
        query: &str,
        page: u32,
        user_agent: &str,
        client: &Client,
        safe_search: u8,
    ) -> Result<Vec<(String, SearchResult)>, EngineError> {
        /// Number of results requested per page.
        const RESULTS_PER_PAGE: u32 = 10;
        /// Highest safe search level forwarded to Qwant.
        const MAX_UPSTREAM_SAFE_SEARCH: u8 = 2;

        // `page` comes from the caller; saturate rather than overflow (a panic in debug
        // builds, a wrapped and therefore wrong offset in release builds).
        let start_result = RESULTS_PER_PAGE.saturating_mul(page);

        // NOTE(review): Qwant's `safesearch` parameter is assumed to only accept 0-2;
        // higher application-level values are mapped to the strictest upstream level
        // instead of being forwarded verbatim. Confirm against the upstream API.
        let upstream_safe_search = safe_search.min(MAX_UPSTREAM_SAFE_SEARCH);

        let url = Url::parse_with_params(
            "https://api.qwant.com/v3/search/web",
            &[
                ("q", Cow::from(query)),
                ("count", RESULTS_PER_PAGE.to_string().into()),
                ("locale", "en_US".into()),
                ("offset", start_result.to_string().into()),
                ("safesearch", upstream_safe_search.to_string().into()),
                ("device", "desktop".into()),
                ("tgb", "2".into()),
                ("displayed", "true".into()),
            ],
        )
        .change_context(EngineError::UnexpectedError)?;

        let header_map = HeaderMap::try_from(&HashMap::from([
            ("User-Agent".to_string(), user_agent.to_string()),
            ("Referer".to_string(), "https://qwant.com/".to_string()),
            ("Origin".to_string(), "https://qwant.com".to_string()),
        ]))
        .change_context(EngineError::UnexpectedError)?;

        let response: QwantApiResponse = client
            .get(url)
            .headers(header_map)
            .send()
            .await
            .change_context(EngineError::RequestError)?
            .json()
            .await
            .change_context(EngineError::RequestError)?;

        // Turn an API-level error envelope into an `Err` before looking at the items.
        let result = Result::from(response)?;

        let results: Vec<_> = result
            .items
            .mainline
            .into_iter()
            // Only the `Web` groups carry page results; matching exhaustively makes the
            // compiler flag this spot if a new variant is ever added.
            .flat_map(|item| match item {
                QwantItem::Web { items } => items,
                QwantItem::Other => Vec::new(),
            })
            .map(|result| {
                // Convert from a borrow first so that `url` can then be moved out as the key.
                let search_result = SearchResult::from(&result);
                (result.url, search_result)
            })
            .collect();

        if results.is_empty() {
            Err(Report::new(EngineError::EmptyResultSet))
        } else {
            Ok(results)
        }
    }
}

View File

@ -206,6 +206,10 @@ impl EngineHandler {
let engine = crate::engines::bing::Bing::new()?;
("bing", Box::new(engine))
}
"qwant" => {
let engine = crate::engines::qwant::Qwant;
("qwant", Box::new(engine))
}
_ => {
return Err(Report::from(EngineError::NoSuchEngineFound(
engine_name.to_string(),

View File

@ -74,4 +74,5 @@ upstream_search_engines = {
LibreX = false,
Mojeek = false,
Bing = false,
Qwant = false,
} -- select the upstream search engines from which the results should be fetched.