mirror of
https://github.com/neon-mmd/websurfx.git
synced 2024-11-21 13:38:21 -05:00
Merge branch 'rolling' into CHORE/461_display-the-user-provided-settings-from-the-config-or-the-ui-in-the-settings-page
This commit is contained in:
commit
7f84c6346d
@ -60,4 +60,4 @@
|
||||
# calls the build function
|
||||
packages.websurfx = packages.default;
|
||||
});
|
||||
|
||||
}
|
||||
|
@ -6,6 +6,7 @@
|
||||
pub mod brave;
|
||||
pub mod duckduckgo;
|
||||
pub mod librex;
|
||||
pub mod mojeek;
|
||||
pub mod search_result_parser;
|
||||
pub mod searx;
|
||||
pub mod startpage;
|
||||
|
151
src/engines/mojeek.rs
Normal file
151
src/engines/mojeek.rs
Normal file
@ -0,0 +1,151 @@
|
||||
//! The `mojeek` module handles the scraping of results from the mojeek search engine
|
||||
//! by querying the upstream mojeek search engine with the user-provided query and with a page
|
||||
//! number if provided.
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
use reqwest::header::HeaderMap;
|
||||
use reqwest::Client;
|
||||
use scraper::Html;
|
||||
|
||||
use crate::models::aggregation_models::SearchResult;
|
||||
|
||||
use crate::models::engine_models::{EngineError, SearchEngine};
|
||||
|
||||
use error_stack::{Report, Result, ResultExt};
|
||||
|
||||
use super::search_result_parser::SearchResultParser;
|
||||
|
||||
/// A new Mojeek engine type defined in-order to implement the `SearchEngine` trait which allows to
|
||||
/// reduce code duplication as well as allows to create vector of different search engines easily.
|
||||
pub struct Mojeek {
|
||||
/// The parser, used to interpret the search result.
|
||||
parser: SearchResultParser,
|
||||
}
|
||||
|
||||
impl Mojeek {
|
||||
/// Creates the Mojeek parser.
|
||||
pub fn new() -> Result<Self, EngineError> {
|
||||
Ok(Self {
|
||||
parser: SearchResultParser::new(
|
||||
".result-col",
|
||||
".results-standard li",
|
||||
"a span.url",
|
||||
"h2 a.title",
|
||||
"p.s",
|
||||
)?,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl SearchEngine for Mojeek {
|
||||
async fn results(
|
||||
&self,
|
||||
query: &str,
|
||||
page: u32,
|
||||
user_agent: &str,
|
||||
client: &Client,
|
||||
safe_search: u8,
|
||||
) -> Result<HashMap<String, SearchResult>, EngineError> {
|
||||
// Mojeek uses `start results from this number` convention
|
||||
// So, for 10 results per page, page 0 starts at 1, page 1
|
||||
// starts at 11, and so on.
|
||||
let results_per_page = 10;
|
||||
let start_result = results_per_page * page + 1;
|
||||
|
||||
let results_per_page = results_per_page.to_string();
|
||||
let start_result = start_result.to_string();
|
||||
|
||||
let search_engines = vec![
|
||||
"Bing",
|
||||
"Brave",
|
||||
"DuckDuckGo",
|
||||
"Ecosia",
|
||||
"Google",
|
||||
"Lilo",
|
||||
"Metager",
|
||||
"Qwant",
|
||||
"Startpage",
|
||||
"Swisscows",
|
||||
"Yandex",
|
||||
"Yep",
|
||||
"You",
|
||||
];
|
||||
let qss = search_engines.join("%2C");
|
||||
let safe = if safe_search == 0 { "0" } else { "1" };
|
||||
|
||||
// Mojeek detects automated requests, these are preferences that are
|
||||
// able to circumvent the countermeasure. Some of these are
|
||||
// not documented in their Search API
|
||||
let query_params: Vec<(&str, &str)> = vec![
|
||||
("t", results_per_page.as_str()),
|
||||
("theme", "dark"),
|
||||
("arc", "none"),
|
||||
("date", "1"),
|
||||
("cdate", "1"),
|
||||
("tlen", "100"),
|
||||
("ref", "1"),
|
||||
("hp", "minimal"),
|
||||
("lb", "en"),
|
||||
("qss", &qss),
|
||||
("safe", safe),
|
||||
];
|
||||
|
||||
let mut query_params_string = String::new();
|
||||
for (k, v) in &query_params {
|
||||
query_params_string.push_str(&format!("&{k}={v}"));
|
||||
}
|
||||
|
||||
let url: String = match page {
|
||||
0 => {
|
||||
format!("https://www.mojeek.com/search?q={query}{query_params_string}")
|
||||
}
|
||||
_ => {
|
||||
format!(
|
||||
"https://www.mojeek.com/search?q={query}&s={start_result}{query_params_string}"
|
||||
)
|
||||
}
|
||||
};
|
||||
|
||||
let mut cookie_string = String::new();
|
||||
for (k, v) in &query_params {
|
||||
cookie_string.push_str(&format!("{k}={v}; "));
|
||||
}
|
||||
|
||||
let header_map = HeaderMap::try_from(&HashMap::from([
|
||||
("USER_AGENT".to_string(), user_agent.to_string()),
|
||||
("REFERER".to_string(), "https://google.com/".to_string()),
|
||||
(
|
||||
"CONTENT_TYPE".to_string(),
|
||||
"application/x-www-form-urlencoded".to_string(),
|
||||
),
|
||||
("COOKIE".to_string(), cookie_string),
|
||||
]))
|
||||
.change_context(EngineError::UnexpectedError)?;
|
||||
|
||||
let document: Html = Html::parse_document(
|
||||
&Mojeek::fetch_html_from_upstream(self, &url, header_map, client).await?,
|
||||
);
|
||||
|
||||
if let Some(no_result_msg) = self.parser.parse_for_no_results(&document).nth(0) {
|
||||
if no_result_msg
|
||||
.inner_html()
|
||||
.contains("No pages found matching:")
|
||||
{
|
||||
return Err(Report::new(EngineError::EmptyResultSet));
|
||||
}
|
||||
}
|
||||
|
||||
// scrape all the results from the html
|
||||
self.parser
|
||||
.parse_for_results(&document, |title, url, desc| {
|
||||
Some(SearchResult::new(
|
||||
title.inner_html().trim(),
|
||||
url.inner_html().trim(),
|
||||
desc.inner_html().trim(),
|
||||
&["mojeek"],
|
||||
))
|
||||
})
|
||||
}
|
||||
}
|
@ -162,6 +162,10 @@ impl EngineHandler {
|
||||
let engine = crate::engines::librex::LibreX::new()?;
|
||||
("librex", Box::new(engine))
|
||||
}
|
||||
"mojeek" => {
|
||||
let engine = crate::engines::mojeek::Mojeek::new()?;
|
||||
("mojeek", Box::new(engine))
|
||||
}
|
||||
_ => {
|
||||
return Err(Report::from(EngineError::NoSuchEngineFound(
|
||||
engine_name.to_string(),
|
||||
|
@ -59,11 +59,11 @@ pub async fn search(
|
||||
)
|
||||
};
|
||||
|
||||
// .max(1) makes sure that the page > 0.
|
||||
let page = params.page.unwrap_or(1).max(1);
|
||||
// .max(1) makes sure that the page >= 0.
|
||||
let page = params.page.unwrap_or(1).max(1) - 1;
|
||||
|
||||
let (_, results, _) = join!(
|
||||
get_results(page - 1),
|
||||
get_results(page.saturating_sub(1)),
|
||||
get_results(page),
|
||||
get_results(page + 1)
|
||||
);
|
||||
|
@ -64,4 +64,5 @@ upstream_search_engines = {
|
||||
Brave = false,
|
||||
Startpage = false,
|
||||
LibreX = false,
|
||||
Mojeek = false,
|
||||
} -- select the upstream search engines from which the results should be fetched.
|
||||
|
Loading…
Reference in New Issue
Block a user