Mirror of https://github.com/neon-mmd/websurfx.git
Synced 2024-11-21 21:48:21 -05:00

Merge pull request #388 from neon-mmd/PERF/384_optimize-the-performance-of-fetching-results-in-the-websurfx-search-engine-backend

⚡️ Optimize the performance of fetching results in the `websurfx` search engine backend

Commit 07bbea8f9b
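The change replaces per-request construction of `reqwest::Client::new()` with a single prebuilt `Client`, initialized once in a `std::sync::OnceLock` and shared by every engine request, with the request timeout, HTTPS-only mode, and gzip/brotli response compression configured at build time. Below is a minimal standalone sketch of that pattern, assuming reqwest is compiled with the `gzip` and `brotli` features added in the Cargo.toml diff; `shared_client` and the example URL are illustrative names, not part of the websurfx codebase.

```rust
use std::sync::OnceLock;
use std::time::Duration;

use reqwest::{Client, ClientBuilder};

/// Holds the prebuilt `Client` for the lifetime of the process,
/// so its connection pool and TLS sessions are reused across requests.
static CLIENT: OnceLock<Client> = OnceLock::new();

/// Returns the shared client, building it on first use. Note that
/// `request_timeout` is only honored by the first caller; later calls
/// reuse the already-built client (a property of `OnceLock`).
fn shared_client(request_timeout: u8) -> &'static Client {
    CLIENT.get_or_init(|| {
        ClientBuilder::new()
            .timeout(Duration::from_secs(request_timeout as u64)) // applies to every request
            .https_only(true) // refuse plaintext upstreams
            .gzip(true) // advertise Accept-Encoding: gzip, decompress transparently
            .brotli(true) // likewise for brotli
            .build()
            .expect("failed to build the HTTP client")
    })
}

#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    // Both requests share one connection pool instead of constructing
    // (and tearing down) a fresh client per request.
    let first = shared_client(30).get("https://example.com").send().await?.text().await?;
    let second = shared_client(60).get("https://example.com").send().await?.text().await?;
    println!("fetched {} and {} bytes", first.len(), second.len());
    Ok(())
}
```

Reusing one client lets reqwest keep connections and TLS sessions warm across searches, which is where the fetch-time savings come from; in websurfx the equivalent builder call lives in `aggregate` (see the aggregator diff below), so the client is created lazily on the first search.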
Cargo.lock (generated, 62 changed lines)
```diff
@@ -284,6 +284,21 @@ dependencies = [
  "memchr",
 ]
 
+[[package]]
+name = "alloc-no-stdlib"
+version = "2.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3"
+
+[[package]]
+name = "alloc-stdlib"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece"
+dependencies = [
+ "alloc-no-stdlib",
+]
+
 [[package]]
 name = "anes"
 version = "0.1.6"
@@ -314,6 +329,20 @@ version = "0.10.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "619743e34b5ba4e9703bba34deac3427c72507c7159f5fd030aea8cac0cfe341"
 
+[[package]]
+name = "async-compression"
+version = "0.4.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bc2d0cfb2a7388d34f590e76686704c494ed7aaceed62ee1ba35cbf363abc2a5"
+dependencies = [
+ "brotli",
+ "flate2",
+ "futures-core",
+ "memchr",
+ "pin-project-lite",
+ "tokio 1.34.0",
+]
+
 [[package]]
 name = "async-once-cell"
 version = "0.5.3"
@@ -412,6 +441,27 @@ dependencies = [
  "generic-array",
 ]
 
+[[package]]
+name = "brotli"
+version = "3.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "516074a47ef4bce09577a3b379392300159ce5b1ba2e501ff1c819950066100f"
+dependencies = [
+ "alloc-no-stdlib",
+ "alloc-stdlib",
+ "brotli-decompressor",
+]
+
+[[package]]
+name = "brotli-decompressor"
+version = "2.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4e2e4afe60d7dd600fdd3de8d0f08c2b7ec039712e3b6137ff98b7004e82de4f"
+dependencies = [
+ "alloc-no-stdlib",
+ "alloc-stdlib",
+]
+
 [[package]]
 name = "bstr"
 version = "1.8.0"
@@ -477,9 +527,9 @@ dependencies = [
 
 [[package]]
 name = "cargo-platform"
-version = "0.1.4"
+version = "0.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "12024c4645c97566567129c204f65d5815a8c9aecf30fcbe682b2fe034996d36"
+checksum = "e34637b3140142bdf929fb439e8aa4ebad7651ebf7b1080b3930aa16ac1459ff"
 dependencies = [
  "serde",
 ]
@@ -2819,6 +2869,7 @@ version = "0.11.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "046cd98826c46c2ac8ddecae268eb5c2e58628688a5fc7a2643704a73faba95b"
 dependencies = [
+ "async-compression",
  "base64 0.21.5",
  "bytes 1.5.0",
  "encoding_rs",
@@ -2844,6 +2895,7 @@ dependencies = [
  "system-configuration",
  "tokio 1.34.0",
  "tokio-rustls",
+ "tokio-util",
  "tower-service",
  "url 2.4.1",
  "wasm-bindgen",
@@ -2899,9 +2951,9 @@ dependencies = [
 
 [[package]]
 name = "rustix"
-version = "0.38.24"
+version = "0.38.25"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9ad981d6c340a49cdc40a1028d9c6084ec7e9fa33fcb839cab656a267071e234"
+checksum = "dc99bc2d4f1fed22595588a013687477aedf3cdcfb26558c559edb67b4d9b22e"
 dependencies = [
  "bitflags 2.4.1",
  "errno",
@@ -3989,7 +4041,7 @@ checksum = "14247bb57be4f377dfb94c72830b8ce8fc6beac03cf4bf7b9732eadd414123fc"
 
 [[package]]
 name = "websurfx"
-version = "1.2.26"
+version = "1.2.27"
 dependencies = [
  "actix-cors",
  "actix-files",
```
Cargo.toml

```diff
@@ -1,6 +1,6 @@
 [package]
 name = "websurfx"
-version = "1.2.26"
+version = "1.2.27"
 edition = "2021"
 description = "An open-source alternative to Searx that provides clean, ad-free, and organic results with incredible speed while keeping privacy and security in mind."
 repository = "https://github.com/neon-mmd/websurfx"
@@ -13,7 +13,7 @@ bench = false
 path = "src/bin/websurfx.rs"
 
 [dependencies]
-reqwest = {version="0.11.22", default-features=false, features=["rustls-tls"]}
+reqwest = {version="0.11.22", default-features=false, features=["rustls-tls","brotli", "gzip"]}
 tokio = {version="1.32.0",features=["rt-multi-thread","macros"], default-features = false}
 serde = {version="1.0.190", default-features=false, features=["derive"]}
 serde_json = {version="1.0.108", default-features=false}
```
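The two new reqwest features are what pull `async-compression`, `brotli`, and `brotli-decompressor` into the Cargo.lock diff above, and they gate the `gzip`/`brotli` methods on `ClientBuilder` used later in the aggregator. A minimal sketch of the connection (the builder methods are reqwest's real API; the snippet itself is illustrative):

```rust
use reqwest::ClientBuilder;

fn main() -> Result<(), reqwest::Error> {
    // gzip(true)/brotli(true) compile only when the matching Cargo
    // features are enabled, as the dependency line above now does.
    // With them on, reqwest sends Accept-Encoding and transparently
    // decompresses gzip/brotli response bodies.
    let _client = ClientBuilder::new()
        .gzip(true) // requires the "gzip" feature
        .brotli(true) // requires the "brotli" feature
        .build()?;
    Ok(())
}
```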
src/engines/brave.rs

```diff
@@ -4,7 +4,7 @@
 
 use std::collections::HashMap;
 
-use reqwest::header::HeaderMap;
+use reqwest::{header::HeaderMap, Client};
 use scraper::Html;
 
 use crate::models::aggregation_models::SearchResult;
@@ -42,7 +42,7 @@ impl SearchEngine for Brave {
         query: &str,
         page: u32,
         user_agent: &str,
-        request_timeout: u8,
+        client: &Client,
         safe_search: u8,
     ) -> Result<HashMap<String, SearchResult>, EngineError> {
         let url = format!("https://search.brave.com/search?q={query}&offset={page}");
@@ -68,7 +68,7 @@ impl SearchEngine for Brave {
             .change_context(EngineError::UnexpectedError)?;
 
         let document: Html = Html::parse_document(
-            &Brave::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
+            &Brave::fetch_html_from_upstream(self, &url, header_map, client).await?,
         );
 
         if let Some(no_result_msg) = self.parser.parse_for_no_results(&document).nth(0) {
```
src/engines/duckduckgo.rs

```diff
@@ -5,6 +5,7 @@
 use std::collections::HashMap;
 
 use reqwest::header::HeaderMap;
+use reqwest::Client;
 use scraper::Html;
 
 use crate::models::aggregation_models::SearchResult;
@@ -44,7 +45,7 @@ impl SearchEngine for DuckDuckGo {
         query: &str,
         page: u32,
         user_agent: &str,
-        request_timeout: u8,
+        client: &Client,
         _safe_search: u8,
     ) -> Result<HashMap<String, SearchResult>, EngineError> {
         // Page number can be missing or empty string and so appropriate handling is required
@@ -76,7 +77,7 @@ impl SearchEngine for DuckDuckGo {
             .change_context(EngineError::UnexpectedError)?;
 
         let document: Html = Html::parse_document(
-            &DuckDuckGo::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
+            &DuckDuckGo::fetch_html_from_upstream(self, &url, header_map, client).await?,
         );
 
         if self.parser.parse_for_no_results(&document).next().is_some() {
```
src/engines/searx.rs

```diff
@@ -3,6 +3,7 @@
 //! number if provided.
 
 use reqwest::header::HeaderMap;
+use reqwest::Client;
 use scraper::Html;
 use std::collections::HashMap;
 
@@ -40,7 +41,7 @@ impl SearchEngine for Searx {
         query: &str,
         page: u32,
         user_agent: &str,
-        request_timeout: u8,
+        client: &Client,
         mut safe_search: u8,
     ) -> Result<HashMap<String, SearchResult>, EngineError> {
         // Page number can be missing or empty string and so appropriate handling is required
@@ -68,7 +69,7 @@ impl SearchEngine for Searx {
             .change_context(EngineError::UnexpectedError)?;
 
         let document: Html = Html::parse_document(
-            &Searx::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
+            &Searx::fetch_html_from_upstream(self, &url, header_map, client).await?,
         );
 
         if let Some(no_result_msg) = self.parser.parse_for_no_results(&document).nth(1) {
```
src/models/engine_models.rs

```diff
@@ -3,7 +3,8 @@
 
 use super::aggregation_models::SearchResult;
 use error_stack::{Report, Result, ResultExt};
-use std::{collections::HashMap, fmt, time::Duration};
+use reqwest::Client;
+use std::{collections::HashMap, fmt};
 
 /// A custom error type used for handle engine associated errors.
 #[derive(Debug)]
@@ -71,12 +72,11 @@ pub trait SearchEngine: Sync + Send {
         &self,
         url: &str,
         header_map: reqwest::header::HeaderMap,
-        request_timeout: u8,
+        client: &Client,
     ) -> Result<String, EngineError> {
         // fetch the html from upstream search engine
-        Ok(reqwest::Client::new()
+        Ok(client
             .get(url)
-            .timeout(Duration::from_secs(request_timeout as u64)) // Add timeout to request to avoid DDOSing the server
             .headers(header_map) // add spoofed headers to emulate human behavior
             .send()
             .await
@@ -109,7 +109,7 @@ pub trait SearchEngine: Sync + Send {
         query: &str,
         page: u32,
         user_agent: &str,
-        request_timeout: u8,
+        client: &Client,
         safe_search: u8,
     ) -> Result<HashMap<String, SearchResult>, EngineError>;
 }
```
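After this change, the default `fetch_html_from_upstream` body borrows the caller's client instead of constructing one per call, and the timeout moves to client construction time. A hedged reconstruction of that shape in isolation (the error-stack handling and trait context are simplified away; only reqwest's calls are real, and the function name here is illustrative):

```rust
use reqwest::{header::HeaderMap, Client};

/// Fetches a page with a caller-supplied client, mirroring the
/// refactored `fetch_html_from_upstream`: no per-call `Client::new()`
/// and no per-call `.timeout(...)`, since the shared client already
/// carries its timeout from `ClientBuilder`.
async fn fetch_html(
    client: &Client,
    url: &str,
    header_map: HeaderMap,
) -> Result<String, reqwest::Error> {
    client
        .get(url)
        .headers(header_map) // spoofed headers to emulate human behavior
        .send()
        .await?
        .text()
        .await
}

#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    let client = Client::new(); // in websurfx this comes from the global OnceLock
    let html = fetch_html(&client, "https://example.com", HeaderMap::new()).await?;
    println!("{} bytes of HTML", html.len());
    Ok(())
}
```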
src/results/aggregator.rs

```diff
@@ -9,6 +9,7 @@ use crate::models::{
 };
 use error_stack::Report;
 use regex::Regex;
+use reqwest::{Client, ClientBuilder};
 use std::time::{SystemTime, UNIX_EPOCH};
 use std::{
     collections::HashMap,
@@ -18,6 +19,9 @@ use std::{
 use std::{fs::File, io::BufRead};
 use tokio::task::JoinHandle;
 
+/// A constant for holding the prebuilt Client globally in the app.
+static CLIENT: std::sync::OnceLock<Client> = std::sync::OnceLock::new();
+
 /// Aliases for long type annotations
 type FutureVec = Vec<JoinHandle<Result<HashMap<String, SearchResult>, Report<EngineError>>>>;
 
@@ -68,6 +72,16 @@ pub async fn aggregate(
     request_timeout: u8,
     safe_search: u8,
 ) -> Result<SearchResults, Box<dyn std::error::Error>> {
+    let client = CLIENT.get_or_init(|| {
+        ClientBuilder::new()
+            .timeout(Duration::from_secs(request_timeout as u64)) // Add timeout to request to avoid DDOSing the server
+            .https_only(true)
+            .gzip(true)
+            .brotli(true)
+            .build()
+            .unwrap()
+    });
+
     let user_agent: &str = random_user_agent();
 
     // Add a random delay before making the request.
@@ -88,7 +102,7 @@ pub async fn aggregate(
         let query: String = query.to_owned();
         tasks.push(tokio::spawn(async move {
             search_engine
-                .results(&query, page, user_agent, request_timeout, safe_search)
+                .results(&query, page, user_agent, client, safe_search)
                 .await
         }));
     }
```
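One consequence of `OnceLock::get_or_init` in the hunk above is worth noting: the closure runs only on the first `aggregate` call, so the `request_timeout` in effect at that moment is baked into the client for the life of the process, and later calls with a different timeout reuse the existing client. A minimal sketch of that first-caller-wins behavior (names are illustrative):

```rust
use std::sync::OnceLock;

static VALUE: OnceLock<u64> = OnceLock::new();

fn init_with(timeout: u64) -> u64 {
    // Only the first caller's argument is ever stored.
    *VALUE.get_or_init(|| timeout)
}

fn main() {
    assert_eq!(init_with(30), 30); // first caller initializes
    assert_eq!(init_with(60), 30); // later values are ignored
    println!("ok");
}
```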