0
0
mirror of https://github.com/neon-mmd/websurfx.git synced 2024-11-25 15:38:21 -05:00

Merge branch 'rolling' into rolling

This commit is contained in:
alamin655 2023-11-20 21:12:47 +05:30 committed by GitHub
commit d28cbb96a1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 86 additions and 18 deletions

60
Cargo.lock generated
View File

@ -284,6 +284,21 @@ dependencies = [
"memchr", "memchr",
] ]
[[package]]
name = "alloc-no-stdlib"
version = "2.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3"
[[package]]
name = "alloc-stdlib"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece"
dependencies = [
"alloc-no-stdlib",
]
[[package]] [[package]]
name = "anes" name = "anes"
version = "0.1.6" version = "0.1.6"
@ -326,6 +341,20 @@ version = "0.10.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "619743e34b5ba4e9703bba34deac3427c72507c7159f5fd030aea8cac0cfe341" checksum = "619743e34b5ba4e9703bba34deac3427c72507c7159f5fd030aea8cac0cfe341"
[[package]]
name = "async-compression"
version = "0.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bc2d0cfb2a7388d34f590e76686704c494ed7aaceed62ee1ba35cbf363abc2a5"
dependencies = [
"brotli",
"flate2",
"futures-core",
"memchr",
"pin-project-lite",
"tokio 1.34.0",
]
[[package]] [[package]]
name = "async-once-cell" name = "async-once-cell"
version = "0.5.3" version = "0.5.3"
@ -437,6 +466,27 @@ dependencies = [
"generic-array", "generic-array",
] ]
[[package]]
name = "brotli"
version = "3.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "516074a47ef4bce09577a3b379392300159ce5b1ba2e501ff1c819950066100f"
dependencies = [
"alloc-no-stdlib",
"alloc-stdlib",
"brotli-decompressor",
]
[[package]]
name = "brotli-decompressor"
version = "2.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4e2e4afe60d7dd600fdd3de8d0f08c2b7ec039712e3b6137ff98b7004e82de4f"
dependencies = [
"alloc-no-stdlib",
"alloc-stdlib",
]
[[package]] [[package]]
name = "bstr" name = "bstr"
version = "1.7.0" version = "1.7.0"
@ -502,9 +552,9 @@ dependencies = [
[[package]] [[package]]
name = "cargo-platform" name = "cargo-platform"
version = "0.1.4" version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "12024c4645c97566567129c204f65d5815a8c9aecf30fcbe682b2fe034996d36" checksum = "e34637b3140142bdf929fb439e8aa4ebad7651ebf7b1080b3930aa16ac1459ff"
dependencies = [ dependencies = [
"serde", "serde",
] ]
@ -2840,6 +2890,7 @@ version = "0.11.22"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "046cd98826c46c2ac8ddecae268eb5c2e58628688a5fc7a2643704a73faba95b" checksum = "046cd98826c46c2ac8ddecae268eb5c2e58628688a5fc7a2643704a73faba95b"
dependencies = [ dependencies = [
"async-compression",
"base64 0.21.5", "base64 0.21.5",
"bytes 1.5.0", "bytes 1.5.0",
"encoding_rs", "encoding_rs",
@ -2865,6 +2916,7 @@ dependencies = [
"system-configuration", "system-configuration",
"tokio 1.33.0", "tokio 1.33.0",
"tokio-rustls", "tokio-rustls",
"tokio-util",
"tower-service", "tower-service",
"url 2.4.1", "url 2.4.1",
"wasm-bindgen", "wasm-bindgen",
@ -2920,9 +2972,9 @@ dependencies = [
[[package]] [[package]]
name = "rustix" name = "rustix"
version = "0.38.21" version = "0.38.25"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b426b0506e5d50a7d8dafcf2e81471400deb602392c7dd110815afb4eaf02a3" checksum = "dc99bc2d4f1fed22595588a013687477aedf3cdcfb26558c559edb67b4d9b22e"
dependencies = [ dependencies = [
"bitflags 2.4.1", "bitflags 2.4.1",
"errno", "errno",

View File

@ -13,7 +13,7 @@ bench = false
path = "src/bin/websurfx.rs" path = "src/bin/websurfx.rs"
[dependencies] [dependencies]
reqwest = {version="0.11.22", default-features=false, features=["rustls-tls"]} reqwest = {version="0.11.22", default-features=false, features=["rustls-tls","brotli", "gzip"]}
tokio = {version="1.32.0",features=["rt-multi-thread","macros"], default-features = false} tokio = {version="1.32.0",features=["rt-multi-thread","macros"], default-features = false}
serde = {version="1.0.190", default-features=false, features=["derive"]} serde = {version="1.0.190", default-features=false, features=["derive"]}
serde_json = {version="1.0.108", default-features=false} serde_json = {version="1.0.108", default-features=false}

View File

@ -4,7 +4,7 @@
use std::collections::HashMap; use std::collections::HashMap;
use reqwest::header::HeaderMap; use reqwest::{header::HeaderMap, Client};
use scraper::Html; use scraper::Html;
use crate::models::aggregation_models::SearchResult; use crate::models::aggregation_models::SearchResult;
@ -42,7 +42,7 @@ impl SearchEngine for Brave {
query: &str, query: &str,
page: u32, page: u32,
user_agent: &str, user_agent: &str,
request_timeout: u8, client: &Client,
safe_search: u8, safe_search: u8,
) -> Result<HashMap<String, SearchResult>, EngineError> { ) -> Result<HashMap<String, SearchResult>, EngineError> {
let url = format!("https://search.brave.com/search?q={query}&offset={page}"); let url = format!("https://search.brave.com/search?q={query}&offset={page}");
@ -68,7 +68,7 @@ impl SearchEngine for Brave {
.change_context(EngineError::UnexpectedError)?; .change_context(EngineError::UnexpectedError)?;
let document: Html = Html::parse_document( let document: Html = Html::parse_document(
&Brave::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?, &Brave::fetch_html_from_upstream(self, &url, header_map, client).await?,
); );
if let Some(no_result_msg) = self.parser.parse_for_no_results(&document).nth(0) { if let Some(no_result_msg) = self.parser.parse_for_no_results(&document).nth(0) {

View File

@ -5,6 +5,7 @@
use std::collections::HashMap; use std::collections::HashMap;
use reqwest::header::HeaderMap; use reqwest::header::HeaderMap;
use reqwest::Client;
use scraper::Html; use scraper::Html;
use crate::models::aggregation_models::SearchResult; use crate::models::aggregation_models::SearchResult;
@ -44,7 +45,7 @@ impl SearchEngine for DuckDuckGo {
query: &str, query: &str,
page: u32, page: u32,
user_agent: &str, user_agent: &str,
request_timeout: u8, client: &Client,
_safe_search: u8, _safe_search: u8,
) -> Result<HashMap<String, SearchResult>, EngineError> { ) -> Result<HashMap<String, SearchResult>, EngineError> {
// Page number can be missing or empty string and so appropriate handling is required // Page number can be missing or empty string and so appropriate handling is required
@ -76,7 +77,7 @@ impl SearchEngine for DuckDuckGo {
.change_context(EngineError::UnexpectedError)?; .change_context(EngineError::UnexpectedError)?;
let document: Html = Html::parse_document( let document: Html = Html::parse_document(
&DuckDuckGo::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?, &DuckDuckGo::fetch_html_from_upstream(self, &url, header_map, client).await?,
); );
if self.parser.parse_for_no_results(&document).next().is_some() { if self.parser.parse_for_no_results(&document).next().is_some() {

View File

@ -3,6 +3,7 @@
//! number if provided. //! number if provided.
use reqwest::header::HeaderMap; use reqwest::header::HeaderMap;
use reqwest::Client;
use scraper::Html; use scraper::Html;
use std::collections::HashMap; use std::collections::HashMap;
@ -40,7 +41,7 @@ impl SearchEngine for Searx {
query: &str, query: &str,
page: u32, page: u32,
user_agent: &str, user_agent: &str,
request_timeout: u8, client: &Client,
mut safe_search: u8, mut safe_search: u8,
) -> Result<HashMap<String, SearchResult>, EngineError> { ) -> Result<HashMap<String, SearchResult>, EngineError> {
// Page number can be missing or empty string and so appropriate handling is required // Page number can be missing or empty string and so appropriate handling is required
@ -68,7 +69,7 @@ impl SearchEngine for Searx {
.change_context(EngineError::UnexpectedError)?; .change_context(EngineError::UnexpectedError)?;
let document: Html = Html::parse_document( let document: Html = Html::parse_document(
&Searx::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?, &Searx::fetch_html_from_upstream(self, &url, header_map, client).await?,
); );
if let Some(no_result_msg) = self.parser.parse_for_no_results(&document).nth(1) { if let Some(no_result_msg) = self.parser.parse_for_no_results(&document).nth(1) {

View File

@ -3,7 +3,8 @@
use super::aggregation_models::SearchResult; use super::aggregation_models::SearchResult;
use error_stack::{Report, Result, ResultExt}; use error_stack::{Report, Result, ResultExt};
use std::{collections::HashMap, fmt, time::Duration}; use reqwest::Client;
use std::{collections::HashMap, fmt};
/// A custom error type used to handle engine-associated errors. /// A custom error type used to handle engine-associated errors.
#[derive(Debug)] #[derive(Debug)]
@ -71,12 +72,11 @@ pub trait SearchEngine: Sync + Send {
&self, &self,
url: &str, url: &str,
header_map: reqwest::header::HeaderMap, header_map: reqwest::header::HeaderMap,
request_timeout: u8, client: &Client,
) -> Result<String, EngineError> { ) -> Result<String, EngineError> {
// fetch the html from upstream search engine // fetch the html from upstream search engine
Ok(reqwest::Client::new() Ok(client
.get(url) .get(url)
.timeout(Duration::from_secs(request_timeout as u64)) // Add timeout to request to avoid DDOSing the server
.headers(header_map) // add spoofed headers to emulate human behavior .headers(header_map) // add spoofed headers to emulate human behavior
.send() .send()
.await .await
@ -109,7 +109,7 @@ pub trait SearchEngine: Sync + Send {
query: &str, query: &str,
page: u32, page: u32,
user_agent: &str, user_agent: &str,
request_timeout: u8, client: &Client,
safe_search: u8, safe_search: u8,
) -> Result<HashMap<String, SearchResult>, EngineError>; ) -> Result<HashMap<String, SearchResult>, EngineError>;
} }

View File

@ -9,6 +9,7 @@ use crate::models::{
}; };
use error_stack::Report; use error_stack::Report;
use regex::Regex; use regex::Regex;
use reqwest::{Client, ClientBuilder};
use std::time::{SystemTime, UNIX_EPOCH}; use std::time::{SystemTime, UNIX_EPOCH};
use std::{ use std::{
collections::HashMap, collections::HashMap,
@ -18,6 +19,9 @@ use std::{
use std::{fs::File, io::BufRead}; use std::{fs::File, io::BufRead};
use tokio::task::JoinHandle; use tokio::task::JoinHandle;
/// Process-wide, lazily initialized HTTP client that is reused for upstream requests.
///
/// The cell starts out empty; `aggregate` fills it on first use through `get_or_init`
/// and hands the resulting `&Client` to each search engine. Because initialization
/// happens only once, the `request_timeout` passed to that first `aggregate` call is
/// the one baked into the client; values passed on later calls do not rebuild it.
static CLIENT: std::sync::OnceLock<Client> = std::sync::OnceLock::new();
/// Aliases for long type annotations /// Aliases for long type annotations
type FutureVec = Vec<JoinHandle<Result<HashMap<String, SearchResult>, Report<EngineError>>>>; type FutureVec = Vec<JoinHandle<Result<HashMap<String, SearchResult>, Report<EngineError>>>>;
@ -68,6 +72,16 @@ pub async fn aggregate(
request_timeout: u8, request_timeout: u8,
safe_search: u8, safe_search: u8,
) -> Result<SearchResults, Box<dyn std::error::Error>> { ) -> Result<SearchResults, Box<dyn std::error::Error>> {
let client = CLIENT.get_or_init(|| {
ClientBuilder::new()
.timeout(Duration::from_secs(request_timeout as u64)) // Add timeout to request to avoid DDOSing the server
.https_only(true)
.gzip(true)
.brotli(true)
.build()
.unwrap()
});
let user_agent: &str = random_user_agent(); let user_agent: &str = random_user_agent();
// Add a random delay before making the request. // Add a random delay before making the request.
@ -88,7 +102,7 @@ pub async fn aggregate(
let query: String = query.to_owned(); let query: String = query.to_owned();
tasks.push(tokio::spawn(async move { tasks.push(tokio::spawn(async move {
search_engine search_engine
.results(&query, page, user_agent, request_timeout, safe_search) .results(&query, page, user_agent, client, safe_search)
.await .await
})); }));
} }