mirror of https://github.com/neon-mmd/websurfx.git synced 2024-10-18 06:22:53 -04:00

add code to evade ip blocking, improve pagination code and fix documentation

neon_arch 2023-05-02 11:58:21 +03:00
parent f8c3c8dcbe
commit c170de8194
14 changed files with 264 additions and 61 deletions

Cargo.lock (generated, 39 changes)

@@ -447,6 +447,16 @@ dependencies = [
  "bitflags",
 ]
 
+[[package]]
+name = "combine"
+version = "4.6.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "35ed6e9d84f0b51a7f52daf1c7d71dd136fd7a3f41a8462b8cdb8c78d920fad4"
+dependencies = [
+ "bytes 1.4.0",
+ "memchr",
+]
+
 [[package]]
 name = "convert_case"
 version = "0.4.0"
@@ -1427,6 +1437,12 @@ version = "2.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00"
 
+[[package]]
+name = "md5"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
+
 [[package]]
 name = "memchr"
 version = "2.5.0"
@@ -2157,6 +2173,20 @@ dependencies = [
  "rand_core 0.3.1",
 ]
 
+[[package]]
+name = "redis"
+version = "0.23.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3ea8c51b5dc1d8e5fd3350ec8167f464ec0995e79f2e90a075b63371500d557f"
+dependencies = [
+ "combine",
+ "itoa 1.0.6",
+ "percent-encoding 2.2.0",
+ "ryu",
+ "sha1_smol",
+ "url 2.3.1",
+]
+
 [[package]]
 name = "redox_syscall"
 version = "0.1.57"
@@ -2526,6 +2556,12 @@ dependencies = [
  "digest",
 ]
 
+[[package]]
+name = "sha1_smol"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ae1a47186c03a32177042e55dbc5fd5aee900b8e0069a8d70fba96a9375cd012"
+
 [[package]]
 name = "sha2"
 version = "0.10.6"
@@ -3291,6 +3327,9 @@ dependencies = [
  "fake-useragent",
  "handlebars",
  "log",
+ "md5",
+ "rand 0.6.5",
+ "redis",
  "reqwest 0.11.17",
  "rlua",
  "scraper",

Cargo.toml

@@ -15,6 +15,9 @@ actix-web = {version="4.3.1"}
 actix-files = {version="0.6.2"}
 serde_json = {version="*"}
 fake-useragent = {version="*"}
-env_logger = "0.10.0"
-log = "0.4.17"
+env_logger = {version="0.10.0"}
+log = {version="0.4.17"}
 rlua = {version="*"}
+redis = {version="*"}
+md5 = {version="*"}
+rand={version="*"}

src/cache/cacher.rs (new file, 78 lines)

@@ -0,0 +1,78 @@
//! This module provides the functionality to cache the aggregated results fetched and aggregated
//! from the upstream search engines in a json format.

use md5::compute;
use redis::{Client, Commands, Connection};

/// A named struct which stores the redis Connection url address to which the client will
/// connect to.
///
/// # Fields
///
/// * `redis_connection_url` - It stores the redis Connection url address.
#[derive(Clone)]
pub struct RedisCache {
    redis_connection_url: String,
}

impl RedisCache {
    /// Constructs a new `RedisCache` with the given arguments needed for the struct.
    ///
    /// # Arguments
    ///
    /// * `redis_connection_url` - It stores the redis Connection url address.
    pub fn new(redis_connection_url: String) -> Self {
        RedisCache {
            redis_connection_url,
        }
    }

    /// A helper function which computes the hash of the url and formats and returns it as a string.
    ///
    /// # Arguments
    ///
    /// * `url` - It takes a url as a string.
    fn compute_url_hash(self, url: &str) -> String {
        format!("{:?}", compute(url))
    }

    /// A function which fetches the cached json results as a json string from the redis server.
    ///
    /// # Arguments
    ///
    /// * `url` - It takes a url as a string.
    pub fn cached_results_json(self, url: String) -> Result<String, Box<dyn std::error::Error>> {
        let hashed_url_string = self.clone().compute_url_hash(&url);
        let mut redis_connection: Connection =
            Client::open(self.redis_connection_url)?.get_connection()?;
        Ok(redis_connection.get(hashed_url_string)?)
    }

    /// A function which caches the results by using the hashed `url` as the key and
    /// `json results` as the value and stores it in the redis server with ttl(time to live)
    /// set to 60 seconds.
    ///
    /// # Arguments
    ///
    /// * `json_results` - It takes the json results string as an argument.
    /// * `url` - It takes the url as a String.
    pub fn cache_results(
        self,
        json_results: String,
        url: String,
    ) -> Result<(), Box<dyn std::error::Error>> {
        let hashed_url_string = self.clone().compute_url_hash(&url);
        let mut redis_connection: Connection =
            Client::open(self.redis_connection_url)?.get_connection()?;

        // put results_json into cache
        redis_connection.set(hashed_url_string.clone(), json_results)?;

        // Set the TTL for the key to 60 seconds
        redis_connection
            .expire::<String, u32>(hashed_url_string.clone(), 60)
            .unwrap();

        Ok(())
    }
}
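
For context, a minimal usage sketch of the type above (the `main` wrapper and the example url/json values are illustrative, not part of the commit; it assumes a redis server is reachable at the configured address):

// Illustrative usage of the RedisCache type added above; assumes a
// running redis server at the configured address.
use websurfx::cache::cacher::RedisCache;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let cache = RedisCache::new("redis://127.0.0.1:8082".to_string());

    let url = "http://127.0.0.1:8080/search?q=rust&page=1".to_string();
    let json = r#"{"results":[],"pageQuery":"rust"}"#.to_string();

    // Stores the value under md5(url) and sets a 60 second TTL.
    cache.clone().cache_results(json, url.clone())?;

    // Within those 60 seconds the same url is a cache hit.
    let cached = cache.cached_results_json(url)?;
    println!("cached: {cached}");
    Ok(())
}

Because every method takes `self` by value, callers clone the struct (cheap, it only holds the url) before each call, which is also what the route handler below does.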

src/cache/mod.rs (new file, 1 line)

@@ -0,0 +1 @@
pub mod cacher;

src/config_parser/parser.rs

@@ -11,11 +11,15 @@ use std::fs;
 //
 /// * `port` - It stores the parsed port number option on which the server should launch.
 /// * `binding_ip_addr` - It stores the parsed ip address option on which the server should launch
+/// * `style` - It stores the theming options for the website.
+/// * `redis_connection_url` - It stores the redis connection url address on which the redis
+/// client should connect.
 #[derive(Clone)]
 pub struct Config {
     pub port: u16,
     pub binding_ip_addr: String,
     pub style: Style,
+    pub redis_connection_url: String,
 }
 
 impl Config {
@@ -44,6 +48,7 @@ impl Config {
                 globals.get::<_, String>("theme")?,
                 globals.get::<_, String>("colorscheme")?,
             ),
+            redis_connection_url: globals.get::<_, String>("redis_connection_url")?,
         })
     })
 }

src/config_parser/parser_models.rs

@@ -1,21 +1,24 @@
 //! This module provides public models for handling, storing and serializing parsed config file
 //! options from config.lua by grouping them together.
 
-use serde::Serialize;
+use serde::{Deserialize, Serialize};
 
-/// A named struct which stores, serializes and groups the parsed config file options of theme and
-/// colorscheme names into the Style struct which derives the `Clone` and `Serialize` traits
-/// where the `Clone` trait is derived for allowing the struct to be cloned and passed to the
-/// server as a shared data between all routes except `/robots.txt` and the `Serialize` trait
-/// has been derived for allowing the object to be serialized so that it can be passed to
-/// handlebars template files.
+/// A named struct which stores, deserializes, serializes and groups the parsed config file options
+/// of theme and colorscheme names into the Style struct which derives the `Clone`, `Serialize`
+/// and `Deserialize` traits where the `Clone` trait is derived for allowing the struct to be
+/// cloned and passed to the server as a shared data between all routes except `/robots.txt` and
+/// the `Serialize` trait has been derived for allowing the object to be serialized so that it
+/// can be passed to handlebars template files and the `Deserialize` trait has been derived in
+/// order to allow deserializing the json back to a struct in the aggregate function in
+/// aggregator.rs and create a new struct out of it and then serialize it back to json and pass
+/// it to the template files.
 ///
 /// # Fields
 //
 /// * `theme` - It stores the parsed theme option used to set a theme for the website.
 /// * `colorscheme` - It stores the parsed colorscheme option used to set a colorscheme for the
 /// theme being used.
-#[derive(Serialize, Clone)]
+#[derive(Serialize, Deserialize, Clone)]
 pub struct Style {
     pub theme: String,
     pub colorscheme: String,

src/engines/duckduckgo.rs

@@ -2,9 +2,10 @@
 //! by querying the upstream duckduckgo search engine with user provided query and with a page
 //! number if provided.
 
-use std::collections::HashMap;
+use std::{collections::HashMap, time::Duration};
 
-use reqwest::header::USER_AGENT;
+use rand::Rng;
+use reqwest::header::{HeaderMap, CONTENT_TYPE, REFERER, USER_AGENT};
 use scraper::{Html, Selector};
 
 use crate::search_results_handler::aggregation_models::RawSearchResult;
@@ -17,7 +18,7 @@ use crate::search_results_handler::aggregation_models::RawSearchResult;
 /// # Arguments
 ///
 /// * `query` - Takes the user provided query to query to the upstream search engine with.
-/// * `page` - Takes an Option<u32> as argument which can be either None or a valid page number.
+/// * `page` - Takes a u32 as an argument.
 /// * `user_agent` - Takes a random user agent string as an argument.
 ///
 /// # Errors
@@ -27,32 +28,41 @@ use crate::search_results_handler::aggregation_models::RawSearchResult;
 /// selector fails to initialize"
 pub async fn results(
     query: &str,
-    page: Option<u32>,
+    page: u32,
     user_agent: &str,
 ) -> Result<HashMap<String, RawSearchResult>, Box<dyn std::error::Error>> {
     // Page number can be missing or empty string and so appropriate handling is required
     // so that upstream server receives valid page number.
     let url: String = match page {
-        Some(page_number) => {
-            if page_number <= 1 {
-                format!("https://html.duckduckgo.com/html/?q={query}&s=&dc=&v=1&o=json&api=/d.js")
-            } else {
-                format!(
-                    "https://duckduckgo.com/html/?q={}&s={}&dc={}&v=1&o=json&api=/d.js",
-                    query,
-                    page_number / 2 * 30,
-                    page_number / 2 * 30 + 1
-                )
-            }
-        }
-        None => format!("https://html.duckduckgo.com/html/?q={query}&s=&dc=&v=1&o=json&api=/d.js"),
+        1 => {
+            format!("https://html.duckduckgo.com/html/?q={query}&s=&dc=&v=1&o=json&api=/d.js")
+        }
+        _ => {
+            format!(
+                "https://duckduckgo.com/html/?q={}&s={}&dc={}&v=1&o=json&api=/d.js",
+                query,
+                (page / 2 + (page % 2)) * 30,
+                (page / 2 + (page % 2)) * 30 + 1
+            )
+        }
     };
 
+    // Add a random delay before making the request.
+    let mut rng = rand::thread_rng();
+    let delay_secs = rng.gen_range(1, 10);
+    std::thread::sleep(Duration::from_secs(delay_secs));
+
+    // initializing HeaderMap and adding appropriate headers.
+    let mut header_map = HeaderMap::new();
+    header_map.insert(USER_AGENT, user_agent.parse()?);
+    header_map.insert(REFERER, "https://google.com/".parse()?);
+    header_map.insert(CONTENT_TYPE, "text/html; charset=UTF-8".parse()?);
+
     // fetch the html from upstream duckduckgo engine
     // TODO: Write better error handling code to handle no results case.
     let results: String = reqwest::Client::new()
         .get(url)
-        .header(USER_AGENT, user_agent)
+        .headers(header_map) // add spoofed headers to emulate human behaviour
         .send()
         .await?
         .text()

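The pagination change above is the interesting part: DuckDuckGo's html endpoint paginates by result offset `s`, not page number, and the new expression `(page / 2 + (page % 2)) * 30` is integer-math shorthand for `ceil(page / 2) * 30`. A standalone sketch of the mapping (illustrative only, not part of the commit):

// Illustrative check of the offset formula used in the diff above.
fn ddg_offset(page: u32) -> u32 {
    (page / 2 + (page % 2)) * 30 // == ceil(page / 2) * 30
}

fn main() {
    assert_eq!(ddg_offset(2), 30);
    assert_eq!(ddg_offset(3), 60);
    assert_eq!(ddg_offset(4), 60); // pages 3 and 4 land on the same offset
    assert_eq!(ddg_offset(5), 90);
}
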
src/engines/searx.rs

@@ -2,10 +2,10 @@
 //! by querying the upstream searx search engine instance with user provided query and with a page
 //! number if provided.
 
-use std::collections::HashMap;
-
-use reqwest::header::USER_AGENT;
+use rand::Rng;
+use reqwest::header::{HeaderMap, CONTENT_TYPE, REFERER, USER_AGENT};
 use scraper::{Html, Selector};
+use std::{collections::HashMap, time::Duration};
 
 use crate::search_results_handler::aggregation_models::RawSearchResult;
@@ -17,7 +17,7 @@ use crate::search_results_handler::aggregation_models::RawSearchResult;
 /// # Arguments
 ///
 /// * `query` - Takes the user provided query to query to the upstream search engine with.
-/// * `page` - Takes an Option<u32> as argument which can be either None or a valid page number.
+/// * `page` - Takes a u32 as an argument.
 /// * `user_agent` - Takes a random user agent string as an argument.
 ///
 /// # Errors
@@ -27,27 +27,29 @@ use crate::search_results_handler::aggregation_models::RawSearchResult;
 /// selector fails to initialize"
 pub async fn results(
     query: &str,
-    page: Option<u32>,
+    page: u32,
     user_agent: &str,
 ) -> Result<HashMap<String, RawSearchResult>, Box<dyn std::error::Error>> {
     // Page number can be missing or empty string and so appropriate handling is required
     // so that upstream server receives valid page number.
-    let url: String = match page {
-        Some(page_number) => {
-            if page_number <= 1 {
-                format!("https://searx.work/search?q={query}")
-            } else {
-                format!("https://searx.work/search?q={query}&pageno={page_number}",)
-            }
-        }
-        None => format!("https://searx.work/search?q={query}"),
-    };
+    let url: String = format!("https://searx.work/search?q={query}&pageno={page}");
+
+    // Add random delay before making the request.
+    let mut rng = rand::thread_rng();
+    let delay_secs = rng.gen_range(1, 10);
+    std::thread::sleep(Duration::from_secs(delay_secs));
+
+    // initializing headers and adding appropriate headers.
+    let mut header_map = HeaderMap::new();
+    header_map.insert(USER_AGENT, user_agent.parse()?);
+    header_map.insert(REFERER, "https://google.com/".parse()?);
+    header_map.insert(CONTENT_TYPE, "application/x-www-form-urlencoded".parse()?);
 
     // fetch the html from upstream searx instance engine
     // TODO: Write better error handling code to handle no results case.
     let results: String = reqwest::Client::new()
         .get(url)
-        .header(USER_AGENT, user_agent)
+        .headers(header_map) // add spoofed headers to emulate human behaviours.
         .send()
         .await?
         .text()

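Both engines now share the same evasion pattern: sleep a random number of seconds, then send browser-like headers instead of a bare user agent. Note that with rand 0.6 (the version pinned in Cargo.lock) `gen_range` takes two arguments and the upper bound is exclusive, so the delay is 1 to 9 whole seconds. In isolation the throttle looks like this (sketch only):

// Sketch of the shared request throttle; rand 0.6 API.
use rand::Rng;
use std::time::Duration;

fn random_delay() {
    let mut rng = rand::thread_rng();
    // gen_range(1, 10) samples from the half-open range [1, 10).
    let delay_secs = rng.gen_range(1, 10);
    std::thread::sleep(Duration::from_secs(delay_secs));
}

One caveat: `std::thread::sleep` inside an async fn blocks the whole actix worker thread for the duration, so concurrent requests on that worker stall too; an async timer would avoid that.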
src/lib.rs

@@ -1,6 +1,7 @@
 //! This main library module provides the functionality to provide and handle the Tcp server
 //! and register all the routes for the `websurfx` meta search engine website.
 
+pub mod cache;
 pub mod config_parser;
 pub mod engines;
 pub mod search_results_handler;

src/search_results_handler/aggregation_models.rs

@@ -1,12 +1,12 @@
 //! This module provides public models for handling, storing and serializing of search results
 //! data scraped from the upstream search engines.
 
-use serde::Serialize;
+use serde::{Deserialize, Serialize};
 
 use crate::config_parser::parser_models::Style;
 
-/// A named struct to store and serialize the individual search result from all the scraped
-/// and aggregated search results from the upstream search engines.
+/// A named struct to store, serialize and deserialize the individual search result from all the
+/// scraped and aggregated search results from the upstream search engines.
 ///
 /// # Fields
 ///
@@ -16,7 +16,7 @@ use crate::config_parser::parser_models::Style;
 /// * `url` - The url to be displayed below the search result title in html.
 /// * `description` - The description of the search result.
 /// * `engine` - The names of the upstream engines from which these results were provided.
-#[derive(Debug, Serialize)]
+#[derive(Debug, Serialize, Deserialize)]
 #[serde(rename_all = "camelCase")]
 pub struct SearchResult {
     pub title: String,
@@ -116,15 +116,15 @@ impl RawSearchResult {
     }
 }
 
-/// A named struct to store and serialize all the search results scraped and aggregated
-/// from the upstream search engines.
+/// A named struct to store, serialize and deserialize all the search results scraped and
+/// aggregated from the upstream search engines.
 ///
 /// # Fields
 ///
 /// * `results` - Stores the individual serializable `SearchResult` struct into a vector of
 /// `SearchResult` structs.
 /// * `page_query` - Stores the current pages search query `q` provided in the search url.
-#[derive(Serialize)]
+#[derive(Serialize, Deserialize)]
 #[serde(rename_all = "camelCase")]
 pub struct SearchResults {
     pub results: Vec<SearchResult>,

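The point of deriving `Deserialize` here is the cache round trip: results are serialized to a json string before being stored in redis, and parsed back into `SearchResults` on a cache hit. A minimal sketch of that round trip (assuming the module paths exported from lib.rs; the helper name is hypothetical):

// Round-trip sketch: why Deserialize was added to these models.
use websurfx::search_results_handler::aggregation_models::SearchResults;

fn roundtrip(results: &SearchResults) -> Result<SearchResults, serde_json::Error> {
    let json = serde_json::to_string(results)?; // what gets cached
    serde_json::from_str(&json) // what a cache hit parses
}
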
src/search_results_handler/aggregator.rs

@@ -25,7 +25,7 @@ use crate::engines::{duckduckgo, searx};
 /// # Arguments
 ///
 /// * `query` - Accepts a string to query with the above upstream search engines.
-/// * `page` - Accepts an Option<u32> which could either be a None or a valid page number.
+/// * `page` - Accepts a u32 page number.
 ///
 /// # Error
 ///
@@ -34,7 +34,7 @@ use crate::engines::{duckduckgo, searx};
 /// containing appropriate values.
 pub async fn aggregate(
     query: &str,
-    page: Option<u32>,
+    page: u32,
 ) -> Result<SearchResults, Box<dyn std::error::Error>> {
     let user_agent: String = random_user_agent();
     let mut result_map: HashMap<String, RawSearchResult> = HashMap::new();

src/server/routes.rs

@@ -4,7 +4,11 @@
 
 use std::fs::read_to_string;
 
-use crate::{config_parser::parser::Config, search_results_handler::aggregator::aggregate};
+use crate::{
+    cache::cacher::RedisCache,
+    config_parser::parser::Config,
+    search_results_handler::{aggregation_models::SearchResults, aggregator::aggregate},
+};
 use actix_web::{get, web, HttpRequest, HttpResponse};
 use handlebars::Handlebars;
 use serde::Deserialize;
@@ -67,6 +71,9 @@ pub async fn search(
     config: web::Data<Config>,
 ) -> Result<HttpResponse, Box<dyn std::error::Error>> {
     let params = web::Query::<SearchParams>::from_query(req.query_string())?;
+
+    // Initialize redis cache connection struct
+    let redis_cache = RedisCache::new(config.redis_connection_url.clone());
     match &params.q {
         Some(query) => {
             if query.trim().is_empty() {
@@ -74,11 +81,63 @@ pub async fn search(
                     .insert_header(("location", "/"))
                     .finish())
             } else {
-                let mut results_json: crate::search_results_handler::aggregation_models::SearchResults =
-                    aggregate(query, params.page).await?;
-                results_json.add_style(config.style.clone());
-                let page_content: String = hbs.render("search", &results_json)?;
-                Ok(HttpResponse::Ok().body(page_content))
+                // Initialize the page url as an empty string
+                let mut page_url = String::new();
+
+                // Find whether the page is valid page number if not then return
+                // the first page number and also construct the page_url accordingly
+                let page = match params.page {
+                    Some(page_number) => {
+                        if page_number <= 1 {
+                            page_url = format!(
+                                "http://{}:{}/search?q={}&page={}",
+                                config.binding_ip_addr, config.port, query, 1
+                            );
+                            1
+                        } else {
+                            page_url = format!(
+                                "http://{}:{}/search?q={}&page={}",
+                                config.binding_ip_addr, config.port, query, page_number
+                            );
+                            page_number
+                        }
+                    }
+                    None => {
+                        page_url = format!(
+                            "http://{}:{}{}&page={}",
+                            config.binding_ip_addr,
+                            config.port,
+                            req.uri(),
+                            1
+                        );
+                        1
+                    }
+                };
+
+                // fetch the cached results json.
+                let cached_results_json = redis_cache.clone().cached_results_json(page_url.clone());
+
+                // check if fetched results was indeed fetched or it was an error and if so
+                // handle the data accordingly.
+                match cached_results_json {
+                    Ok(results_json) => {
+                        let new_results_json: SearchResults = serde_json::from_str(&results_json)?;
+                        let page_content: String = hbs.render("search", &new_results_json)?;
+                        Ok(HttpResponse::Ok().body(page_content))
+                    }
+                    Err(_) => {
+                        let mut results_json: crate::search_results_handler::aggregation_models::SearchResults =
+                            aggregate(query, page).await?;
+                        results_json.add_style(config.style.clone());
+                        redis_cache.clone().cache_results(
+                            serde_json::to_string(&results_json)?,
+                            page_url.clone(),
+                        )?;
+                        let page_content: String = hbs.render("search", &results_json)?;
+                        Ok(HttpResponse::Ok().body(page_content))
+                    }
+                }
             }
         }
         None => Ok(HttpResponse::Found()
@@ -115,6 +174,3 @@ pub async fn settings(
     let page_content: String = hbs.render("settings", &config.style)?;
     Ok(HttpResponse::Ok().body(page_content))
 }
-
-// TODO: Write tests for testing parameters for search function that if provided with something
-// other than u32 like alphabets and special characters then it should panic
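Worth noting in the route above: the cache key is the fully qualified page url, and any missing, zero or one page parameter is normalized to `page=1`, so for a plain `?q=` query string the requests `?q=rust`, `?q=rust&page=0` and `?q=rust&page=1` all resolve to a single cache entry. Condensed as a sketch (the helper names are hypothetical; the route inlines this logic):

// Hypothetical helpers condensing the inline normalization above.
fn normalize_page(page: Option<u32>) -> u32 {
    match page {
        Some(n) if n > 1 => n,
        _ => 1, // missing, 0 and 1 all mean the first page
    }
}

fn cache_key(host: &str, port: u16, query: &str, page: Option<u32>) -> String {
    format!(
        "http://{}:{}/search?q={}&page={}",
        host, port, query, normalize_page(page)
    )
}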

tests/index.rs

@@ -41,3 +41,5 @@ async fn test_index() {
     assert_eq!(res.text().await.unwrap(), template);
 }
 
+// TODO: Write tests for testing parameters for search function that if provided with something
+// other than u32 like alphabets and special characters then it should panic

websurfx/config.lua

@@ -16,3 +16,6 @@ binding_ip_addr = "127.0.0.1" --ip address on the which server should be launched
 -- }}
 colorscheme = "catppuccin-mocha" -- the colorscheme name which should be used for the website theme
 theme = "simple" -- the theme name which should be used for the website
+
+-- Caching
+redis_connection_url = "redis://127.0.0.1:8082" -- redis connection url address on which the client should connect on.
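
The value follows the standard redis URL scheme that the redis crate parses, `redis://[:<password>@]<host>:<port>[/<db>]`; for example, an authenticated instance on the default redis port would be configured like this (values illustrative only):

-- Illustrative only: password-protected redis on the default port 6379.
redis_connection_url = "redis://:hunter2@127.0.0.1:6379/0"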