From c170de8194a948348394997791b65b55b96f22a1 Mon Sep 17 00:00:00 2001
From: neon_arch
Date: Tue, 2 May 2023 11:58:21 +0300
Subject: [PATCH] add code to evade ip blocking, improve pagination code and
 fix documentation

---
 Cargo.lock                               | 39 ++++++++++
 Cargo.toml                               |  7 +-
 src/cache/cacher.rs                      | 78 +++++++++++++++++++
 src/cache/mod.rs                         |  1 +
 src/config_parser/parser.rs              |  5 ++
 src/config_parser/parser_models.rs       | 19 +++--
 src/engines/duckduckgo.rs                | 44 +++++++----
 src/engines/searx.rs                     | 34 ++++----
 src/lib.rs                               |  1 +
 .../aggregation_models.rs                | 14 ++--
 src/search_results_handler/aggregator.rs |  4 +-
 src/server/routes.rs                     | 74 +++++++++++++++---
 tests/index.rs                           |  2 +
 websurfx/config.lua                      |  3 +
 14 files changed, 264 insertions(+), 61 deletions(-)
 create mode 100644 src/cache/cacher.rs
 create mode 100644 src/cache/mod.rs

diff --git a/Cargo.lock b/Cargo.lock
index 8323e12..3a68b6a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -447,6 +447,16 @@ dependencies = [
  "bitflags",
 ]
 
+[[package]]
+name = "combine"
+version = "4.6.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "35ed6e9d84f0b51a7f52daf1c7d71dd136fd7a3f41a8462b8cdb8c78d920fad4"
+dependencies = [
+ "bytes 1.4.0",
+ "memchr",
+]
+
 [[package]]
 name = "convert_case"
 version = "0.4.0"
@@ -1427,6 +1437,12 @@ version = "2.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00"
 
+[[package]]
+name = "md5"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
+
 [[package]]
 name = "memchr"
 version = "2.5.0"
@@ -2157,6 +2173,20 @@ dependencies = [
  "rand_core 0.3.1",
 ]
 
+[[package]]
+name = "redis"
+version = "0.23.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3ea8c51b5dc1d8e5fd3350ec8167f464ec0995e79f2e90a075b63371500d557f"
+dependencies = [
+ "combine",
+ "itoa 1.0.6",
+ "percent-encoding 2.2.0",
+ "ryu",
+ "sha1_smol",
+ "url 2.3.1",
+]
+
 [[package]]
 name = "redox_syscall"
 version = "0.1.57"
@@ -2526,6 +2556,12 @@ dependencies = [
  "digest",
 ]
 
+[[package]]
+name = "sha1_smol"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ae1a47186c03a32177042e55dbc5fd5aee900b8e0069a8d70fba96a9375cd012"
+
 [[package]]
 name = "sha2"
 version = "0.10.6"
@@ -3291,6 +3327,9 @@ dependencies = [
  "fake-useragent",
  "handlebars",
  "log",
+ "md5",
+ "rand 0.6.5",
+ "redis",
  "reqwest 0.11.17",
  "rlua",
  "scraper",
diff --git a/Cargo.toml b/Cargo.toml
index 3f38026..6fcb28f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -15,6 +15,9 @@ actix-web = {version="4.3.1"}
 actix-files = {version="0.6.2"}
 serde_json = {version="*"}
 fake-useragent = {version="*"}
-env_logger = "0.10.0"
-log = "0.4.17"
+env_logger = {version="0.10.0"}
+log = {version="0.4.17"}
 rlua = {version="*"}
+redis = {version="*"}
+md5 = {version="*"}
+rand={version="*"}
diff --git a/src/cache/cacher.rs b/src/cache/cacher.rs
new file mode 100644
index 0000000..54d9a48
--- /dev/null
+++ b/src/cache/cacher.rs
@@ -0,0 +1,78 @@
+//! This module provides the functionality to cache the aggregated results fetched and aggregated
+//! from the upstream search engines in a json format.
+
+use md5::compute;
+use redis::{Client, Commands, Connection};
+
+/// A named struct which stores the redis Connection url address to which the client will
+/// connect to.
+///
+/// # Fields
+///
+/// * `redis_connection_url` - It stores the redis Connection url address.
+#[derive(Clone)]
+pub struct RedisCache {
+    redis_connection_url: String,
+}
+
+impl RedisCache {
+    /// Constructs a new `SearchResult` with the given arguments needed for the struct.
+    ///
+    /// # Arguments
+    ///
+    /// * `redis_connection_url` - It stores the redis Connection url address.
+    pub fn new(redis_connection_url: String) -> Self {
+        RedisCache {
+            redis_connection_url,
+        }
+    }
+
+    /// A helper function which computes the hash of the url and formats and returns it as string.
+    ///
+    /// # Arguments
+    ///
+    /// * `url` - It takes an url as string.
+    fn compute_url_hash(self, url: &str) -> String {
+        format!("{:?}", compute(url))
+    }
+
+    /// A function which fetches the cached json results as json string from the redis server.
+    ///
+    /// # Arguments
+    ///
+    /// * `url` - It takes an url as a string.
+    pub fn cached_results_json(self, url: String) -> Result<String, Box<dyn std::error::Error>> {
+        let hashed_url_string = self.clone().compute_url_hash(&url);
+        let mut redis_connection: Connection =
+            Client::open(self.redis_connection_url)?.get_connection()?;
+        Ok(redis_connection.get(hashed_url_string)?)
+    }
+
+    /// A function which caches the results by using the hashed `url` as the key and
+    /// `json results` as the value and stores it in redis server with ttl(time to live)
+    /// set to 60 seconds.
+    ///
+    /// # Arguments
+    ///
+    /// * `json_results` - It takes the json results string as an argument.
+    /// * `url` - It takes the url as a String.
+    pub fn cache_results(
+        self,
+        json_results: String,
+        url: String,
+    ) -> Result<(), Box<dyn std::error::Error>> {
+        let hashed_url_string = self.clone().compute_url_hash(&url);
+        let mut redis_connection: Connection =
+            Client::open(self.redis_connection_url)?.get_connection()?;
+
+        // put results_json into cache
+        redis_connection.set(hashed_url_string.clone(), json_results)?;
+
+        // Set the TTL for the key to 60 seconds
+        redis_connection
+            .expire::<String, u32>(hashed_url_string.clone(), 60)
+            .unwrap();
+
+        Ok(())
+    }
+}
diff --git a/src/cache/mod.rs b/src/cache/mod.rs
new file mode 100644
index 0000000..91a91ca
--- /dev/null
+++ b/src/cache/mod.rs
@@ -0,0 +1 @@
+pub mod cacher;
diff --git a/src/config_parser/parser.rs b/src/config_parser/parser.rs
index 226a760..4625bd8 100644
--- a/src/config_parser/parser.rs
+++ b/src/config_parser/parser.rs
@@ -11,11 +11,15 @@ use std::fs;
 //
 /// * `port` - It stores the parsed port number option on which the server should launch.
 /// * `binding_ip_addr` - It stores the parsed ip address option on which the server should launch
+/// * `style` - It stores the theming options for the website.
+/// * `redis_connection_url` - It stores the redis connection url address on which the redis
+/// client should connect.
 #[derive(Clone)]
 pub struct Config {
     pub port: u16,
     pub binding_ip_addr: String,
     pub style: Style,
+    pub redis_connection_url: String,
 }
 
 impl Config {
@@ -44,6 +48,7 @@ impl Config {
                     globals.get::<_, String>("theme")?,
                     globals.get::<_, String>("colorscheme")?,
                 ),
+                redis_connection_url: globals.get::<_, String>("redis_connection_url")?,
             })
         })
     }
diff --git a/src/config_parser/parser_models.rs b/src/config_parser/parser_models.rs
index f27e085..42baf0d 100644
--- a/src/config_parser/parser_models.rs
+++ b/src/config_parser/parser_models.rs
@@ -1,21 +1,24 @@
 //! This module provides public models for handling, storing and serializing parsed config file
 //! options from config.lua by grouping them togather.
 
-use serde::Serialize;
+use serde::{Deserialize, Serialize};
 
-/// A named struct which stores, serializes and groups the parsed config file options of theme and
-/// colorscheme names into the Style struct which derives the `Clone` and `Serialize` traits
-/// where the `Clone` trait is derived for allowing the struct to be cloned and passed to the
-/// server as a shared data between all routes except `/robots.txt` and the `Serialize` trait
-/// has been derived for allowing the object to be serialized so that it can be passed to
-/// handlebars template files.
+/// A named struct which stores,deserializes, serializes and groups the parsed config file options
+/// of theme and colorscheme names into the Style struct which derives the `Clone`, `Serialize`
+/// and Deserialize traits where the `Clone` trait is derived for allowing the struct to be
+/// cloned and passed to the server as a shared data between all routes except `/robots.txt` and
+/// the `Serialize` trait has been derived for allowing the object to be serialized so that it
+/// can be passed to handlebars template files and the `Deserialize` trait has been derived in
+/// order to allow the deserializing the json back to struct in aggregate function in
+/// aggregator.rs and create a new struct out of it and then serialize it back to json and pass
+/// it to the template files.
 ///
 /// # Fields
 //
 /// * `theme` - It stores the parsed theme option used to set a theme for the website.
 /// * `colorscheme` - It stores the parsed colorscheme option used to set a colorscheme for the
 /// theme being used.
-#[derive(Serialize, Clone)]
+#[derive(Serialize, Deserialize, Clone)]
 pub struct Style {
     pub theme: String,
     pub colorscheme: String,
diff --git a/src/engines/duckduckgo.rs b/src/engines/duckduckgo.rs
index 6f227d6..d1c5bd0 100644
--- a/src/engines/duckduckgo.rs
+++ b/src/engines/duckduckgo.rs
@@ -2,9 +2,10 @@
 //! by querying the upstream duckduckgo search engine with user provided query and with a page
 //! number if provided.
 
-use std::collections::HashMap;
+use std::{collections::HashMap, time::Duration};
 
-use reqwest::header::USER_AGENT;
+use rand::Rng;
+use reqwest::header::{HeaderMap, CONTENT_TYPE, REFERER, USER_AGENT};
 use scraper::{Html, Selector};
 
 use crate::search_results_handler::aggregation_models::RawSearchResult;
@@ -17,7 +18,7 @@ use crate::search_results_handler::aggregation_models::RawSearchResult;
 /// # Arguments
 ///
 /// * `query` - Takes the user provided query to query to the upstream search engine with.
-/// * `page` - Takes an Option as argument which can be either None or a valid page number.
+/// * `page` - Takes an u32 as an argument.
 /// * `user_agent` - Takes a random user agent string as an argument.
 ///
 /// # Errors
@@ -27,32 +28,41 @@
 /// selector fails to initialize"
 pub async fn results(
     query: &str,
-    page: Option<u32>,
+    page: u32,
     user_agent: &str,
 ) -> Result<HashMap<String, RawSearchResult>, Box<dyn std::error::Error>> {
     // Page number can be missing or empty string and so appropriate handling is required
     // so that upstream server recieves valid page number.
     let url: String = match page {
-        Some(page_number) => {
-            if page_number <= 1 {
-                format!("https://html.duckduckgo.com/html/?q={query}&s=&dc=&v=1&o=json&api=/d.js")
-            } else {
-                format!(
-                    "https://duckduckgo.com/html/?q={}&s={}&dc={}&v=1&o=json&api=/d.js",
-                    query,
-                    page_number / 2 * 30,
-                    page_number / 2 * 30 + 1
-                )
-            }
+        1 => {
+            format!("https://html.duckduckgo.com/html/?q={query}&s=&dc=&v=1&o=json&api=/d.js")
+        }
+        _ => {
+            format!(
+                "https://duckduckgo.com/html/?q={}&s={}&dc={}&v=1&o=json&api=/d.js",
+                query,
+                (page / 2 + (page % 2)) * 30,
+                (page / 2 + (page % 2)) * 30 + 1
+            )
+        }
-        None => format!("https://html.duckduckgo.com/html/?q={query}&s=&dc=&v=1&o=json&api=/d.js"),
     };
 
+    // Add a random delay before making the request.
+    let mut rng = rand::thread_rng();
+    let delay_secs = rng.gen_range(1, 10);
+    std::thread::sleep(Duration::from_secs(delay_secs));
+
+    // initializing HeaderMap and adding appropriate headers.
+    let mut header_map = HeaderMap::new();
+    header_map.insert(USER_AGENT, user_agent.parse()?);
+    header_map.insert(REFERER, "https://google.com/".parse()?);
+    header_map.insert(CONTENT_TYPE, "text/html; charset=UTF-8".parse()?);
+
     // fetch the html from upstream duckduckgo engine
     // TODO: Write better error handling code to handle no results case.
     let results: String = reqwest::Client::new()
         .get(url)
-        .header(USER_AGENT, user_agent)
+        .headers(header_map) // add spoofed headers to emulate human behaviour
         .send()
         .await?
         .text()
diff --git a/src/engines/searx.rs b/src/engines/searx.rs
index bfba1c6..508655c 100644
--- a/src/engines/searx.rs
+++ b/src/engines/searx.rs
@@ -2,10 +2,10 @@
 //! by querying the upstream searx search engine instance with user provided query and with a page
 //! number if provided.
 
-use std::collections::HashMap;
-
-use reqwest::header::USER_AGENT;
+use rand::Rng;
+use reqwest::header::{HeaderMap, CONTENT_TYPE, REFERER, USER_AGENT};
 use scraper::{Html, Selector};
+use std::{collections::HashMap, time::Duration};
 
 use crate::search_results_handler::aggregation_models::RawSearchResult;
@@ -17,7 +17,7 @@
 /// # Arguments
 ///
 /// * `query` - Takes the user provided query to query to the upstream search engine with.
-/// * `page` - Takes an Option as argument which can be either None or a valid page number.
+/// * `page` - Takes an u32 as an argument.
 /// * `user_agent` - Takes a random user agent string as an argument.
 ///
 /// # Errors
@@ -27,27 +27,29 @@
 /// selector fails to initialize"
 pub async fn results(
     query: &str,
-    page: Option<u32>,
+    page: u32,
     user_agent: &str,
 ) -> Result<HashMap<String, RawSearchResult>, Box<dyn std::error::Error>> {
     // Page number can be missing or empty string and so appropriate handling is required
     // so that upstream server recieves valid page number.
-    let url: String = match page {
-        Some(page_number) => {
-            if page_number <= 1 {
-                format!("https://searx.work/search?q={query}")
-            } else {
-                format!("https://searx.work/search?q={query}&pageno={page_number}",)
-            }
-        }
-        None => format!("https://searx.work/search?q={query}"),
-    };
+    let url: String = format!("https://searx.work/search?q={query}&pageno={page}");
+
+    // Add random delay before making the request.
+    let mut rng = rand::thread_rng();
+    let delay_secs = rng.gen_range(1, 10);
+    std::thread::sleep(Duration::from_secs(delay_secs));
+
+    // initializing headers and adding appropriate headers.
+    let mut header_map = HeaderMap::new();
+    header_map.insert(USER_AGENT, user_agent.parse()?);
+    header_map.insert(REFERER, "https://google.com/".parse()?);
+    header_map.insert(CONTENT_TYPE, "application/x-www-form-urlencoded".parse()?);
 
     // fetch the html from upstream searx instance engine
     // TODO: Write better error handling code to handle no results case.
     let results: String = reqwest::Client::new()
         .get(url)
-        .header(USER_AGENT, user_agent)
+        .headers(header_map) // add spoofed headers to emulate human behaviours.
         .send()
         .await?
         .text()
diff --git a/src/lib.rs b/src/lib.rs
index 5e7a332..c234658 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,6 +1,7 @@
 //! This main library module provides the functionality to provide and handle the Tcp server
 //! and register all the routes for the `websurfx` meta search engine website.
 
+pub mod cache;
 pub mod config_parser;
 pub mod engines;
 pub mod search_results_handler;
diff --git a/src/search_results_handler/aggregation_models.rs b/src/search_results_handler/aggregation_models.rs
index 3d2e081..4fe670e 100644
--- a/src/search_results_handler/aggregation_models.rs
+++ b/src/search_results_handler/aggregation_models.rs
@@ -1,12 +1,12 @@
 //! This module provides public models for handling, storing and serializing of search results
 //! data scraped from the upstream search engines.
 
-use serde::Serialize;
+use serde::{Deserialize, Serialize};
 
 use crate::config_parser::parser_models::Style;
 
-/// A named struct to store and serialize the individual search result from all the scraped
-/// and aggregated search results from the upstream search engines.
+/// A named struct to store, serialize and deserializes the individual search result from all the
+/// scraped and aggregated search results from the upstream search engines.
 ///
 /// # Fields
 ///
@@ -16,7 +16,7 @@
 /// * `url` - The url to be displayed below the search result title in html.
 /// * `description` - The description of the search result.
 /// * `engine` - The names of the upstream engines from which this results were provided.
-#[derive(Debug, Serialize)]
+#[derive(Debug, Serialize, Deserialize)]
 #[serde(rename_all = "camelCase")]
 pub struct SearchResult {
     pub title: String,
@@ -116,15 +116,15 @@ impl RawSearchResult {
     }
 }
 
-/// A named struct to store and serialize the all the search results scraped and aggregated
-/// from the upstream search engines.
+/// A named struct to store, serialize, deserialize the all the search results scraped and
+/// aggregated from the upstream search engines.
 ///
 /// # Fields
 ///
 /// * `results` - Stores the individual serializable `SearchResult` struct into a vector of
 /// `SearchResult` structs.
 /// * `page_query` - Stores the current pages search query `q` provided in the search url.
-#[derive(Serialize)]
+#[derive(Serialize, Deserialize)]
 #[serde(rename_all = "camelCase")]
 pub struct SearchResults {
     pub results: Vec<SearchResult>,
diff --git a/src/search_results_handler/aggregator.rs b/src/search_results_handler/aggregator.rs
index 096c5c7..5fd5770 100644
--- a/src/search_results_handler/aggregator.rs
+++ b/src/search_results_handler/aggregator.rs
@@ -25,7 +25,7 @@ use crate::engines::{duckduckgo, searx};
 /// # Arguments
 ///
 /// * `query` - Accepts a string to query with the above upstream search engines.
-/// * `page` - Accepts an Option which could either be a None or a valid page number.
+/// * `page` - Accepts an u32 page number.
 ///
 /// # Error
 ///
 /// Returns an error when unable to get the search results from the upstream search engines as
 /// containing appropriate values.
 pub async fn aggregate(
     query: &str,
-    page: Option<u32>,
+    page: u32,
 ) -> Result<SearchResults, Box<dyn std::error::Error>> {
     let user_agent: String = random_user_agent();
     let mut result_map: HashMap<String, RawSearchResult> = HashMap::new();
diff --git a/src/server/routes.rs b/src/server/routes.rs
index 221bbbf..1ee9f35 100644
--- a/src/server/routes.rs
+++ b/src/server/routes.rs
@@ -4,7 +4,11 @@
 
 use std::fs::read_to_string;
 
-use crate::{config_parser::parser::Config, search_results_handler::aggregator::aggregate};
+use crate::{
+    cache::cacher::RedisCache,
+    config_parser::parser::Config,
+    search_results_handler::{aggregation_models::SearchResults, aggregator::aggregate},
+};
 use actix_web::{get, web, HttpRequest, HttpResponse};
 use handlebars::Handlebars;
 use serde::Deserialize;
@@ -67,6 +71,9 @@ pub async fn search(
     config: web::Data<Config>,
 ) -> Result<HttpResponse, Box<dyn std::error::Error>> {
     let params = web::Query::<SearchParams>::from_query(req.query_string())?;
+
+    //Initialize redis cache connection struct
+    let redis_cache = RedisCache::new(config.redis_connection_url.clone());
     match &params.q {
         Some(query) => {
             if query.trim().is_empty() {
@@ -74,11 +81,63 @@ pub async fn search(
                     .insert_header(("location", "/"))
                     .finish())
             } else {
-                let mut results_json: crate::search_results_handler::aggregation_models::SearchResults =
-                    aggregate(query, params.page).await?;
-                results_json.add_style(config.style.clone());
-                let page_content: String = hbs.render("search", &results_json)?;
-                Ok(HttpResponse::Ok().body(page_content))
+                // Initialize the page url as an empty string
+                let mut page_url = String::new();
+
+                // Find whether the page is valid page number if not then return
+                // the first page number and also construct the page_url accordingly
+                let page = match params.page {
+                    Some(page_number) => {
+                        if page_number <= 1 {
+                            page_url = format!(
+                                "http://{}:{}/search?q={}&page={}",
+                                config.binding_ip_addr, config.port, query, 1
+                            );
+                            1
+                        } else {
+                            page_url = format!(
+                                "http://{}:{}/search?q={}&page={}",
+                                config.binding_ip_addr, config.port, query, page_number
+                            );
+
+                            page_number
+                        }
+                    }
+                    None => {
+                        page_url = format!(
+                            "http://{}:{}{}&page={}",
+                            config.binding_ip_addr,
+                            config.port,
+                            req.uri(),
+                            1
+                        );
+
+                        1
+                    }
+                };
+
+                // fetch the cached results json.
+                let cached_results_json = redis_cache.clone().cached_results_json(page_url.clone());
+                // check if fetched results was indeed fetched or it was an error and if so
+                // handle the data accordingly.
+                match cached_results_json {
+                    Ok(results_json) => {
+                        let new_results_json: SearchResults = serde_json::from_str(&results_json)?;
+                        let page_content: String = hbs.render("search", &new_results_json)?;
+                        Ok(HttpResponse::Ok().body(page_content))
+                    }
+                    Err(_) => {
+                        let mut results_json: crate::search_results_handler::aggregation_models::SearchResults =
+                            aggregate(query, page).await?;
+                        results_json.add_style(config.style.clone());
+                        redis_cache.clone().cache_results(
+                            serde_json::to_string(&results_json)?,
+                            page_url.clone(),
+                        )?;
+                        let page_content: String = hbs.render("search", &results_json)?;
+                        Ok(HttpResponse::Ok().body(page_content))
+                    }
+                }
             }
         }
         None => Ok(HttpResponse::Found()
             .insert_header(("location", "/"))
             .finish()),
@@ -115,6 +174,3 @@ pub async fn settings(
     let page_content: String = hbs.render("settings", &config.style)?;
     Ok(HttpResponse::Ok().body(page_content))
 }
-
-// TODO: Write tests for tesing parameters for search function that if provided with something
-// other than u32 like alphabets and special characters than it should panic
diff --git a/tests/index.rs b/tests/index.rs
index 6ef11c4..e3059bf 100644
--- a/tests/index.rs
+++ b/tests/index.rs
@@ -41,3 +41,5 @@ async fn test_index() {
 
     assert_eq!(res.text().await.unwrap(), template);
 }
+// TODO: Write tests for tesing parameters for search function that if provided with something
+// other than u32 like alphabets and special characters than it should panic
diff --git a/websurfx/config.lua b/websurfx/config.lua
index cf28c13..916a9b3 100644
--- a/websurfx/config.lua
+++ b/websurfx/config.lua
@@ -16,3 +16,6 @@ binding_ip_addr = "127.0.0.1" --ip address on the which server should be launche
 -- }}
 colorscheme = "catppuccin-mocha" -- the colorscheme name which should be used for the website theme
 theme = "simple" -- the theme name which should be used for the website
+
+-- Caching
+redis_connection_url = "redis://127.0.0.1:8082" -- redis connection url address on which the client should connect on.
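
Note on the pagination arithmetic in the duckduckgo.rs hunk above: the new `_` arm builds the
`s` and `dc` query parameters from `(page / 2 + (page % 2)) * 30`. The following standalone
sketch is not part of the patch (the helper name and the loop are illustrative only); it simply
evaluates that same expression for a few page numbers so the resulting offsets are easy to see.

    // Mirrors the offset formula used by the patched duckduckgo.rs `_` match arm.
    fn duckduckgo_offset(page: u32) -> (u32, u32) {
        // Page 1 is handled by a separate match arm in the patch (empty `s`/`dc`);
        // every later page derives a result offset in steps of 30 from this expression.
        let start = (page / 2 + (page % 2)) * 30;
        (start, start + 1)
    }

    fn main() {
        for page in 2..=6 {
            let (s, dc) = duckduckgo_offset(page);
            println!("page {page}: s={s} dc={dc}");
        }
        // Prints: page 2: s=30 dc=31, page 3: s=60 dc=61, page 4: s=60 dc=61,
        // page 5: s=90 dc=91, page 6: s=90 dc=91 — pages 3/4 and 5/6 share an offset.
    }

If the shared offsets for consecutive pages are unintended, a strictly increasing mapping such
as `(page - 1) * 30` would avoid it, but that is outside what this patch changes.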