From c170de8194a948348394997791b65b55b96f22a1 Mon Sep 17 00:00:00 2001
From: neon_arch
Date: Tue, 2 May 2023 11:58:21 +0300
Subject: [PATCH] add code to evade ip blocking, improve pagination code and
 fix documentation

---
 Cargo.lock                               | 39 ++++++++++
 Cargo.toml                               |  7 +-
 src/cache/cacher.rs                      | 78 +++++++++++++++++++
 src/cache/mod.rs                         |  1 +
 src/config_parser/parser.rs              |  5 ++
 src/config_parser/parser_models.rs       | 19 +++--
 src/engines/duckduckgo.rs                | 44 +++++++----
 src/engines/searx.rs                     | 34 ++++----
 src/lib.rs                               |  1 +
 .../aggregation_models.rs                | 14 ++--
 src/search_results_handler/aggregator.rs |  4 +-
 src/server/routes.rs                     | 74 +++++++++++++++---
 tests/index.rs                           |  2 +
 websurfx/config.lua                      |  3 +
 14 files changed, 264 insertions(+), 61 deletions(-)
 create mode 100644 src/cache/cacher.rs
 create mode 100644 src/cache/mod.rs

diff --git a/Cargo.lock b/Cargo.lock
index 8323e12..3a68b6a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -447,6 +447,16 @@ dependencies = [
  "bitflags",
 ]
 
+[[package]]
+name = "combine"
+version = "4.6.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "35ed6e9d84f0b51a7f52daf1c7d71dd136fd7a3f41a8462b8cdb8c78d920fad4"
+dependencies = [
+ "bytes 1.4.0",
+ "memchr",
+]
+
 [[package]]
 name = "convert_case"
 version = "0.4.0"
@@ -1427,6 +1437,12 @@ version = "2.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00"
 
+[[package]]
+name = "md5"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
+
 [[package]]
 name = "memchr"
 version = "2.5.0"
@@ -2157,6 +2173,20 @@ dependencies = [
  "rand_core 0.3.1",
 ]
 
+[[package]]
+name = "redis"
+version = "0.23.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3ea8c51b5dc1d8e5fd3350ec8167f464ec0995e79f2e90a075b63371500d557f"
+dependencies = [
+ "combine",
+ "itoa 1.0.6",
+ "percent-encoding 2.2.0",
+ "ryu",
+ "sha1_smol",
+ "url 2.3.1",
+]
+
 [[package]]
 name = "redox_syscall"
 version = "0.1.57"
@@ -2526,6 +2556,12 @@ dependencies = [
  "digest",
 ]
 
+[[package]]
+name = "sha1_smol"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ae1a47186c03a32177042e55dbc5fd5aee900b8e0069a8d70fba96a9375cd012"
+
 [[package]]
 name = "sha2"
 version = "0.10.6"
@@ -3291,6 +3327,9 @@ dependencies = [
  "fake-useragent",
  "handlebars",
  "log",
+ "md5",
+ "rand 0.6.5",
+ "redis",
  "reqwest 0.11.17",
  "rlua",
  "scraper",
diff --git a/Cargo.toml b/Cargo.toml
index 3f38026..6fcb28f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -15,6 +15,9 @@ actix-web = {version="4.3.1"}
 actix-files = {version="0.6.2"}
 serde_json = {version="*"}
 fake-useragent = {version="*"}
-env_logger = "0.10.0"
-log = "0.4.17"
+env_logger = {version="0.10.0"}
+log = {version="0.4.17"}
 rlua = {version="*"}
+redis = {version="*"}
+md5 = {version="*"}
+rand={version="*"}
diff --git a/src/cache/cacher.rs b/src/cache/cacher.rs
new file mode 100644
index 0000000..54d9a48
--- /dev/null
+++ b/src/cache/cacher.rs
@@ -0,0 +1,78 @@
+//! This module provides the functionality to cache the aggregated results fetched and aggregated
+//! from the upstream search engines in a json format.
+
+use md5::compute;
+use redis::{Client, Commands, Connection};
+
+/// A named struct which stores the redis Connection url address to which the client will
+/// connect to.
+///
+/// # Fields
+///
+/// * `redis_connection_url` - It stores the redis Connection url address.
+#[derive(Clone)]
+pub struct RedisCache {
+    redis_connection_url: String,
+}
+
+impl RedisCache {
+    /// Constructs a new `SearchResult` with the given arguments needed for the struct.
+    ///
+    /// # Arguments
+    ///
+    /// * `redis_connection_url` - It stores the redis Connection url address.
+    pub fn new(redis_connection_url: String) -> Self {
+        RedisCache {
+            redis_connection_url,
+        }
+    }
+
+    /// A helper function which computes the hash of the url and formats and returns it as string.
+    ///
+    /// # Arguments
+    ///
+    /// * `url` - It takes an url as string.
+    fn compute_url_hash(self, url: &str) -> String {
+        format!("{:?}", compute(url))
+    }
+
+    /// A function which fetches the cached json results as json string from the redis server.
+    ///
+    /// # Arguments
+    ///
+    /// * `url` - It takes an url as a string.
+    pub fn cached_results_json(self, url: String) -> Result<String, Box<dyn std::error::Error>> {
+        let hashed_url_string = self.clone().compute_url_hash(&url);
+        let mut redis_connection: Connection =
+            Client::open(self.redis_connection_url)?.get_connection()?;
+        Ok(redis_connection.get(hashed_url_string)?)
+    }
+
+    /// A function which caches the results by using the hashed `url` as the key and
+    /// `json results` as the value and stores it in redis server with ttl(time to live)
+    /// set to 60 seconds.
+    ///
+    /// # Arguments
+    ///
+    /// * `json_results` - It takes the json results string as an argument.
+    /// * `url` - It takes the url as a String.
+    pub fn cache_results(
+        self,
+        json_results: String,
+        url: String,
+    ) -> Result<(), Box<dyn std::error::Error>> {
+        let hashed_url_string = self.clone().compute_url_hash(&url);
+        let mut redis_connection: Connection =
+            Client::open(self.redis_connection_url)?.get_connection()?;
+
+        // put results_json into cache
+        redis_connection.set(hashed_url_string.clone(), json_results)?;
+
+        // Set the TTL for the key to 60 seconds
+        redis_connection
+            .expire::<String, u32>(hashed_url_string.clone(), 60)
+            .unwrap();
+
+        Ok(())
+    }
+}
diff --git a/src/cache/mod.rs b/src/cache/mod.rs
new file mode 100644
index 0000000..91a91ca
--- /dev/null
+++ b/src/cache/mod.rs
@@ -0,0 +1 @@
+pub mod cacher;
diff --git a/src/config_parser/parser.rs b/src/config_parser/parser.rs
index 226a760..4625bd8 100644
--- a/src/config_parser/parser.rs
+++ b/src/config_parser/parser.rs
@@ -11,11 +11,15 @@ use std::fs;
 //
 /// * `port` - It stores the parsed port number option on which the server should launch.
 /// * `binding_ip_addr` - It stores the parsed ip address option on which the server should launch
+/// * `style` - It stores the theming options for the website.
+/// * `redis_connection_url` - It stores the redis connection url address on which the redis
+/// client should connect.
 #[derive(Clone)]
 pub struct Config {
     pub port: u16,
     pub binding_ip_addr: String,
     pub style: Style,
+    pub redis_connection_url: String,
 }
 
 impl Config {
@@ -44,6 +48,7 @@ impl Config {
                     globals.get::<_, String>("theme")?,
                     globals.get::<_, String>("colorscheme")?,
                 ),
+                redis_connection_url: globals.get::<_, String>("redis_connection_url")?,
             })
         })
     }
diff --git a/src/config_parser/parser_models.rs b/src/config_parser/parser_models.rs
index f27e085..42baf0d 100644
--- a/src/config_parser/parser_models.rs
+++ b/src/config_parser/parser_models.rs
@@ -1,21 +1,24 @@
 //! This module provides public models for handling, storing and serializing parsed config file
 //! options from config.lua by grouping them togather.
 
-use serde::Serialize;
+use serde::{Deserialize, Serialize};
 
-/// A named struct which stores, serializes and groups the parsed config file options of theme and
-/// colorscheme names into the Style struct which derives the `Clone` and `Serialize` traits
-/// where the `Clone` trait is derived for allowing the struct to be cloned and passed to the
-/// server as a shared data between all routes except `/robots.txt` and the `Serialize` trait
-/// has been derived for allowing the object to be serialized so that it can be passed to
-/// handlebars template files.
+/// A named struct which stores,deserializes, serializes and groups the parsed config file options
+/// of theme and colorscheme names into the Style struct which derives the `Clone`, `Serialize`
+/// and Deserialize traits where the `Clone` trait is derived for allowing the struct to be
+/// cloned and passed to the server as a shared data between all routes except `/robots.txt` and
+/// the `Serialize` trait has been derived for allowing the object to be serialized so that it
+/// can be passed to handlebars template files and the `Deserialize` trait has been derived in
+/// order to allow the deserializing the json back to struct in aggregate function in
+/// aggregator.rs and create a new struct out of it and then serialize it back to json and pass
+/// it to the template files.
 ///
 /// # Fields
 //
 /// * `theme` - It stores the parsed theme option used to set a theme for the website.
 /// * `colorscheme` - It stores the parsed colorscheme option used to set a colorscheme for the
 /// theme being used.
-#[derive(Serialize, Clone)]
+#[derive(Serialize, Deserialize, Clone)]
 pub struct Style {
     pub theme: String,
     pub colorscheme: String,
diff --git a/src/engines/duckduckgo.rs b/src/engines/duckduckgo.rs
index 6f227d6..d1c5bd0 100644
--- a/src/engines/duckduckgo.rs
+++ b/src/engines/duckduckgo.rs
@@ -2,9 +2,10 @@
 //! by querying the upstream duckduckgo search engine with user provided query and with a page
 //! number if provided.
 
-use std::collections::HashMap;
+use std::{collections::HashMap, time::Duration};
 
-use reqwest::header::USER_AGENT;
+use rand::Rng;
+use reqwest::header::{HeaderMap, CONTENT_TYPE, REFERER, USER_AGENT};
 use scraper::{Html, Selector};
 
 use crate::search_results_handler::aggregation_models::RawSearchResult;
@@ -17,7 +18,7 @@ use crate::search_results_handler::aggregation_models::RawSearchResult;
 /// # Arguments
 ///
 /// * `query` - Takes the user provided query to query to the upstream search engine with.
-/// * `page` - Takes an Option as argument which can be either None or a valid page number.
+/// * `page` - Takes an u32 as an argument.
 /// * `user_agent` - Takes a random user agent string as an argument.
 ///
 /// # Errors
@@ -27,32 +28,41 @@
 /// selector fails to initialize"
 pub async fn results(
     query: &str,
-    page: Option<u32>,
+    page: u32,
     user_agent: &str,
 ) -> Result<HashMap<String, RawSearchResult>, Box<dyn std::error::Error>> {
     // Page number can be missing or empty string and so appropriate handling is required
     // so that upstream server recieves valid page number.
     let url: String = match page {
-        Some(page_number) => {
-            if page_number <= 1 {
-                format!("https://html.duckduckgo.com/html/?q={query}&s=&dc=&v=1&o=json&api=/d.js")
-            } else {
-                format!(
-                    "https://duckduckgo.com/html/?q={}&s={}&dc={}&v=1&o=json&api=/d.js",
-                    query,
-                    page_number / 2 * 30,
-                    page_number / 2 * 30 + 1
-                )
-            }
+        1 => {
+            format!("https://html.duckduckgo.com/html/?q={query}&s=&dc=&v=1&o=json&api=/d.js")
+        }
+        _ => {
+            format!(
+                "https://duckduckgo.com/html/?q={}&s={}&dc={}&v=1&o=json&api=/d.js",
+                query,
+                (page / 2 + (page % 2)) * 30,
+                (page / 2 + (page % 2)) * 30 + 1
+            )
+        }
-        None => format!("https://html.duckduckgo.com/html/?q={query}&s=&dc=&v=1&o=json&api=/d.js"),
     };
 
+    // Add a random delay before making the request.
+    let mut rng = rand::thread_rng();
+    let delay_secs = rng.gen_range(1, 10);
+    std::thread::sleep(Duration::from_secs(delay_secs));
+
+    // initializing HeaderMap and adding appropriate headers.
+    let mut header_map = HeaderMap::new();
+    header_map.insert(USER_AGENT, user_agent.parse()?);
+    header_map.insert(REFERER, "https://google.com/".parse()?);
+    header_map.insert(CONTENT_TYPE, "text/html; charset=UTF-8".parse()?);
+
     // fetch the html from upstream duckduckgo engine
     // TODO: Write better error handling code to handle no results case.
     let results: String = reqwest::Client::new()
         .get(url)
-        .header(USER_AGENT, user_agent)
+        .headers(header_map) // add spoofed headers to emulate human behaviour
         .send()
         .await?
         .text()
diff --git a/src/engines/searx.rs b/src/engines/searx.rs
index bfba1c6..508655c 100644
--- a/src/engines/searx.rs
+++ b/src/engines/searx.rs
@@ -2,10 +2,10 @@
 //! by querying the upstream searx search engine instance with user provided query and with a page
 //! number if provided.
 
-use std::collections::HashMap;
-
-use reqwest::header::USER_AGENT;
+use rand::Rng;
+use reqwest::header::{HeaderMap, CONTENT_TYPE, REFERER, USER_AGENT};
 use scraper::{Html, Selector};
+use std::{collections::HashMap, time::Duration};
 
 use crate::search_results_handler::aggregation_models::RawSearchResult;
@@ -17,7 +17,7 @@
 /// # Arguments
 ///
 /// * `query` - Takes the user provided query to query to the upstream search engine with.
-/// * `page` - Takes an Option as argument which can be either None or a valid page number.
+/// * `page` - Takes an u32 as an argument.
 /// * `user_agent` - Takes a random user agent string as an argument.
 ///
 /// # Errors
@@ -27,27 +27,29 @@
 /// selector fails to initialize"
 pub async fn results(
     query: &str,
-    page: Option<u32>,
+    page: u32,
     user_agent: &str,
 ) -> Result<HashMap<String, RawSearchResult>, Box<dyn std::error::Error>> {
     // Page number can be missing or empty string and so appropriate handling is required
     // so that upstream server recieves valid page number.
-    let url: String = match page {
-        Some(page_number) => {
-            if page_number <= 1 {
-                format!("https://searx.work/search?q={query}")
-            } else {
-                format!("https://searx.work/search?q={query}&pageno={page_number}",)
-            }
-        }
-        None => format!("https://searx.work/search?q={query}"),
-    };
+    let url: String = format!("https://searx.work/search?q={query}&pageno={page}");
+
+    // Add random delay before making the request.
+    let mut rng = rand::thread_rng();
+    let delay_secs = rng.gen_range(1, 10);
+    std::thread::sleep(Duration::from_secs(delay_secs));
+
+    // initializing headers and adding appropriate headers.
+    let mut header_map = HeaderMap::new();
+    header_map.insert(USER_AGENT, user_agent.parse()?);
+    header_map.insert(REFERER, "https://google.com/".parse()?);
+    header_map.insert(CONTENT_TYPE, "application/x-www-form-urlencoded".parse()?);
 
     // fetch the html from upstream searx instance engine
     // TODO: Write better error handling code to handle no results case.
     let results: String = reqwest::Client::new()
         .get(url)
-        .header(USER_AGENT, user_agent)
+        .headers(header_map) // add spoofed headers to emulate human behaviours.
         .send()
         .await?
         .text()
diff --git a/src/lib.rs b/src/lib.rs
index 5e7a332..c234658 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,6 +1,7 @@
 //! This main library module provides the functionality to provide and handle the Tcp server
 //! and register all the routes for the `websurfx` meta search engine website.
 
+pub mod cache;
 pub mod config_parser;
 pub mod engines;
 pub mod search_results_handler;
diff --git a/src/search_results_handler/aggregation_models.rs b/src/search_results_handler/aggregation_models.rs
index 3d2e081..4fe670e 100644
--- a/src/search_results_handler/aggregation_models.rs
+++ b/src/search_results_handler/aggregation_models.rs
@@ -1,12 +1,12 @@
 //! This module provides public models for handling, storing and serializing of search results
 //! data scraped from the upstream search engines.
 
-use serde::Serialize;
+use serde::{Deserialize, Serialize};
 
 use crate::config_parser::parser_models::Style;
 
-/// A named struct to store and serialize the individual search result from all the scraped
-/// and aggregated search results from the upstream search engines.
+/// A named struct to store, serialize and deserializes the individual search result from all the
+/// scraped and aggregated search results from the upstream search engines.
 ///
 /// # Fields
 ///
@@ -16,7 +16,7 @@
 /// * `url` - The url to be displayed below the search result title in html.
 /// * `description` - The description of the search result.
 /// * `engine` - The names of the upstream engines from which this results were provided.
-#[derive(Debug, Serialize)]
+#[derive(Debug, Serialize, Deserialize)]
 #[serde(rename_all = "camelCase")]
 pub struct SearchResult {
     pub title: String,
@@ -116,15 +116,15 @@ impl RawSearchResult {
     }
 }
 
-/// A named struct to store and serialize the all the search results scraped and aggregated
-/// from the upstream search engines.
+/// A named struct to store, serialize, deserialize the all the search results scraped and
+/// aggregated from the upstream search engines.
 ///
 /// # Fields
 ///
 /// * `results` - Stores the individual serializable `SearchResult` struct into a vector of
 /// `SearchResult` structs.
 /// * `page_query` - Stores the current pages search query `q` provided in the search url.
-#[derive(Serialize)]
+#[derive(Serialize, Deserialize)]
 #[serde(rename_all = "camelCase")]
 pub struct SearchResults {
     pub results: Vec<SearchResult>,
diff --git a/src/search_results_handler/aggregator.rs b/src/search_results_handler/aggregator.rs
index 096c5c7..5fd5770 100644
--- a/src/search_results_handler/aggregator.rs
+++ b/src/search_results_handler/aggregator.rs
@@ -25,7 +25,7 @@ use crate::engines::{duckduckgo, searx};
 /// # Arguments
 ///
 /// * `query` - Accepts a string to query with the above upstream search engines.
-/// * `page` - Accepts an Option which could either be a None or a valid page number.
+/// * `page` - Accepts an u32 page number.
 ///
 /// # Error
 ///
 /// Returns an error when unable to get the search results from the upstream search engines as
 /// containing appropriate values.
 pub async fn aggregate(
     query: &str,
-    page: Option<u32>,
+    page: u32,
 ) -> Result<SearchResults, Box<dyn std::error::Error>> {
     let user_agent: String = random_user_agent();
     let mut result_map: HashMap<String, RawSearchResult> = HashMap::new();
diff --git a/src/server/routes.rs b/src/server/routes.rs
index 221bbbf..1ee9f35 100644
--- a/src/server/routes.rs
+++ b/src/server/routes.rs
@@ -4,7 +4,11 @@
 
 use std::fs::read_to_string;
 
-use crate::{config_parser::parser::Config, search_results_handler::aggregator::aggregate};
+use crate::{
+    cache::cacher::RedisCache,
+    config_parser::parser::Config,
+    search_results_handler::{aggregation_models::SearchResults, aggregator::aggregate},
+};
 use actix_web::{get, web, HttpRequest, HttpResponse};
 use handlebars::Handlebars;
 use serde::Deserialize;
@@ -67,6 +71,9 @@ pub async fn search(
     config: web::Data<Config>,
 ) -> Result<HttpResponse, Box<dyn std::error::Error>> {
     let params = web::Query::<SearchParams>::from_query(req.query_string())?;
+
+    //Initialize redis cache connection struct
+    let redis_cache = RedisCache::new(config.redis_connection_url.clone());
     match &params.q {
         Some(query) => {
             if query.trim().is_empty() {
@@ -74,11 +81,63 @@ pub async fn search(
                     .insert_header(("location", "/"))
                     .finish())
             } else {
-                let mut results_json: crate::search_results_handler::aggregation_models::SearchResults =
-                    aggregate(query, params.page).await?;
-                results_json.add_style(config.style.clone());
-                let page_content: String = hbs.render("search", &results_json)?;
-                Ok(HttpResponse::Ok().body(page_content))
+                // Initialize the page url as an empty string
+                let mut page_url = String::new();
+
+                // Find whether the page is valid page number if not then return
+                // the first page number and also construct the page_url accordingly
+                let page = match params.page {
+                    Some(page_number) => {
+                        if page_number <= 1 {
+                            page_url = format!(
+                                "http://{}:{}/search?q={}&page={}",
+                                config.binding_ip_addr, config.port, query, 1
+                            );
+                            1
+                        } else {
+                            page_url = format!(
+                                "http://{}:{}/search?q={}&page={}",
+                                config.binding_ip_addr, config.port, query, page_number
+                            );
+
+                            page_number
+                        }
+                    }
+                    None => {
+                        page_url = format!(
+                            "http://{}:{}{}&page={}",
+                            config.binding_ip_addr,
+                            config.port,
+                            req.uri(),
+                            1
+                        );
+
+                        1
+                    }
+                };
+
+                // fetch the cached results json.
+                let cached_results_json = redis_cache.clone().cached_results_json(page_url.clone());
+                // check if fetched results was indeed fetched or it was an error and if so
+                // handle the data accordingly.
+                match cached_results_json {
+                    Ok(results_json) => {
+                        let new_results_json: SearchResults = serde_json::from_str(&results_json)?;
+                        let page_content: String = hbs.render("search", &new_results_json)?;
+                        Ok(HttpResponse::Ok().body(page_content))
+                    }
+                    Err(_) => {
+                        let mut results_json: crate::search_results_handler::aggregation_models::SearchResults =
+                            aggregate(query, page).await?;
+                        results_json.add_style(config.style.clone());
+                        redis_cache.clone().cache_results(
+                            serde_json::to_string(&results_json)?,
+                            page_url.clone(),
+                        )?;
+                        let page_content: String = hbs.render("search", &results_json)?;
+                        Ok(HttpResponse::Ok().body(page_content))
+                    }
+                }
             }
         }
         None => Ok(HttpResponse::Found()
             .insert_header(("location", "/"))
             .finish()),
@@ -115,6 +174,3 @@ pub async fn settings(
     let page_content: String = hbs.render("settings", &config.style)?;
     Ok(HttpResponse::Ok().body(page_content))
 }
-
-// TODO: Write tests for tesing parameters for search function that if provided with something
-// other than u32 like alphabets and special characters than it should panic
diff --git a/tests/index.rs b/tests/index.rs
index 6ef11c4..e3059bf 100644
--- a/tests/index.rs
+++ b/tests/index.rs
@@ -41,3 +41,5 @@ async fn test_index() {
 
     assert_eq!(res.text().await.unwrap(), template);
 }
+// TODO: Write tests for tesing parameters for search function that if provided with something
+// other than u32 like alphabets and special characters than it should panic
diff --git a/websurfx/config.lua b/websurfx/config.lua
index cf28c13..916a9b3 100644
--- a/websurfx/config.lua
+++ b/websurfx/config.lua
@@ -16,3 +16,6 @@ binding_ip_addr = "127.0.0.1" --ip address on the which server should be launche
 -- }}
 colorscheme = "catppuccin-mocha" -- the colorscheme name which should be used for the website theme
 theme = "simple" -- the theme name which should be used for the website
+
+-- Caching
+redis_connection_url = "redis://127.0.0.1:8082" -- redis connection url address on which the client should connect on.
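
Note on the pagination arithmetic in the duckduckgo.rs hunk above: the new `_` arm builds the
`s` and `dc` query parameters from `(page / 2 + (page % 2)) * 30`. The following standalone
sketch is not part of the patch (the helper name and the loop are illustrative only); it simply
evaluates that same expression for a few page numbers so the resulting offsets are easy to see.

    // Mirrors the offset formula used by the patched duckduckgo.rs `_` match arm.
    fn duckduckgo_offset(page: u32) -> (u32, u32) {
        // Page 1 is handled by a separate match arm in the patch (empty `s`/`dc`);
        // every later page derives a result offset in steps of 30 from this expression.
        let start = (page / 2 + (page % 2)) * 30;
        (start, start + 1)
    }

    fn main() {
        for page in 2..=6 {
            let (s, dc) = duckduckgo_offset(page);
            println!("page {page}: s={s} dc={dc}");
        }
        // Prints: page 2: s=30 dc=31, page 3: s=60 dc=61, page 4: s=60 dc=61,
        // page 5: s=90 dc=91, page 6: s=90 dc=91 — pages 3/4 and 5/6 share an offset.
    }

If the shared offsets for consecutive pages are unintended, a strictly increasing mapping such
as `(page - 1) * 30` would avoid it, but that is outside what this patch changes.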