0
0
mirror of https://github.com/neon-mmd/websurfx.git synced 2024-11-21 13:38:21 -05:00

Merge pull request #202 from neon-mmd/feat-disallow-user-to-search-via-lists

 `Safe search` levels and `filter lists` for the app
This commit is contained in:
alamin655 2023-09-11 11:25:12 +05:30 committed by GitHub
commit 867753a135
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 271 additions and 80 deletions

22
Cargo.lock generated
View File

@ -532,18 +532,18 @@ dependencies = [
[[package]]
name = "clap"
version = "4.4.1"
version = "4.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c8d502cbaec4595d2e7d5f61e318f05417bd2b66fdc3809498f0d3fdf0bea27"
checksum = "6a13b88d2c62ff462f88e4a121f17a82c1af05693a2f192b5c38d14de73c19f6"
dependencies = [
"clap_builder",
]
[[package]]
name = "clap_builder"
version = "4.4.1"
version = "4.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5891c7bc0edb3e1c2204fc5e94009affabeb1821c9e5fdc3959536c5c0bb984d"
checksum = "2bb9faaa7c2ef94b2743a21f5a29e6f0010dff4caa69ac8e9d6cf8b6fa74da08"
dependencies = [
"anstyle",
"clap_lex",
@ -1270,9 +1270,9 @@ checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7"
[[package]]
name = "handlebars"
version = "4.3.7"
version = "4.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83c3372087601b532857d332f5957cbae686da52bb7810bf038c3e3c3cc2fa0d"
checksum = "c39b3bc2a8f715298032cf5087e58573809374b08160aa7d750582bdb82d2683"
dependencies = [
"log",
"pest",
@ -2494,9 +2494,9 @@ dependencies = [
[[package]]
name = "redis"
version = "0.23.2"
version = "0.23.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ffd6543a7bc6428396845f6854ccf3d1ae8823816592e2cbe74f20f50f209d02"
checksum = "4f49cdc0bb3f412bf8e7d1bd90fe1d9eb10bc5c399ba90973c14662a27b3f8ba"
dependencies = [
"arc-swap",
"async-trait",
@ -2663,9 +2663,9 @@ dependencies = [
[[package]]
name = "rustix"
version = "0.38.10"
version = "0.38.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed6248e1caa625eb708e266e06159f135e8c26f2bb7ceb72dc4b2766d0340964"
checksum = "c0c3dde1fc030af041adc40e79c0e7fbcf431dd24870053d187d7c66e4b87453"
dependencies = [
"bitflags 2.4.0",
"errno",
@ -3697,7 +3697,7 @@ dependencies = [
[[package]]
name = "websurfx"
version = "0.18.6"
version = "0.19.0"
dependencies = [
"actix-cors",
"actix-files",

View File

@ -1,6 +1,6 @@
[package]
name = "websurfx"
version = "0.18.6"
version = "0.19.0"
edition = "2021"
description = "An open-source alternative to Searx that provides clean, ad-free, and organic results with incredible speed while keeping privacy and security in mind."
repository = "https://github.com/neon-mmd/websurfx"
@ -10,7 +10,7 @@ license = "AGPL-3.0"
reqwest = {version="0.11.20",features=["json"]}
tokio = {version="1.32.0",features=["rt-multi-thread","macros"]}
serde = {version="1.0.188",features=["derive"]}
handlebars = { version = "4.3.7", features = ["dir_source"] }
handlebars = { version = "4.4.0", features = ["dir_source"] }
scraper = {version="0.17.1"}
actix-web = {version="4.4.0", features = ["cookies"]}
actix-files = {version="0.6.2"}
@ -19,8 +19,8 @@ serde_json = {version="1.0.105"}
fake-useragent = {version="0.1.3"}
env_logger = {version="0.10.0"}
log = {version="0.4.20"}
mlua = {version="0.8.10",features=["luajit"]}
redis = {version="0.23.2",features=["tokio-comp","connection-manager"]}
mlua = {version="0.8.10", features=["luajit"]}
redis = {version="0.23.3", features=["tokio-comp","connection-manager"]}
md5 = {version="0.7.0"}
rand={version="0.8.5"}
once_cell = {version="1.18.0"}

BIN
public/images/barricade.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 892 KiB

BIN
public/images/filter.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 102 KiB

View File

@ -132,6 +132,35 @@ body {
width: 1.2rem;
height: 1.2rem;
}
.results .result_disallowed,
.results .result_filtered {
display: flex;
justify-content: center;
align-items: center;
gap: 10rem;
font-size: 2rem;
color: var(--foreground-color);
margin: 0rem 7rem;
}
.results .result_disallowed .user_query,
.results .result_filtered .user_query {
color: var(--background-color);
font-weight: 300;
}
.results .result_disallowed img,
.results .result_filtered img {
width: 30rem;
}
.results .result_disallowed div,
.results .result_filtered div {
display: flex;
flex-direction: column;
gap: 1rem;
line-break: strict;
}
/* styles for the footer and header */

View File

@ -13,7 +13,39 @@
{{/each}}
</div>
</div>
{{/each}} {{else}}
{{/each}} {{else}} {{#if disallowed}}
<div class="result_disallowed">
<div class="description">
<p>
Your search - <span class="user_query">{{{this.pageQuery}}}</span> -
has been disallowed.
</p>
<p class="description_paragraph">Dear user,</p>
<p class="description_paragraph">
The query - <span class="user_query">{{{this.pageQuery}}}</span> - has
been blacklisted via server configuration and hence disallowed by the
server. Henceforth no results could be displayed for your query.
</p>
</div>
<img src="./images/barricade.png" alt="Image of a Barricade" />
</div>
{{else}} {{#if filtered}}
<div class="result_filtered">
<div class="description">
<p>
Your search - <span class="user_query">{{{this.pageQuery}}}</span> -
has been filtered.
</p>
<p class="description_paragraph">Dear user,</p>
<p class="description_paragraph">
All the search results contain results that has been configured to be
filtered out via server configuration and henceforth has been
completely filtered out.
</p>
</div>
<img src="./images/filter.png" alt="Image of a paper inside a funnel" />
</div>
{{else}}
<div class="result_not_found">
<p>Your search - {{{this.pageQuery}}} - did not match any documents.</p>
<p class="suggestions">Suggestions:</p>
@ -24,7 +56,7 @@
</ul>
<img src="./images/no_results.gif" alt="Man fishing gif" />
</div>
{{/if}}
{{/if}} {{/if}} {{/if}}
</div>
<div class="page_navigation">
<button type="button" onclick="navigate_backward()">

View File

@ -35,6 +35,7 @@ pub struct Config {
pub upstream_search_engines: Vec<crate::engines::engine_models::EngineHandler>,
pub request_timeout: u8,
pub threads: u8,
pub safe_search: u8,
}
/// Configuration options for the aggregator.
@ -89,6 +90,16 @@ impl Config {
parsed_threads
};
let parsed_safe_search: u8 = globals.get::<_, u8>("safe_search")?;
let safe_search: u8 = match parsed_safe_search {
0..=4 => parsed_safe_search,
_ => {
log::error!("Config Error: The value of `safe_search` option should be a non zero positive integer from 0 to 4.");
log::error!("Falling back to using the value `1` for the option");
1
}
};
Ok(Config {
port: globals.get::<_, u16>("port")?,
binding_ip: globals.get::<_, String>("binding_ip")?,
@ -110,6 +121,7 @@ impl Config {
.collect(),
request_timeout: globals.get::<_, u8>("request_timeout")?,
threads,
safe_search,
})
}
}

View File

@ -43,6 +43,7 @@ impl SearchEngine for DuckDuckGo {
page: u32,
user_agent: &str,
request_timeout: u8,
_safe_search: u8,
) -> Result<HashMap<String, SearchResult>, EngineError> {
// Page number can be missing or empty string and so appropriate handling is required
// so that upstream server recieves valid page number.

View File

@ -71,6 +71,7 @@ pub trait SearchEngine: Sync + Send {
page: u32,
user_agent: &str,
request_timeout: u8,
safe_search: u8,
) -> Result<HashMap<String, SearchResult>, EngineError>;
}

View File

@ -42,12 +42,21 @@ impl SearchEngine for Searx {
page: u32,
user_agent: &str,
request_timeout: u8,
mut safe_search: u8,
) -> Result<HashMap<String, SearchResult>, EngineError> {
// Page number can be missing or empty string and so appropriate handling is required
// so that upstream server recieves valid page number.
if safe_search == 3 {
safe_search = 2;
};
let url: String = match page {
0 | 1 => format!("https://searx.work/search?q={query}&pageno=1"),
_ => format!("https://searx.work/search?q={query}&pageno={page}"),
0 | 1 => {
format!("https://searx.work/search?q={query}&pageno=1&safesearch={safe_search}")
}
_ => format!(
"https://searx.work/search?q={query}&pageno={page}&safesearch={safe_search}"
),
};
// initializing headers and adding appropriate headers.

View File

@ -102,13 +102,15 @@ impl EngineErrorInfo {
/// and the type of error that caused it.
/// * `empty_result_set` - Stores a boolean which indicates that no engines gave a result for the
/// given search query.
#[derive(Serialize, Deserialize)]
#[derive(Serialize, Deserialize, Default)]
#[serde(rename_all = "camelCase")]
pub struct SearchResults {
pub results: Vec<SearchResult>,
pub page_query: String,
pub style: Style,
pub engine_errors_info: SmallVec<[EngineErrorInfo; 0]>,
pub engine_errors_info: Vec<EngineErrorInfo>,
pub disallowed: bool,
pub filtered: bool,
}
impl SearchResults {
@ -122,6 +124,7 @@ impl SearchResults {
/// the search url.
/// * `empty_result_set` - Takes a boolean which indicates that no engines gave a result for the
/// given search query.
/// * ``
pub fn new(
results: Vec<SearchResult>,
page_query: &str,
@ -131,12 +134,38 @@ impl SearchResults {
results,
page_query: page_query.to_owned(),
style: Style::default(),
engine_errors_info: SmallVec::from(engine_errors_info),
engine_errors_info: engine_errors_info.to_owned(),
disallowed: Default::default(),
filtered: Default::default(),
}
}
/// A setter function to add website style to the return search results.
pub fn add_style(&mut self, style: &Style) {
self.style = style.to_owned();
self.style = style.clone();
}
/// A setter function that sets disallowed to true.
pub fn set_disallowed(&mut self) {
self.disallowed = true;
}
/// A setter function to set the current page search query.
pub fn set_page_query(&mut self, page: &str) {
self.page_query = page.to_owned();
}
/// A setter function that sets the filtered to true.
pub fn set_filtered(&mut self) {
self.filtered = true;
}
/// A getter function that gets the value of `engine_errors_info`.
pub fn engine_errors_info(&mut self) -> Vec<EngineErrorInfo> {
std::mem::take(&mut self.engine_errors_info)
}
/// A getter function that gets the value of `results`.
pub fn results(&mut self) -> Vec<SearchResult> {
self.results.clone()
}
}

View File

@ -70,6 +70,7 @@ pub async fn aggregate(
debug: bool,
upstream_search_engines: &[EngineHandler],
request_timeout: u8,
safe_search: u8,
) -> Result<SearchResults, Box<dyn std::error::Error>> {
let user_agent: &str = random_user_agent();
@ -91,7 +92,13 @@ pub async fn aggregate(
let query: String = query.to_owned();
tasks.push(tokio::spawn(async move {
search_engine
.results(&query, page, user_agent, request_timeout)
.results(
&query,
page,
user_agent.clone(),
request_timeout,
safe_search,
)
.await
}));
}
@ -150,6 +157,7 @@ pub async fn aggregate(
}
}
if safe_search >= 3 {
let mut blacklist_map: HashMap<String, SearchResult> = HashMap::new();
filter_with_lists(
&mut result_map,
@ -164,6 +172,7 @@ pub async fn aggregate(
)?;
drop(blacklist_map);
}
let results: Vec<SearchResult> = result_map.into_values().collect();
@ -189,7 +198,7 @@ pub fn filter_with_lists(
let mut reader = BufReader::new(File::open(file_path)?);
for line in reader.by_ref().lines() {
let re = Regex::new(&line?)?;
let re = Regex::new(line?.trim())?;
// Iterate over each search result in the map and check if it matches the regex pattern
for (url, search_result) in map_to_be_filtered.clone().into_iter() {

View File

@ -2,7 +2,10 @@
//! meta search engine website and provide appropriate response to each route/page
//! when requested.
use std::fs::read_to_string;
use std::{
fs::{read_to_string, File},
io::{BufRead, BufReader, Read},
};
use crate::{
cache::cacher::RedisCache,
@ -13,12 +16,13 @@ use crate::{
};
use actix_web::{get, web, HttpRequest, HttpResponse};
use handlebars::Handlebars;
use regex::Regex;
use serde::Deserialize;
use tokio::join;
// ---- Constants ----
/// Initialize redis cache connection once and store it on the heap.
const REDIS_CACHE: async_once_cell::OnceCell<RedisCache> = async_once_cell::OnceCell::new();
static REDIS_CACHE: async_once_cell::OnceCell<RedisCache> = async_once_cell::OnceCell::new();
/// A named struct which deserializes all the user provided search parameters and stores them.
///
@ -32,6 +36,7 @@ const REDIS_CACHE: async_once_cell::OnceCell<RedisCache> = async_once_cell::Once
struct SearchParams {
q: Option<String>,
page: Option<u32>,
safesearch: Option<u8>,
}
/// Handles the route of index page or main page of the `websurfx` meta search engine website.
@ -105,42 +110,58 @@ pub async fn search(
None => 1,
};
let safe_search: u8 = match config.safe_search {
3..=4 => config.safe_search,
_ => match &params.safesearch {
Some(safesearch) => match safesearch {
0..=2 => *safesearch,
_ => 1,
},
None => config.safe_search,
},
};
let (_, results, _) = join!(
results(
format!(
"http://{}:{}/search?q={}&page={}",
"http://{}:{}/search?q={}&page={}&safesearch={}",
config.binding_ip,
config.port,
query,
page - 1
page - 1,
safe_search
),
&config,
query,
page - 1,
&req,
req.clone(),
safe_search
),
results(
format!(
"http://{}:{}/search?q={}&page={}",
config.binding_ip, config.port, query, page
"http://{}:{}/search?q={}&page={}&safesearch={}",
config.binding_ip, config.port, query, page, safe_search
),
&config,
query,
page,
&req,
req.clone(),
safe_search
),
results(
format!(
"http://{}:{}/search?q={}&page={}",
"http://{}:{}/search?q={}&page={}&safesearch={}",
config.binding_ip,
config.port,
query,
page + 1
page + 1,
safe_search
),
&config,
query,
page + 1,
&req,
req.clone(),
safe_search
)
);
@ -160,9 +181,10 @@ async fn results(
config: &Config,
query: &str,
page: u32,
req: &HttpRequest,
req: HttpRequest,
safe_search: u8,
) -> Result<SearchResults, Box<dyn std::error::Error>> {
let redis_cache: RedisCache = REDIS_CACHE
let mut redis_cache: RedisCache = REDIS_CACHE
.get_or_init(async {
// Initialize redis cache connection pool only one and store it in the heap.
RedisCache::new(&config.redis_url, 5).await.unwrap()
@ -178,6 +200,23 @@ async fn results(
match cached_results_json {
Ok(results) => Ok(serde_json::from_str::<SearchResults>(&results)?),
Err(_) => {
if safe_search == 4 {
let mut results: SearchResults = SearchResults::default();
let mut _flag: bool =
is_match_from_filter_list(file_path(FileType::BlockList)?, query)?;
_flag = !is_match_from_filter_list(file_path(FileType::AllowList)?, query)?;
if _flag {
results.set_disallowed();
results.add_style(&config.style);
results.set_page_query(query);
redis_cache
.cache_results(&serde_json::to_string(&results)?, &url)
.await?;
return Ok(results);
}
}
// check if the cookie value is empty or not if it is empty then use the
// default selected upstream search engines from the config file otherwise
// parse the non-empty cookie and grab the user selected engines from the
@ -199,6 +238,7 @@ async fn results(
config.debug,
&engines,
config.request_timeout,
safe_search,
)
.await?
}
@ -210,14 +250,16 @@ async fn results(
config.debug,
&config.upstream_search_engines,
config.request_timeout,
safe_search,
)
.await?
}
};
if results.engine_errors_info().is_empty() && results.results().is_empty() {
results.set_filtered();
}
results.add_style(&config.style);
redis_cache
.clone()
.cache_results(&serde_json::to_string(&results)?, &url)
.await?;
Ok(results)
@ -225,6 +267,22 @@ async fn results(
}
}
fn is_match_from_filter_list(
file_path: &str,
query: &str,
) -> Result<bool, Box<dyn std::error::Error>> {
let mut flag = false;
let mut reader = BufReader::new(File::open(file_path)?);
for line in reader.by_ref().lines() {
let re = Regex::new(&line?)?;
if re.is_match(query) {
flag = true;
break;
}
}
Ok(flag)
}
/// Handles the route of robots.txt page of the `websurfx` meta search engine website.
#[get("/robots.txt")]
pub async fn robots_data(_req: HttpRequest) -> Result<HttpResponse, Box<dyn std::error::Error>> {

View File

@ -11,6 +11,17 @@ production_use = false -- whether to use production mode or not (in other words
-- There will be a random delay before sending the request to the search engines, this is to prevent DDoSing the upstream search engines from a large number of simultaneous requests.
request_timeout = 30 -- timeout for the search requests sent to the upstream search engines to be fetched (value in seconds).
-- ### Search ###
-- Filter results based on different levels. The levels provided are:
-- {{
-- 0 - None
-- 1 - Low
-- 2 - Moderate
-- 3 - High
-- 4 - Aggressive
-- }}
safe_search = 2
-- ### Website ###
-- The different colorschemes provided are:
-- {{