From 109dcde80b5a7602938e04ac4c490db4b9a77704 Mon Sep 17 00:00:00 2001 From: Spencerjibz Date: Wed, 20 Mar 2024 17:45:48 +0000 Subject: [PATCH 1/2] add sorting by relevance and merge new changes --- Cargo.lock | 80 +++++++++++++++++++ Cargo.toml | 128 +++++++++++++++++++++---------- src/models/aggregation_models.rs | 99 +++++++++++++++++++++++- src/results/aggregator.rs | 32 +++++++- 4 files changed, 297 insertions(+), 42 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 452ab2d..9b7412d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -245,6 +245,12 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +[[package]] +name = "adler32" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234" + [[package]] name = "aead" version = "0.5.2" @@ -1840,6 +1846,16 @@ dependencies = [ "winapi-build", ] +[[package]] +name = "keyword_extraction" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c25710ba2c50e4762b267b7387a989d8d1a8235f5cf26cd84e34aac30b263140" +dependencies = [ + "regex", + "unicode-segmentation", +] + [[package]] name = "language-tags" version = "0.3.2" @@ -1858,6 +1874,26 @@ version = "0.2.153" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" +[[package]] +name = "libflate" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ff4ae71b685bbad2f2f391fe74f6b7659a34871c08b210fdc039e43bee07d18" +dependencies = [ + "adler32", + "crc32fast", + "libflate_lz77", +] + +[[package]] +name = "libflate_lz77" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a52d3a8bfc85f250440e4424db7d857e241a3aebbbe301f3eb606ab15c39acbf" +dependencies = [ + "rle-decode-fast", +] + [[package]] name = "libmimalloc-sys" version = "0.1.35" @@ -3029,6 +3065,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "rle-decode-fast" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3582f63211428f83597b51b2ddb88e2a91a9d52d12831f9d08f5e624e8977422" + [[package]] name = "rustc-demangle" version = "0.1.23" @@ -3431,6 +3473,15 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" +[[package]] +name = "stop-words" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8500024d809de02ecbf998472b7bed3c4fca380df2be68917f6a473bdb28ddcc" +dependencies = [ + "serde_json", +] + [[package]] name = "string" version = "0.2.1" @@ -3607,6 +3658,26 @@ dependencies = [ "utf-8", ] +[[package]] +name = "thesaurus" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e33ea271e53da683cd3439c04ff3b734f3d6052ea33a65ec9e0fa89a4f96369" +dependencies = [ + "lazy_static", + "thesaurus-wordnet", +] + +[[package]] +name = "thesaurus-wordnet" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e0b5b98479fc7554a0cedad4c03264b1caeecc7af51b4e44945c759bab43e35" +dependencies = [ + "libflate", + "serde_json", +] + [[package]] name = "thousands" version = "0.2.0" @@ -3984,6 +4055,12 @@ dependencies = [ "tinyvec", ] 
+[[package]] +name = "unicode-segmentation" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202" + [[package]] name = "unicode-width" version = "0.1.11" @@ -4219,6 +4296,7 @@ dependencies = [ "error-stack", "fake-useragent", "futures 0.3.30", + "keyword_extraction", "lightningcss", "log", "maud", @@ -4234,7 +4312,9 @@ dependencies = [ "serde", "serde_json", "smallvec 1.13.1", + "stop-words", "tempfile", + "thesaurus", "tokio 1.36.0", ] diff --git a/Cargo.toml b/Cargo.toml index a9152af..94d4d1f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,45 +13,94 @@ bench = false path = "src/bin/websurfx.rs" [dependencies] -reqwest = {version="0.11.24", default-features=false, features=["rustls-tls","brotli", "gzip"]} -tokio = {version="1.32.0",features=["rt-multi-thread","macros", "fs", "io-util"], default-features = false} -serde = {version="1.0.196", default-features=false, features=["derive"]} -serde_json = {version="1.0.109", default-features=false} -maud = {version="0.25.0", default-features=false, features=["actix-web"]} -scraper = {version="0.18.1", default-features = false} -actix-web = {version="4.4.0", features = ["cookies", "macros", "compress-brotli"], default-features=false} -actix-files = {version="0.6.5", default-features=false} -actix-cors = {version="0.7.0", default-features=false} -fake-useragent = {version="0.1.3", default-features=false} -env_logger = {version="0.11.1", default-features=false} -log = {version="0.4.21", default-features=false} -mlua = {version="0.9.1", features=["luajit", "vendored"], default-features=false} -redis = {version="0.24.0", features=["tokio-comp","connection-manager"], default-features = false, optional = true} -blake3 = {version="1.5.0", default-features=false} -error-stack = {version="0.4.0", default-features=false, features=["std"]} -async-trait = {version="0.1.76", default-features=false} -regex = {version="1.9.4", features=["perf"], default-features = false} -smallvec = {version="1.13.1", features=["union", "serde"], default-features=false} -futures = {version="0.3.30", default-features=false, features=["alloc"]} -dhat = {version="0.3.2", optional = true, default-features=false} +reqwest = { version = "0.11.24", default-features = false, features = [ + "rustls-tls", + "brotli", + "gzip", +] } +tokio = { version = "1.32.0", features = [ + "rt-multi-thread", + "macros", + "fs", + "io-util", +], default-features = false } +serde = { version = "1.0.196", default-features = false, features = ["derive"] } +serde_json = { version = "1.0.109", default-features = false } +maud = { version = "0.25.0", default-features = false, features = [ + "actix-web", +] } +scraper = { version = "0.18.1", default-features = false } +actix-web = { version = "4.4.0", features = [ + "cookies", + "macros", + "compress-brotli", +], default-features = false } +actix-files = { version = "0.6.5", default-features = false } +actix-cors = { version = "0.7.0", default-features = false } +fake-useragent = { version = "0.1.3", default-features = false } +env_logger = { version = "0.11.1", default-features = false } +log = { version = "0.4.21", default-features = false } +mlua = { version = "0.9.1", features = [ + "luajit", + "vendored", +], default-features = false } +redis = { version = "0.24.0", features = [ + "tokio-comp", + "connection-manager", +], default-features = false, optional = true } +blake3 = { version = "1.5.0", default-features = false } +error-stack = { 
version = "0.4.0", default-features = false, features = [ + "std", +] } +async-trait = { version = "0.1.76", default-features = false } +regex = { version = "1.9.4", features = ["perf"], default-features = false } +smallvec = { version = "1.13.1", features = [ + "union", + "serde", +], default-features = false } +futures = { version = "0.3.30", default-features = false, features = ["alloc"] } +dhat = { version = "0.3.2", optional = true, default-features = false } mimalloc = { version = "0.1.38", default-features = false } -async-once-cell = {version="0.5.3", default-features=false} -actix-governor = {version="0.5.0", default-features=false} -mini-moka = { version="0.10", optional = true, default-features=false, features=["sync"]} -async-compression = { version = "0.4.6", default-features = false, features=["brotli","tokio"], optional=true} -chacha20poly1305={version="0.10.1", default-features=false, features=["alloc","getrandom"], optional=true} -chacha20 = {version="0.9.1", default-features=false, optional=true} -base64 = {version="0.21.5", default-features=false, features=["std"], optional=true} -cfg-if = {version="1.0.0", default-features=false,optional=true} +async-once-cell = { version = "0.5.3", default-features = false } +actix-governor = { version = "0.5.0", default-features = false } +mini-moka = { version = "0.10", optional = true, default-features = false, features = [ + "sync", +] } +async-compression = { version = "0.4.6", default-features = false, features = [ + "brotli", + "tokio", +], optional = true } +chacha20poly1305 = { version = "0.10.1", default-features = false, features = [ + "alloc", + "getrandom", +], optional = true } +chacha20 = { version = "0.9.1", default-features = false, optional = true } +base64 = { version = "0.21.5", default-features = false, features = [ + "std", +], optional = true } +cfg-if = { version = "1.0.0", default-features = false, optional = true } +keyword_extraction = { version = "1.3.0", default-features = false, features = [ + "tf_idf", + + +] } + +stop-words = "0.8.0" +thesaurus = { version = "0.5.2", default-features = false, optional = true, features = [ + "wordnet", + "static", +] } [dev-dependencies] -rusty-hook = {version="^0.11.2", default-features=false} -criterion = {version="0.5.1", default-features=false} -tempfile = {version="3.10.1", default-features=false} +rusty-hook = { version = "^0.11.2", default-features = false } +criterion = { version = "0.5.1", default-features = false } +tempfile = { version = "3.10.1", default-features = false } [build-dependencies] -lightningcss = {version="1.0.0-alpha.52", default-features=false, features=["grid"]} -minify-js = {version="0.6.0", default-features=false} +lightningcss = { version = "1.0.0-alpha.52", default-features = false, features = [ + "grid", +] } +minify-js = { version = "0.6.0", default-features = false } [profile.dev] opt-level = 0 @@ -81,10 +130,11 @@ strip = "debuginfo" [features] default = ["memory-cache"] -dhat-heap = ["dep:dhat"] +dhat-heap = ["dep:dhat"] memory-cache = ["dep:mini-moka"] -redis-cache = ["dep:redis","dep:base64"] -compress-cache-results = ["dep:async-compression","dep:cfg-if"] -encrypt-cache-results = ["dep:chacha20poly1305","dep:chacha20"] -cec-cache-results = ["compress-cache-results","encrypt-cache-results"] +redis-cache = ["dep:redis", "dep:base64"] +compress-cache-results = ["dep:async-compression", "dep:cfg-if"] +encrypt-cache-results = ["dep:chacha20poly1305", "dep:chacha20"] +cec-cache-results = ["compress-cache-results", "encrypt-cache-results"] 
experimental-io-uring = ["actix-web/experimental-io-uring"] +use-synonyms-search = ["dep:thesaurus"] diff --git a/src/models/aggregation_models.rs b/src/models/aggregation_models.rs index 6be3958..d4146da 100644 --- a/src/models/aggregation_models.rs +++ b/src/models/aggregation_models.rs @@ -4,12 +4,13 @@ use super::engine_models::EngineError; use serde::{Deserialize, Serialize}; use smallvec::SmallVec; - +#[cfg(feature = "use-synonyms-search")] +use thesaurus::synonyms; /// A named struct to store the raw scraped search results scraped search results from the /// upstream search engines before aggregating it.It derives the Clone trait which is needed /// to write idiomatic rust using `Iterators`. /// (href url in html in simple words). -#[derive(Clone, Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize, Debug)] #[serde(rename_all = "camelCase")] pub struct SearchResult { /// The title of the search result. @@ -20,6 +21,8 @@ pub struct SearchResult { pub description: String, /// The names of the upstream engines from which this results were provided. pub engine: SmallVec<[String; 0]>, + /// The td-tdf score of the result in regards to the title, url and description and the user's query + pub relevance_score: f32, } impl SearchResult { @@ -38,6 +41,7 @@ impl SearchResult { url: url.to_owned(), description: description.to_owned(), engine: engine.iter().map(|name| name.to_string()).collect(), + relevance_score: 0.0, } } @@ -58,6 +62,46 @@ impl SearchResult { pub fn engine(&mut self) -> String { std::mem::take(&mut self.engine[0]) } + + /// calculates and update the relevance score of the current search. + + /// # Arguments + /// + /// * query - the query string used to obtain the results + /// + /// + + pub fn calculate_relevance(&mut self, query: &str) { + use stop_words::{get, LANGUAGE}; + // when language settings can change to any of the ones supported on this crate: https://docs.rs/crate/stop-words/0.8.0 + let documents = [ + self.title.clone(), + self.url.clone(), + self.description.clone(), + ]; + + let stop_words = get(LANGUAGE::English); + let punctuation = [ + ".".to_owned(), + ",".to_owned(), + ":".to_owned(), + ";".to_owned(), + "!".to_owned(), + "?".to_owned(), + "(".to_owned(), + ")".to_owned(), + "[".to_owned(), + "]".to_owned(), + "{".to_owned(), + "}".to_owned(), + "\"".to_owned(), + "'".to_owned(), + "<".to_owned(), + ">".to_owned(), + ]; + + self.relevance_score = calculate_tf_idf(query, &documents, &stop_words, &punctuation); + } } /// A named struct that stores the error info related to the upstream search engines. @@ -182,3 +226,54 @@ impl SearchResults { self.no_engines_selected = true; } } + +/// Helper function to calculate the tf-idf for the search query. +///
The approach is as described on [`Wikipedia`](https://en.wikipedia.org/wiki/Tf%E2%80%93idf). +///
Find a sample article about TF-IDF [`here`](https://medium.com/analytics-vidhya/tf-idf-term-frequency-technique-easiest-explanation-for-text-classification-in-nlp-with-code-8ca3912e58c3) +/// ### Arguments +/// * `query` - a user's search query +/// * `documents` - a list of text used for comparision (url, title, description) +/// * `stop_words` - A list of language specific stop words. +/// * `punctuation` - list of punctuation symbols. +/// ### Returns +/// * `score` - The average tf-idf score of the word tokens (and synonyms) in the query +fn calculate_tf_idf( + query: &str, + documents: &[String], + stop_words: &[String], + punctuation: &[String], +) -> f32 { + use keyword_extraction::{ + tf_idf::{TfIdf, TfIdfParams}, + tokenizer::Tokenizer, + }; + + let params = TfIdfParams::UnprocessedDocuments(documents, stop_words, Some(punctuation)); + let tf_idf = TfIdf::new(params); + let tokener = Tokenizer::new(query, stop_words, Some(punctuation)); + let query_tokens = tokener.split_into_words(); + let mut search_tokens = vec![]; + + for token in query_tokens { + #[cfg(feature = "use-synonyms-search")] + { + // find some synonyms and add them to the search (from wordnet or moby if feature is enabled) + let synonyms = synonyms(&token); + search_tokens.extend(synonyms) + } + search_tokens.push(token); + } + + let mut total_score = 0.0f32; + for token in search_tokens.iter() { + total_score += tf_idf.get_score(token); + } + + let result = total_score / (search_tokens.len() as f32); + + if result.is_nan() { + 0.0 + } else { + result + } +} diff --git a/src/results/aggregator.rs b/src/results/aggregator.rs index b46befd..b929d45 100644 --- a/src/results/aggregator.rs +++ b/src/results/aggregator.rs @@ -182,7 +182,17 @@ pub async fn aggregate( drop(blacklist_map); } - let results: Vec = result_map.iter().map(|(_, value)| value.clone()).collect(); + let mut results: Vec = result_map + .iter() + .map(|(_, value)| { + let mut copy = value.clone(); + if !copy.url.contains("temu.com") { + copy.calculate_relevance(query.as_str()) + } + copy + }) + .collect(); + sort_search_results(&mut results); Ok(SearchResults::new(results, &engine_errors_info)) } @@ -232,6 +242,21 @@ pub async fn filter_with_lists( Ok(()) } +/// Sorts SearchResults by relevance score. +///
sort_unstable is used as its faster,stability is not an issue on our side. +/// For reasons why, check out [`this`](https://rust-lang.github.io/rfcs/1884-unstable-sort.html) +/// # Arguments +/// * `results` - A mutable slice or Vec of SearchResults +/// +fn sort_search_results(results: &mut [SearchResult]) { + results.sort_unstable_by(|a, b| { + use std::cmp::Ordering; + + b.relevance_score + .partial_cmp(&a.relevance_score) + .unwrap_or(Ordering::Less) + }) +} #[cfg(test)] mod tests { use super::*; @@ -251,6 +276,7 @@ mod tests { description: "This domain is for use in illustrative examples in documents." .to_owned(), engine: smallvec!["Google".to_owned(), "Bing".to_owned()], + relevance_score: 0.0, }, )); map_to_be_filtered.push(( @@ -260,6 +286,7 @@ mod tests { url: "https://www.rust-lang.org/".to_owned(), description: "A systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.".to_owned(), engine: smallvec!["Google".to_owned(), "DuckDuckGo".to_owned()], + relevance_score:0.0 },) ); @@ -300,6 +327,7 @@ mod tests { description: "This domain is for use in illustrative examples in documents." .to_owned(), engine: smallvec!["Google".to_owned(), "Bing".to_owned()], + relevance_score: 0.0, }, )); map_to_be_filtered.push(( @@ -309,6 +337,7 @@ mod tests { url: "https://www.rust-lang.org/".to_owned(), description: "A systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.".to_owned(), engine: smallvec!["Google".to_owned(), "DuckDuckGo".to_owned()], + relevance_score:0.0 }, )); @@ -365,6 +394,7 @@ mod tests { description: "This domain is for use in illustrative examples in documents." .to_owned(), engine: smallvec!["Google".to_owned(), "Bing".to_owned()], + relevance_score: 0.0, }, )); From b87c4604363683d873d401b1a820ff6d382e408f Mon Sep 17 00:00:00 2001 From: Spencerjibz Date: Wed, 20 Mar 2024 18:11:06 +0000 Subject: [PATCH 2/2] fix conflicts --- src/models/aggregation_models.rs | 39 ++++++++++++++++---------------- src/results/aggregator.rs | 4 ++-- 2 files changed, 21 insertions(+), 22 deletions(-) diff --git a/src/models/aggregation_models.rs b/src/models/aggregation_models.rs index d4146da..bd489c4 100644 --- a/src/models/aggregation_models.rs +++ b/src/models/aggregation_models.rs @@ -40,29 +40,10 @@ impl SearchResult { title: title.to_owned(), url: url.to_owned(), description: description.to_owned(), - engine: engine.iter().map(|name| name.to_string()).collect(), relevance_score: 0.0, + engine: engine.iter().map(|name| name.to_string()).collect(), } } - - /// A function which adds the engine name provided as a string into a vector of strings. - /// - /// # Arguments - /// - /// * `engine` - Takes an engine name provided as a String. - pub fn add_engines(&mut self, engine: &str) { - self.engine.push(engine.to_owned()) - } - - /// A function which returns the engine name stored from the struct as a string. - /// - /// # Returns - /// - /// An engine name stored as a string from the struct. - pub fn engine(&mut self) -> String { - std::mem::take(&mut self.engine[0]) - } - /// calculates and update the relevance score of the current search. /// # Arguments @@ -102,6 +83,24 @@ impl SearchResult { self.relevance_score = calculate_tf_idf(query, &documents, &stop_words, &punctuation); } + + /// A function which adds the engine name provided as a string into a vector of strings. + /// + /// # Arguments + /// + /// * `engine` - Takes an engine name provided as a String. 
+ pub fn add_engines(&mut self, engine: &str) { + self.engine.push(engine.to_owned()) + } + + /// A function which returns the engine name stored from the struct as a string. + /// + /// # Returns + /// + /// An engine name stored as a string from the struct. + pub fn engine(&mut self) -> String { + std::mem::take(&mut self.engine[0]) + } } /// A named struct that stores the error info related to the upstream search engines. diff --git a/src/results/aggregator.rs b/src/results/aggregator.rs index b929d45..11b9895 100644 --- a/src/results/aggregator.rs +++ b/src/results/aggregator.rs @@ -8,6 +8,7 @@ use crate::models::{ aggregation_models::{EngineErrorInfo, SearchResult, SearchResults}, engine_models::{EngineError, EngineHandler}, }; + use error_stack::Report; use futures::stream::FuturesUnordered; use regex::Regex; @@ -241,7 +242,6 @@ pub async fn filter_with_lists( Ok(()) } - /// Sorts SearchResults by relevance score. ///
sort_unstable is used as it's faster; stability is not an issue on our side. /// For reasons why, check out [`this`](https://rust-lang.github.io/rfcs/1884-unstable-sort.html) /// # Arguments /// * `results` - A mutable slice or Vec of SearchResults /// fn sort_search_results(results: &mut [SearchResult]) { @@ -275,8 +275,8 @@ mod tests { url: "https://www.example.com".to_owned(), description: "This domain is for use in illustrative examples in documents." .to_owned(), - engine: smallvec!["Google".to_owned(), "Bing".to_owned()], relevance_score: 0.0, + engine: smallvec!["Google".to_owned(), "Bing".to_owned()], }, )); map_to_be_filtered.push((
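Below is a minimal sketch of an additional unit test that could sit next to the existing cases in the `mod tests` block of `src/results/aggregator.rs`, to pin down the ordering contract of the new `sort_search_results` helper. It is not part of the patch: the struct fields, the helper name, and the `smallvec!` usage are taken from the diff above (the tests module already uses that macro), while the titles, URLs, and score values are invented purely for illustration.

```rust
#[test]
fn test_sort_search_results_by_descending_relevance() {
    // Hypothetical scores; only their relative order matters for this test.
    let mut results = vec![
        SearchResult {
            title: "Example Domain".to_owned(),
            url: "https://www.example.com".to_owned(),
            description: "This domain is for use in illustrative examples in documents."
                .to_owned(),
            relevance_score: 0.12,
            engine: smallvec!["Google".to_owned()],
        },
        SearchResult {
            title: "Rust Programming Language".to_owned(),
            url: "https://www.rust-lang.org/".to_owned(),
            description: "A language empowering everyone to build reliable software."
                .to_owned(),
            relevance_score: 0.87,
            engine: smallvec!["DuckDuckGo".to_owned()],
        },
    ];

    sort_search_results(&mut results);

    // The comparator reverses the operands (b against a), so the sort is descending:
    // the highest-scoring result should come first.
    assert_eq!(results[0].url, "https://www.rust-lang.org/");
    assert_eq!(results[1].url, "https://www.example.com");
}
```

The descending order matches the intent of calling `sort_search_results` right before `SearchResults::new` in `aggregate`, presumably so the most relevant results are rendered first.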