From bb50e8bb25777fc40c0dddf88edd0ca0abebb984 Mon Sep 17 00:00:00 2001 From: Spencer Date: Mon, 25 Mar 2024 09:16:49 +0000 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20Ranking=20of=20aggregated=20search?= =?UTF-8?q?=20results=20based=20on=20relevancy=20of=20the=20search=20resul?= =?UTF-8?q?t=20to=20the=20user's=20search=20query=20(#549)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add sorting by relevance and merge new changes * fix conflicts * Update src/models/aggregation_models.rs Co-authored-by: neon_arch * Update src/models/aggregation_models.rs Co-authored-by: neon_arch * Update Cargo.toml Co-authored-by: neon_arch * Update Cargo.toml Co-authored-by: neon_arch * Update Cargo.toml Co-authored-by: neon_arch * enable non-static-synonyms features --------- Co-authored-by: neon_arch --- Cargo.lock | 79 +++++++++++++++++++ Cargo.toml | 128 +++++++++++++++++++++---------- src/models/aggregation_models.rs | 98 ++++++++++++++++++++++- src/results/aggregator.rs | 32 +++++++- 4 files changed, 296 insertions(+), 41 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index df0d58f..faa6084 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -245,6 +245,12 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +[[package]] +name = "adler32" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234" + [[package]] name = "aead" version = "0.5.2" @@ -1840,6 +1846,16 @@ dependencies = [ "winapi-build", ] +[[package]] +name = "keyword_extraction" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c25710ba2c50e4762b267b7387a989d8d1a8235f5cf26cd84e34aac30b263140" +dependencies = [ + "regex", + "unicode-segmentation", +] + [[package]] name = "language-tags" version = "0.3.2" @@ -1858,6 
+1874,26 @@ version = "0.2.153" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" +[[package]] +name = "libflate" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ff4ae71b685bbad2f2f391fe74f6b7659a34871c08b210fdc039e43bee07d18" +dependencies = [ + "adler32", + "crc32fast", + "libflate_lz77", +] + +[[package]] +name = "libflate_lz77" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a52d3a8bfc85f250440e4424db7d857e241a3aebbbe301f3eb606ab15c39acbf" +dependencies = [ + "rle-decode-fast", +] + [[package]] name = "libmimalloc-sys" version = "0.1.35" @@ -3029,6 +3065,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "rle-decode-fast" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3582f63211428f83597b51b2ddb88e2a91a9d52d12831f9d08f5e624e8977422" + [[package]] name = "rustc-demangle" version = "0.1.23" @@ -3431,6 +3473,15 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" +[[package]] +name = "stop-words" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8500024d809de02ecbf998472b7bed3c4fca380df2be68917f6a473bdb28ddcc" +dependencies = [ + "serde_json", +] + [[package]] name = "string" version = "0.2.1" @@ -3607,6 +3658,25 @@ dependencies = [ "utf-8", ] +[[package]] +name = "thesaurus" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e33ea271e53da683cd3439c04ff3b734f3d6052ea33a65ec9e0fa89a4f96369" +dependencies = [ + "lazy_static", + "thesaurus-moby", +] + +[[package]] +name = "thesaurus-moby" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"28f7806d5dbe7d9b627e332f88269a014a6a1d40ec411d4ea66cb702aabce4cf" +dependencies = [ + "libflate", +] + [[package]] name = "thousands" version = "0.2.0" @@ -3984,6 +4054,12 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "unicode-segmentation" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202" + [[package]] name = "unicode-width" version = "0.1.11" @@ -4219,6 +4295,7 @@ dependencies = [ "error-stack", "fake-useragent", "futures 0.3.30", + "keyword_extraction", "lightningcss", "log", "maud", @@ -4234,7 +4311,9 @@ dependencies = [ "serde", "serde_json", "smallvec 1.13.1", + "stop-words", "tempfile", + "thesaurus", "tokio 1.36.0", ] diff --git a/Cargo.toml b/Cargo.toml index 6882fbf..fd6e873 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,45 +13,93 @@ bench = false path = "src/bin/websurfx.rs" [dependencies] -reqwest = {version="0.11.24", default-features=false, features=["rustls-tls","brotli", "gzip"]} -tokio = {version="1.32.0",features=["rt-multi-thread","macros", "fs", "io-util"], default-features = false} -serde = {version="1.0.196", default-features=false, features=["derive"]} -serde_json = {version="1.0.109", default-features=false} -maud = {version="0.25.0", default-features=false, features=["actix-web"]} -scraper = {version="0.18.1", default-features = false} -actix-web = {version="4.4.0", features = ["cookies", "macros", "compress-brotli"], default-features=false} -actix-files = {version="0.6.5", default-features=false} -actix-cors = {version="0.7.0", default-features=false} -fake-useragent = {version="0.1.3", default-features=false} -env_logger = {version="0.11.1", default-features=false} -log = {version="0.4.21", default-features=false} -mlua = {version="0.9.1", features=["luajit", "vendored"], default-features=false} -redis = {version="0.24.0", features=["tokio-comp","connection-manager"], default-features = false, optional = true} 
-blake3 = {version="1.5.0", default-features=false} -error-stack = {version="0.4.0", default-features=false, features=["std"]} -async-trait = {version="0.1.76", default-features=false} -regex = {version="1.9.4", features=["perf"], default-features = false} -smallvec = {version="1.13.1", features=["union", "serde"], default-features=false} -futures = {version="0.3.30", default-features=false, features=["alloc"]} -dhat = {version="0.3.2", optional = true, default-features=false} +reqwest = { version = "0.11.24", default-features = false, features = [ + "rustls-tls", + "brotli", + "gzip", +] } +tokio = { version = "1.32.0", features = [ + "rt-multi-thread", + "macros", + "fs", + "io-util", +], default-features = false } +serde = { version = "1.0.196", default-features = false, features = ["derive"] } +serde_json = { version = "1.0.109", default-features = false } +maud = { version = "0.25.0", default-features = false, features = [ + "actix-web", +] } +scraper = { version = "0.18.1", default-features = false } +actix-web = { version = "4.4.0", features = [ + "cookies", + "macros", + "compress-brotli", +], default-features = false } +actix-files = { version = "0.6.5", default-features = false } +actix-cors = { version = "0.7.0", default-features = false } +fake-useragent = { version = "0.1.3", default-features = false } +env_logger = { version = "0.11.1", default-features = false } +log = { version = "0.4.21", default-features = false } +mlua = { version = "0.9.1", features = [ + "luajit", + "vendored", +], default-features = false } +redis = { version = "0.24.0", features = [ + "tokio-comp", + "connection-manager", +], default-features = false, optional = true } +blake3 = { version = "1.5.0", default-features = false } +error-stack = { version = "0.4.0", default-features = false, features = [ + "std", +] } +async-trait = { version = "0.1.76", default-features = false } +regex = { version = "1.9.4", features = ["perf"], default-features = false } +smallvec = { version = 
"1.13.1", features = [ + "union", + "serde", +], default-features = false } +futures = { version = "0.3.30", default-features = false, features = ["alloc"] } +dhat = { version = "0.3.2", optional = true, default-features = false } mimalloc = { version = "0.1.38", default-features = false } -async-once-cell = {version="0.5.3", default-features=false} -actix-governor = {version="0.5.0", default-features=false} -mini-moka = { version="0.10", optional = true, default-features=false, features=["sync"]} -async-compression = { version = "0.4.6", default-features = false, features=["brotli","tokio"], optional=true} -chacha20poly1305={version="0.10.1", default-features=false, features=["alloc","getrandom"], optional=true} -chacha20 = {version="0.9.1", default-features=false, optional=true} -base64 = {version="0.21.5", default-features=false, features=["std"], optional=true} -cfg-if = {version="1.0.0", default-features=false,optional=true} +async-once-cell = { version = "0.5.3", default-features = false } +actix-governor = { version = "0.5.0", default-features = false } +mini-moka = { version = "0.10", optional = true, default-features = false, features = [ + "sync", +] } +async-compression = { version = "0.4.6", default-features = false, features = [ + "brotli", + "tokio", +], optional = true } +chacha20poly1305 = { version = "0.10.1", default-features = false, features = [ + "alloc", + "getrandom", +], optional = true } +chacha20 = { version = "0.9.1", default-features = false, optional = true } +base64 = { version = "0.21.5", default-features = false, features = [ + "std", +], optional = true } +cfg-if = { version = "1.0.0", default-features = false, optional = true } +keyword_extraction = { version = "1.3.0", default-features = false, features = [ + "tf_idf", + + +] } + +stop-words = { version = "0.8.0", default-features = false, features = ["iso"] } +thesaurus = { version = "0.5.2", default-features = false, optional = true, features = [ + "moby", +] } 
[dev-dependencies] -rusty-hook = {version="^0.11.2", default-features=false} -criterion = {version="0.5.1", default-features=false} -tempfile = {version="3.10.1", default-features=false} +rusty-hook = { version = "^0.11.2", default-features = false } +criterion = { version = "0.5.1", default-features = false } +tempfile = { version = "3.10.1", default-features = false } [build-dependencies] -lightningcss = {version="1.0.0-alpha.52", default-features=false, features=["grid"]} -minify-js = {version="0.6.0", default-features=false} +lightningcss = { version = "1.0.0-alpha.52", default-features = false, features = [ + "grid", +] } +minify-js = { version = "0.6.0", default-features = false } [profile.dev] opt-level = 0 @@ -80,11 +128,13 @@ rpath = false strip = "symbols" [features] +use-synonyms-search = ["thesaurus/static"] default = ["memory-cache"] -dhat-heap = ["dep:dhat"] +dhat-heap = ["dep:dhat"] memory-cache = ["dep:mini-moka"] -redis-cache = ["dep:redis","dep:base64"] -compress-cache-results = ["dep:async-compression","dep:cfg-if"] -encrypt-cache-results = ["dep:chacha20poly1305","dep:chacha20"] -cec-cache-results = ["compress-cache-results","encrypt-cache-results"] +redis-cache = ["dep:redis", "dep:base64"] +compress-cache-results = ["dep:async-compression", "dep:cfg-if"] +encrypt-cache-results = ["dep:chacha20poly1305", "dep:chacha20"] +cec-cache-results = ["compress-cache-results", "encrypt-cache-results"] experimental-io-uring = ["actix-web/experimental-io-uring"] +use-non-static-synonyms-search = ["thesaurus"] diff --git a/src/models/aggregation_models.rs b/src/models/aggregation_models.rs index 6be3958..c046b1c 100644 --- a/src/models/aggregation_models.rs +++ b/src/models/aggregation_models.rs @@ -4,7 +4,11 @@ use super::engine_models::EngineError; use serde::{Deserialize, Serialize}; use smallvec::SmallVec; - +#[cfg(any( + feature = "use-synonyms-search", + feature = "use-non-static-synonyms-search" +))] +use thesaurus::synonyms; /// A named struct to 
store the raw scraped search results scraped search results from the /// upstream search engines before aggregating it.It derives the Clone trait which is needed /// to write idiomatic rust using `Iterators`. @@ -20,6 +24,8 @@ pub struct SearchResult { pub description: String, /// The names of the upstream engines from which this results were provided. pub engine: SmallVec<[String; 0]>, + /// The tf-idf score of the result in regards to the title, url and description and the user's query + pub relevance_score: f32, } impl SearchResult { @@ -37,9 +43,49 @@ impl SearchResult { title: title.to_owned(), url: url.to_owned(), description: description.to_owned(), + relevance_score: 0.0, engine: engine.iter().map(|name| name.to_string()).collect(), } } + /// Calculates and updates the relevance score of the current search result. + + /// # Arguments + /// + /// * query - the query string used to obtain the results + /// + /// + + pub fn calculate_relevance(&mut self, query: &str) { + use stop_words::{get, LANGUAGE}; + // when language settings can change to any of the ones supported on this crate: https://docs.rs/crate/stop-words/0.8.0 + let documents = [ + self.title.clone(), + self.url.clone(), + self.description.clone(), + ]; + + let stop_words = get(LANGUAGE::English); + let punctuation = [ + ".".to_owned(), + ",".to_owned(), + ":".to_owned(), + ";".to_owned(), + "!".to_owned(), + "?".to_owned(), + "(".to_owned(), + ")".to_owned(), + "[".to_owned(), + "]".to_owned(), + "{".to_owned(), + "}".to_owned(), + "\"".to_owned(), + "'".to_owned(), + "<".to_owned(), + ">".to_owned(), + ]; + + self.relevance_score = calculate_tf_idf(query, &documents, &stop_words, &punctuation); + } /// A function which adds the engine name provided as a string into a vector of strings. /// @@ -182,3 +228,53 @@ impl SearchResults { self.no_engines_selected = true; } } + +/// Helper function to calculate the tf-idf for the search query. +/// 
The approach is as described [`here`](https://en.wikipedia.org/wiki/Tf%E2%80%93idf). +/// 
Find a sample article about TF-IDF [`here`](https://medium.com/analytics-vidhya/tf-idf-term-frequency-technique-easiest-explanation-for-text-classification-in-nlp-with-code-8ca3912e58c3) +/// ### Arguments +/// * `query` - a user's search query +/// * `documents` - a list of text used for comparison (url, title, description) +/// * `stop_words` - A list of language specific stop words. +/// * `punctuation` - list of punctuation symbols. +/// ### Returns +/// * `score` - The average tf-idf score of the word tokens (and synonyms) in the query, or `0.0` if that average is NaN (e.g. the query yields no tokens) +fn calculate_tf_idf( + query: &str, + documents: &[String], + stop_words: &[String], + punctuation: &[String], +) -> f32 { + use keyword_extraction::{ + tf_idf::{TfIdf, TfIdfParams}, + tokenizer::Tokenizer, + }; + + let params = TfIdfParams::UnprocessedDocuments(documents, stop_words, Some(punctuation)); + let tf_idf = TfIdf::new(params); + let tokener = Tokenizer::new(query, stop_words, Some(punctuation)); + let query_tokens = tokener.split_into_words(); + let mut search_tokens = vec![]; + + for token in query_tokens { + #[cfg(any( + feature = "use-synonyms-search", + feature = "use-non-static-synonyms-search" + ))] + { + // find some synonyms and add them to the search (from wordnet or moby if feature is enabled) + let synonyms = synonyms(&token); + search_tokens.extend(synonyms) + } + search_tokens.push(token); + } + + let mut total_score = 0.0f32; + for token in search_tokens.iter() { + total_score += tf_idf.get_score(token); + } + + let result = total_score / (search_tokens.len() as f32); + // With no tokens this is 0.0 / 0.0 = NaN, and multiplying NaN by 0.0 is still NaN, so branch explicitly. + if result.is_nan() { 0.0 } else { result } +} diff --git a/src/results/aggregator.rs b/src/results/aggregator.rs index d827b94..9044307 100644 --- a/src/results/aggregator.rs +++ b/src/results/aggregator.rs @@ -8,6 +8,7 @@ use crate::models::{ aggregation_models::{EngineErrorInfo, SearchResult, SearchResults}, engine_models::{EngineError, EngineHandler}, }; + use error_stack::Report; use futures::stream::FuturesUnordered; use regex::Regex; 
@@ -184,7 +185,17 @@ pub async fn aggregate( drop(blacklist_map); } - let results: Vec = result_map.iter().map(|(_, value)| value.clone()).collect(); + let mut results: Vec = result_map + .iter() + .map(|(_, value)| { + let mut copy = value.clone(); + if !copy.url.contains("temu.com") { + copy.calculate_relevance(query.as_str()) + } + copy + }) + .collect(); + sort_search_results(&mut results); Ok(SearchResults::new(results, &engine_errors_info)) } @@ -233,7 +244,21 @@ pub async fn filter_with_lists( Ok(()) } +/// Sorts SearchResults by relevance score. +///
sort_unstable is used as it is faster; stability is not an issue on our side. +/// For reasons why, check out [`this`](https://rust-lang.github.io/rfcs/1884-unstable-sort.html) +/// # Arguments +/// * `results` - A mutable slice or Vec of SearchResults +/// +fn sort_search_results(results: &mut [SearchResult]) { + results.sort_unstable_by(|a, b| { + // `total_cmp` is a total order even for NaN scores, which `sort_unstable_by` requires. + // `b` is compared against `a` so that the highest score sorts first. + b.relevance_score + .total_cmp(&a.relevance_score) + }) +} #[cfg(test)] mod tests { use super::*; @@ -252,6 +277,7 @@ mod tests { url: "https://www.example.com".to_owned(), description: "This domain is for use in illustrative examples in documents." .to_owned(), + relevance_score: 0.0, engine: smallvec!["Google".to_owned(), "Bing".to_owned()], }, )); @@ -262,6 +288,7 @@ mod tests { url: "https://www.rust-lang.org/".to_owned(), description: "A systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.".to_owned(), engine: smallvec!["Google".to_owned(), "DuckDuckGo".to_owned()], + relevance_score:0.0 },) ); @@ -302,6 +329,7 @@ mod tests { description: "This domain is for use in illustrative examples in documents." .to_owned(), engine: smallvec!["Google".to_owned(), "Bing".to_owned()], + relevance_score: 0.0, }, )); map_to_be_filtered.push(( @@ -311,6 +339,7 @@ mod tests { url: "https://www.rust-lang.org/".to_owned(), description: "A systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.".to_owned(), engine: smallvec!["Google".to_owned(), "DuckDuckGo".to_owned()], + relevance_score:0.0 }, )); @@ -367,6 +396,7 @@ mod tests { description: "This domain is for use in illustrative examples in documents." .to_owned(), engine: smallvec!["Google".to_owned(), "Bing".to_owned()], + relevance_score: 0.0, }, ));