websurfx/src/engines/searx.rs

//! The `searx` module handles the scraping of results from the searx search engine instance
//! by querying the upstream searx search engine instance with user provided query and with a page
//! number if provided.

use reqwest::header::HeaderMap;
use scraper::Html;
use std::collections::HashMap;

use super::search_result_parser::SearchResultParser;
use crate::models::aggregation_models::SearchResult;
use crate::models::engine_models::{EngineError, SearchEngine};
use error_stack::{Report, Result, ResultExt};

/// A new Searx engine type defined in-order to implement the `SearchEngine` trait which allows to
/// reduce code duplication as well as allows to create vector of different search engines easily.
pub struct Searx {
    parser: SearchResultParser,
}

impl Searx {
    // new Searchx engine
    pub fn new() -> Result<Searx, EngineError> {
        Ok(Self {
            parser: SearchResultParser::new(
                "#urls>.dialog-error>p",
                ".result",
                "h3>a",
                "h3>a",
                ".content",
            )?,
        })
    }
}

#[async_trait::async_trait]
impl SearchEngine for Searx {
    async fn results(
        &self,
        query: &str,
        page: u32,
        user_agent: &str,
        request_timeout: u8,
        mut safe_search: u8,
    ) -> Result<HashMap<String, SearchResult>, EngineError> {
        // Page number can be missing or empty string and so appropriate handling is required
        // so that upstream server recieves valid page number.
        if safe_search == 3 {
            safe_search = 2;
        };

        let url: String = match page {
            0 | 1 => {
                format!("https://searx.work/search?q={query}&pageno=1&safesearch={safe_search}")
            }
            _ => format!(
                "https://searx.work/search?q={query}&pageno={page}&safesearch={safe_search}"
            ),
        };

        // initializing headers and adding appropriate headers.
        let header_map = HeaderMap::try_from(&HashMap::from([
            ("USER_AGENT".to_string(), user_agent.to_string()),
            ("REFERER".to_string(), "https://google.com/".to_string()),
            ("CONTENT_TYPE".to_string(), "application/x-www-form-urlencoded".to_string()),
            ("COOKIE".to_string(), "categories=general; language=auto; locale=en; autocomplete=duckduckgo; image_proxy=1; method=POST; safesearch=2; theme=simple; results_on_new_tab=1; doi_resolver=oadoi.org; simple_style=auto; center_alignment=1; query_in_title=1; infinite_scroll=0; disabled_engines=; enabled_engines=\"archive is__general\\054yep__general\\054curlie__general\\054currency__general\\054ddg definitions__general\\054wikidata__general\\054duckduckgo__general\\054tineye__general\\054lingva__general\\054startpage__general\\054yahoo__general\\054wiby__general\\054marginalia__general\\054alexandria__general\\054wikibooks__general\\054wikiquote__general\\054wikisource__general\\054wikiversity__general\\054wikivoyage__general\\054dictzone__general\\054seznam__general\\054mojeek__general\\054naver__general\\054wikimini__general\\054brave__general\\054petalsearch__general\\054goo__general\"; disabled_plugins=; enabled_plugins=\"searx.plugins.hostname_replace\\054searx.plugins.oa_doi_rewrite\\054searx.plugins.vim_hotkeys\"; tokens=; maintab=on; enginetab=on".to_string())
        ]))
        .change_context(EngineError::UnexpectedError)?;

        let document: Html = Html::parse_document(
            &Searx::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
        );

        if let Some(no_result_msg) = document.select(&self.parser.no_result).nth(1) {
            if no_result_msg.inner_html()
            == "we didn't find any results. Please use another query or search in more categories"
        {
            return Err(Report::new(EngineError::EmptyResultSet));
        }
        }

        // scrape all the results from the html
        Ok(document
            .select(&self.parser.results)
            .map(|result| {
                SearchResult::new(
                    result
                        .select(&self.parser.result_title)
                        .next()
                        .unwrap()
                        .inner_html()
                        .trim(),
                    result
                        .select(&self.parser.result_url)
                        .next()
                        .unwrap()
                        .value()
                        .attr("href")
                        .unwrap(),
                    result
                        .select(&self.parser.result_desc)
                        .next()
                        .unwrap()
                        .inner_html()
                        .trim(),
                    &["searx"],
                )
            })
            .map(|search_result| (search_result.url.clone(), search_result))
            .collect())
    }
}
updating and improving README.org 2023-04-27 09:06:59 -04:00			//! The `searx` module handles the scraping of results from the searx search engine instance
			`//! by querying the upstream searx search engine instance with user provided query and with a page`
			`//! number if provided.`

⚙️ refactor: add several optimizations to the engine code (#180)(#178) 2023-08-27 13:59:08 -04:00			`use reqwest::header::HeaderMap;`
Create separate search_result_parser 2023-09-24 07:54:08 -04:00			`use scraper::Html;`
optimise code for large scale server use and closes #7 2023-05-07 14:18:19 -04:00			`use std::collections::HashMap;`
initial commit 2023-04-22 07:35:07 -04:00
Create separate search_result_parser 2023-09-24 07:54:08 -04:00			`use super::search_result_parser::SearchResultParser;`
⚙️ refactor: reorganize code & restructure codebase for better maintainability (#207) 2023-09-03 13:50:50 -04:00			`use crate::models::aggregation_models::SearchResult;`
			`use crate::models::engine_models::{EngineError, SearchEngine};`
⚙️ refactor: add several optimizations to the engine code (#180)(#178) 2023-08-27 13:59:08 -04:00			`use error_stack::{Report, Result, ResultExt};`
chore: provide a better and more standardized way to handle engine errors 2023-05-31 12:54:51 -04:00
✨ feat: add documentation to code 2023-07-15 06:36:46 -04:00			/// A new Searx engine type defined in-order to implement the `SearchEngine` trait which allows to
			`/// reduce code duplication as well as allows to create vector of different search engines easily.`
Create separate search_result_parser 2023-09-24 07:54:08 -04:00			`pub struct Searx {`
			`parser: SearchResultParser,`
			`}`

			`impl Searx {`
			`// new Searchx engine`
			`pub fn new() -> Result<Searx, EngineError> {`
			`Ok(Self {`
			`parser: SearchResultParser::new(`
			`"#urls>.dialog-error>p",`
			`".result",`
			`"h3>a",`
			`"h3>a",`
			`".content",`
			`)?,`
			`})`
			`}`
			`}`
initial commit 2023-04-22 07:35:07 -04:00
✨ feat: rewrite code by implementing common engine trait `SearchEngine` 2023-07-11 12:42:17 -04:00			`#[async_trait::async_trait]`
			`impl SearchEngine for Searx {`
			`async fn results(`
			`&self,`
⚙️ refactor: add several optimizations to the engine code (#180)(#178) 2023-08-27 13:59:08 -04:00			`query: &str,`
✨ feat: rewrite code by implementing common engine trait `SearchEngine` 2023-07-11 12:42:17 -04:00			`page: u32,`
⚙️ refactor: add several optimizations to the engine code (#180)(#178) 2023-08-27 13:59:08 -04:00			`user_agent: &str,`
✨ feat: provide the functionality to use the new config option 2023-07-30 03:53:48 -04:00			`request_timeout: u8,`
✨ feat: pass the config option into the results function (#201) 2023-09-02 10:44:05 -04:00			`mut safe_search: u8,`
Improve aggregation Adds the EngineHandler struct Removes vulnerability where an attacker could send requests cookies with fake engine names and crash the server. Merged RawSearchResult and SearchResult, as they were functionally identical. 2023-08-18 04:43:53 -04:00			`) -> Result<HashMap<String, SearchResult>, EngineError> {`
✨ feat: rewrite code by implementing common engine trait `SearchEngine` 2023-07-11 12:42:17 -04:00			`// Page number can be missing or empty string and so appropriate handling is required`
			`// so that upstream server recieves valid page number.`
✨ feat: pass the config option into the results function (#201) 2023-09-02 10:44:05 -04:00			`if safe_search == 3 {`
			`safe_search = 2;`
			`};`

🛠️ fix: add & improve code to handle page handling in searx & duckduckgo 2023-07-30 13:14:40 -04:00			`let url: String = match page {`
✨ feat: pass the config option into the results function (#201) 2023-09-02 10:44:05 -04:00			`0 \| 1 => {`
			`format!("https://searx.work/search?q={query}&pageno=1&safesearch={safe_search}")`
			`}`
			`_ => format!(`
			`"https://searx.work/search?q={query}&pageno={page}&safesearch={safe_search}"`
			`),`
🛠️ fix: add & improve code to handle page handling in searx & duckduckgo 2023-07-30 13:14:40 -04:00			`};`
initial commit 2023-04-22 07:35:07 -04:00
✨ feat: rewrite code by implementing common engine trait `SearchEngine` 2023-07-11 12:42:17 -04:00			`// initializing headers and adding appropriate headers.`
⚙️ refactor: add several optimizations to the engine code (#180)(#178) 2023-08-27 13:59:08 -04:00			`let header_map = HeaderMap::try_from(&HashMap::from([`
			`("USER_AGENT".to_string(), user_agent.to_string()),`
			`("REFERER".to_string(), "https://google.com/".to_string()),`
			`("CONTENT_TYPE".to_string(), "application/x-www-form-urlencoded".to_string()),`
			("COOKIE".to_string(), "categories=general; language=auto; locale=en; autocomplete=duckduckgo; image_proxy=1; method=POST; safesearch=2; theme=simple; results_on_new_tab=1; doi_resolver=oadoi.org; simple_style=auto; center_alignment=1; query_in_title=1; infinite_scroll=0; disabled_engines=; enabled_engines=\"archive is__general\\054yep__general\\054curlie__general\\054currency__general\\054ddg definitions__general\\054wikidata__general\\054duckduckgo__general\\054tineye__general\\054lingva__general\\054startpage__general\\054yahoo__general\\054wiby__general\\054marginalia__general\\054alexandria__general\\054wikibooks__general\\054wikiquote__general\\054wikisource__general\\054wikiversity__general\\054wikivoyage__general\\054dictzone__general\\054seznam__general\\054mojeek__general\\054naver__general\\054wikimini__general\\054brave__general\\054petalsearch__general\\054goo__general\"; disabled_plugins=; enabled_plugins=\"searx.plugins.hostname_replace\\054searx.plugins.oa_doi_rewrite\\054searx.plugins.vim_hotkeys\"; tokens=; maintab=on; enginetab=on".to_string())
			`]))`
			`.change_context(EngineError::UnexpectedError)?;`
chore: provide a better and more standardized way to handle engine errors 2023-05-31 12:54:51 -04:00
✨ feat: provide the functionality to use the new config option 2023-07-30 03:53:48 -04:00			`let document: Html = Html::parse_document(`
⚙️ refactor: add several optimizations to the engine code (#180)(#178) 2023-08-27 13:59:08 -04:00			`&Searx::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,`
✨ feat: provide the functionality to use the new config option 2023-07-30 03:53:48 -04:00			`);`
chore: provide a better and more standardized way to handle engine errors 2023-05-31 12:54:51 -04:00
Create separate search_result_parser 2023-09-24 07:54:08 -04:00			`if let Some(no_result_msg) = document.select(&self.parser.no_result).nth(1) {`
✨ feat: rewrite code by implementing common engine trait `SearchEngine` 2023-07-11 12:42:17 -04:00			`if no_result_msg.inner_html()`
chore: provide a better and more standardized way to handle engine errors 2023-05-31 12:54:51 -04:00			`== "we didn't find any results. Please use another query or search in more categories"`
			`{`
improve error handling by using `error-stack` crate 2023-06-14 08:42:30 -04:00			`return Err(Report::new(EngineError::EmptyResultSet));`
chore: provide a better and more standardized way to handle engine errors 2023-05-31 12:54:51 -04:00			`}`
✨ feat: rewrite code by implementing common engine trait `SearchEngine` 2023-07-11 12:42:17 -04:00			`}`
chore: provide a better and more standardized way to handle engine errors 2023-05-31 12:54:51 -04:00
✨ feat: rewrite code by implementing common engine trait `SearchEngine` 2023-07-11 12:42:17 -04:00			`// scrape all the results from the html`
			`Ok(document`
Create separate search_result_parser 2023-09-24 07:54:08 -04:00			`.select(&self.parser.results)`
✨ feat: rewrite code by implementing common engine trait `SearchEngine` 2023-07-11 12:42:17 -04:00			`.map(\|result\| {`
Improve aggregation Adds the EngineHandler struct Removes vulnerability where an attacker could send requests cookies with fake engine names and crash the server. Merged RawSearchResult and SearchResult, as they were functionally identical. 2023-08-18 04:43:53 -04:00			`SearchResult::new(`
✨ feat: rewrite code by implementing common engine trait `SearchEngine` 2023-07-11 12:42:17 -04:00			`result`
Create separate search_result_parser 2023-09-24 07:54:08 -04:00			`.select(&self.parser.result_title)`
✨ feat: rewrite code by implementing common engine trait `SearchEngine` 2023-07-11 12:42:17 -04:00			`.next()`
			`.unwrap()`
			`.inner_html()`
⚙️ refactor: add several optimizations to the engine code (#180)(#178) 2023-08-27 13:59:08 -04:00			`.trim(),`
✨ feat: rewrite code by implementing common engine trait `SearchEngine` 2023-07-11 12:42:17 -04:00			`result`
Create separate search_result_parser 2023-09-24 07:54:08 -04:00			`.select(&self.parser.result_url)`
✨ feat: rewrite code by implementing common engine trait `SearchEngine` 2023-07-11 12:42:17 -04:00			`.next()`
			`.unwrap()`
			`.value()`
			`.attr("href")`
⚙️ refactor: add several optimizations to the engine code (#180)(#178) 2023-08-27 13:59:08 -04:00			`.unwrap(),`
✨ feat: rewrite code by implementing common engine trait `SearchEngine` 2023-07-11 12:42:17 -04:00			`result`
Create separate search_result_parser 2023-09-24 07:54:08 -04:00			`.select(&self.parser.result_desc)`
✨ feat: rewrite code by implementing common engine trait `SearchEngine` 2023-07-11 12:42:17 -04:00			`.next()`
			`.unwrap()`
			`.inner_html()`
⚙️ refactor: add several optimizations to the engine code (#180)(#178) 2023-08-27 13:59:08 -04:00			`.trim(),`
			`&["searx"],`
✨ feat: rewrite code by implementing common engine trait `SearchEngine` 2023-07-11 12:42:17 -04:00			`)`
			`})`
Improve Aggregation function & config parser Refactor aggregation function Rename visiting_url to url, as they are always the same (see upstream engine scalping). Refactor parsing function to be more readable. 2023-08-17 16:48:20 -04:00			`.map(\|search_result\| (search_result.url.clone(), search_result))`
✨ feat: rewrite code by implementing common engine trait `SearchEngine` 2023-07-11 12:42:17 -04:00			`.collect())`
			`}`
initial commit 2023-04-22 07:35:07 -04:00			`}`