updated Url::parse callsites to use the new utility function

epi 2023-04-25 07:09:56 -05:00
parent 3dd070a0db
commit f1fd2fc379
8 changed files with 54 additions and 35 deletions
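The commit doesn't spell out the motivation, but the helper's name points at it: `Url::parse` (the `url` crate, re-exported by `reqwest`) implements WHATWG normalization, which rewrites dot-segments in a path, including percent-encoded ones, before a request is ever built. For a scanner that wants to send paths exactly as given, that's lossy. A minimal sketch of the behavior in question, using only the public `url` crate API (example url is illustrative):

```rust
use url::Url; // reqwest re-exports this same type as reqwest::Url

fn main() {
    // WHATWG path normalization collapses dot-segments, even percent-encoded
    // ones, so a payload like /a/%2e%2e/b never reaches the wire intact
    let parsed = Url::parse("http://example.com/a/%2e%2e/b").unwrap();
    assert_eq!(parsed.path(), "/b");
}
```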

View File

@@ -2,12 +2,11 @@ use super::entry::BannerEntry;
 use crate::{
     config::Configuration,
     event_handlers::Handles,
-    utils::{logged_request, status_colorizer},
+    utils::{logged_request, parse_url_with_raw_path, status_colorizer},
     DEFAULT_IGNORED_EXTENSIONS, DEFAULT_METHOD, DEFAULT_STATUS_CODES, VERSION,
 };
 use anyhow::{bail, Result};
 use console::{style, Emoji};
-use reqwest::Url;
 use serde_json::Value;
 use std::{io::Write, sync::Arc};
@@ -478,7 +477,7 @@ by Ben "epi" Risher {} ver: {}"#,
     pub async fn check_for_updates(&mut self, url: &str, handles: Arc<Handles>) -> Result<()> {
         log::trace!("enter: needs_update({}, {:?})", url, handles);
 
-        let api_url = Url::parse(url)?;
+        let api_url = parse_url_with_raw_path(url)?;
 
         let result = logged_request(&api_url, DEFAULT_METHOD, None, handles.clone()).await?;
         let body = result.text().await?;

View File

@@ -6,7 +6,10 @@ use super::utils::{
 use crate::config::determine_output_level;
 use crate::config::utils::determine_requester_policy;
 use crate::{
-    client, parser, scan_manager::resume_scan, traits::FeroxSerialize, utils::fmt_err,
+    client, parser,
+    scan_manager::resume_scan,
+    traits::FeroxSerialize,
+    utils::{fmt_err, parse_url_with_raw_path},
     DEFAULT_CONFIG_NAME,
 };
 use anyhow::{anyhow, Context, Result};
@@ -673,7 +676,7 @@ impl Configuration {
         for denier in arg {
             // could be an absolute url or a regex, need to determine which and populate the
             // appropriate vector
-            match Url::parse(denier.trim_end_matches('/')) {
+            match parse_url_with_raw_path(denier.trim_end_matches('/')) {
                 Ok(absolute) => {
                     // denier is an absolute url and can be parsed as such
                     config.url_denylist.push(absolute);
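The match above leans on the parser's failure mode to tell absolute urls apart from regex patterns: absolute urls parse, anything else falls through to the pattern vector. A self-contained sketch of that disambiguation, with a hypothetical `Denier` enum standing in for the two config vectors and plain `Url::parse` standing in for the helper:

```rust
use regex::Regex;
use url::Url;

/// hypothetical stand-in for the two deny-list vectors populated above
enum Denier {
    Absolute(Url),
    Pattern(Regex),
}

fn classify(denier: &str) -> Result<Denier, regex::Error> {
    match Url::parse(denier.trim_end_matches('/')) {
        // parsed as an absolute url -> deny that url and its children
        Ok(absolute) => Ok(Denier::Absolute(absolute)),
        // not an absolute url -> treat the input as a regular expression
        Err(_) => Regex::new(denier).map(Denier::Pattern),
    }
}

fn main() {
    assert!(matches!(classify("https://example.com/admin/"), Ok(Denier::Absolute(_))));
    assert!(matches!(classify(".*admin.*"), Ok(Denier::Pattern(_))));
}
```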

View File

@@ -16,7 +16,7 @@ use crate::{
 use super::command::Command::AddToUsizeField;
 use super::*;
 use crate::statistics::StatField;
-use reqwest::Url;
+use crate::utils::parse_url_with_raw_path;
 use tokio::time::Duration;
 
 #[derive(Debug)]
@@ -325,7 +325,9 @@ impl ScanHandler {
             self.data.add_directory_scan(&target, order).1 // add the new target; return FeroxScan
         };
 
-        if should_test_deny && should_deny_url(&Url::parse(&target)?, self.handles.clone())? {
+        if should_test_deny
+            && should_deny_url(&parse_url_with_raw_path(&target)?, self.handles.clone())?
+        {
             // response was caught by a user-provided deny list
             // checking this last, since it's most susceptible to longer runtimes due to what
             // input is received

View File

@@ -11,7 +11,10 @@ use crate::{
         StatField::{LinksExtracted, TotalExpected},
     },
     url::FeroxUrl,
-    utils::{logged_request, make_request, send_try_recursion_command, should_deny_url},
+    utils::{
+        logged_request, make_request, parse_url_with_raw_path, send_try_recursion_command,
+        should_deny_url,
+    },
     ExtractionResult, DEFAULT_METHOD,
 };
 use anyhow::{bail, Context, Result};
@@ -122,7 +125,7 @@ impl<'a> Extractor<'a> {
     ) -> Result<()> {
         log::trace!("enter: parse_url_and_add_subpaths({:?})", links);
 
-        match Url::parse(url_to_parse) {
+        match parse_url_with_raw_path(url_to_parse) {
             Ok(absolute) => {
                 if absolute.domain() != original_url.domain()
                     || absolute.host() != original_url.host()
@@ -475,7 +478,7 @@ impl<'a> Extractor<'a> {
             ExtractionTarget::ResponseBody | ExtractionTarget::DirectoryListing => {
                 self.response.unwrap().url().clone()
             }
-            ExtractionTarget::RobotsTxt => match Url::parse(&self.url) {
+            ExtractionTarget::RobotsTxt => match parse_url_with_raw_path(&self.url) {
                 Ok(u) => u,
                 Err(e) => {
                     bail!("Could not parse {}: {}", self.url, e);
@@ -524,7 +527,7 @@ impl<'a> Extractor<'a> {
         for capture in self.robots_regex.captures_iter(body) {
             if let Some(new_path) = capture.name("url_path") {
-                let mut new_url = Url::parse(&self.url)?;
+                let mut new_url = parse_url_with_raw_path(&self.url)?;
                 new_url.set_path(new_path.as_str());
@@ -654,7 +657,7 @@ impl<'a> Extractor<'a> {
             &client
         };
 
-        let mut url = Url::parse(&self.url)?;
+        let mut url = parse_url_with_raw_path(&self.url)?;
         url.set_path(location); // overwrite existing path
 
         // purposefully not using logged_request here due to using the special client
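Both the robots.txt hunk and the redirect hunk above rebuild the request path with `Url::set_path`, which swaps out the entire path component while leaving scheme, host, and any query string untouched. A standalone illustration (example url, not from the repo):

```rust
use url::Url;

fn main() {
    let mut url = Url::parse("https://example.com/old/dir?q=1").unwrap();
    url.set_path("/robots.txt"); // replaces the whole path; query survives
    assert_eq!(url.as_str(), "https://example.com/robots.txt?q=1");
}
```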

View File

@@ -4,11 +4,10 @@ use crate::event_handlers::Handles;
 use crate::filters::similarity::SIM_HASHER;
 use crate::nlp::preprocess;
 use crate::response::FeroxResponse;
-use crate::utils::logged_request;
+use crate::utils::{logged_request, parse_url_with_raw_path};
 use crate::DEFAULT_METHOD;
 use anyhow::Result;
 use regex::Regex;
-use reqwest::Url;
 use std::sync::Arc;
 
 /// wrapper around logic necessary to create a SimilarityFilter
@@ -23,7 +22,7 @@ pub(crate) async fn create_similarity_filter(
     handles: Arc<Handles>,
 ) -> Result<SimilarityFilter> {
     // url as-is based on input, ignores user-specified url manipulation options (add-slash etc)
-    let url = Url::parse(similarity_filter)?;
+    let url = parse_url_with_raw_path(similarity_filter)?;
 
     // attempt to request the given url
     let resp = logged_request(&url, DEFAULT_METHOD, None, handles.clone()).await?;

View File

@@ -21,7 +21,7 @@ use crate::{
     event_handlers::{Command, Handles},
     traits::FeroxSerialize,
     url::FeroxUrl,
-    utils::{self, fmt_err, status_colorizer},
+    utils::{self, fmt_err, parse_url_with_raw_path, status_colorizer},
     CommandSender,
 };
@@ -140,7 +140,7 @@ impl FeroxResponse {
     /// Set `FeroxResponse`'s `url` attribute; has no effect if an error occurs
     pub fn set_url(&mut self, url: &str) {
-        match Url::parse(url) {
+        match parse_url_with_raw_path(url) {
             Ok(url) => {
                 self.url = url;
             }
@@ -599,7 +599,7 @@ impl<'de> Deserialize<'de> for FeroxResponse {
                 match key.as_str() {
                     "url" => {
                         if let Some(url) = value.as_str() {
-                            if let Ok(parsed) = Url::parse(url) {
+                            if let Ok(parsed) = parse_url_with_raw_path(url) {
                                 response.url = parsed;
                             }
                         }

View File

@ -1,3 +1,4 @@
use crate::utils::parse_url_with_raw_path;
use crate::{event_handlers::Handles, statistics::StatError::UrlFormat, Command::AddError};
use anyhow::{anyhow, bail, Result};
use reqwest::Url;
@ -142,19 +143,19 @@ impl FeroxUrl {
word = word.trim_start_matches('/').to_string();
};
let base_url = Url::parse(&url)?;
let joined = base_url.join(&word)?;
let base_url = parse_url_with_raw_path(&url)?;
let mut joined = base_url.join(&word)?;
if self.handles.config.queries.is_empty() {
// no query params to process
log::trace!("exit: format -> {}", joined);
Ok(joined)
} else {
let with_params =
Url::parse_with_params(joined.as_str(), &self.handles.config.queries)?;
log::trace!("exit: format_url -> {}", with_params);
Ok(with_params) // request with params attached
if !self.handles.config.queries.is_empty() {
// if called, this adds a '?' to the url, whether or not there are queries to be added
// so we need to check if there are queries to be added before blindly adding the '?'
joined
.query_pairs_mut()
.extend_pairs(self.handles.config.queries.iter());
}
log::trace!("exit: format_url -> {}", joined);
Ok(joined)
}
/// Simple helper to abstract away adding a forward-slash to a url if not present
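The guard on `queries.is_empty()` exists because, as the new comment notes, `query_pairs_mut` writes a query component, leading '?' included, as soon as it's used, even if nothing is appended. A minimal sketch of the guarded pattern, with an illustrative query list in place of `handles.config.queries`:

```rust
use url::Url;

fn main() {
    let queries = vec![("stats".to_string(), "true".to_string())];

    let mut joined = Url::parse("https://example.com/js/main.js").unwrap();
    if !queries.is_empty() {
        // only touch query_pairs_mut when there's something to append, so
        // urls without configured queries keep their exact shape
        joined.query_pairs_mut().extend_pairs(queries.iter());
    }

    assert_eq!(joined.as_str(), "https://example.com/js/main.js?stats=true");
}
```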
@@ -189,7 +190,7 @@ impl FeroxUrl {
         let target = self.normalize();
 
-        let parsed = Url::parse(&target)?;
+        let parsed = parse_url_with_raw_path(&target)?;
         let parts = parsed
             .path_segments()
             .ok_or_else(|| anyhow!("No path segments found"))?;
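The `ok_or_else` guard is there because `Url::path_segments` returns `None` for cannot-be-a-base urls (`mailto:`, `data:`, and the like) rather than an empty iterator. A standalone illustration:

```rust
use url::Url;

fn main() {
    let url = Url::parse("https://example.com/a/b/c").unwrap();
    let parts: Vec<_> = url.path_segments().unwrap().collect();
    assert_eq!(parts, ["a", "b", "c"]);

    // cannot-be-a-base urls have no path segments at all, hence the
    // ok_or_else guard in the hunk above
    let mail = Url::parse("mailto:user@example.com").unwrap();
    assert!(mail.path_segments().is_none());
}
```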

View File

@@ -425,9 +425,14 @@ fn should_deny_absolute(url_to_test: &Url, denier: &Url, handles: Arc<Handles>)
             // current deny-url, now we just need to check to see if this deny-url is a parent
             // to a scanned url that is also a parent of the given url
             for ferox_scan in handles.ferox_scans()?.get_active_scans() {
-                let scanner = Url::parse(ferox_scan.url().trim_end_matches('/'))
+                let scanner = parse_url_with_raw_path(ferox_scan.url().trim_end_matches('/'))
                     .with_context(|| format!("Could not parse {ferox_scan} as a url"))?;
 
+                // by calling the new parse_url_with_raw_path, and reaching this point without an
+                // error, we know we have an authority and therefore a host. leaving the code
+                // below, but we should never hit the else condition. leaving it in so if we find
+                // a case where i'm mistaken, we'll know about it and can address it
                 if let Some(scan_host) = scanner.host() {
                     // same domain/ip check we perform on the denier above
                     if tested_host != scan_host {
@@ -436,7 +441,7 @@ fn should_deny_absolute(url_to_test: &Url, denier: &Url, handles: Arc<Handles>)
                     }
                 } else {
                     // couldn't process .host from scanner
-                    continue;
+                    unreachable!("should_deny_absolute: scanner.host() returned None, which shouldn't be possible");
                 };
 
                 let scan_path = scanner.path();
@@ -487,7 +492,7 @@ pub fn should_deny_url(url: &Url, handles: Arc<Handles>) -> Result<bool> {
     // normalization for comparison is to remove the trailing / if one exists, this is done for
     // the given url and any url to which it's compared
-    let normed_url = Url::parse(url.to_string().trim_end_matches('/'))?;
+    let normed_url = parse_url_with_raw_path(url.to_string().trim_end_matches('/'))?;
 
     for denier in &handles.config.url_denylist {
         // note to self: it may seem as though we can use regex only for --dont-scan, however, in
@@ -555,6 +560,7 @@ pub fn parse_url_with_raw_path(url: &str) -> Result<Url> {
     if !parsed.has_authority() {
         // parsed correctly, but no authority, meaning mailto: or tel: or
         // some other url that we don't care about
+        println!("url to parse has no authority and is therefore invalid");
         bail!("url to parse has no authority and is therefore invalid");
     }
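This guard is what lets the earlier callsites assume a host exists after a successful parse: schemes like `mailto:` and `tel:` parse cleanly but carry no authority component. A quick check using the plain `url` crate:

```rust
use url::Url;

fn main() {
    // parses fine, but there's no authority (user/host/port) component
    let mail = Url::parse("mailto:epi@example.com").unwrap();
    assert!(!mail.has_authority());

    // http(s) urls always carry an authority, and therefore a host
    let http = Url::parse("https://example.com/").unwrap();
    assert!(http.has_authority());
}
```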
@@ -1004,6 +1010,13 @@ mod tests {
     /// provide a denier from which we can't check a host, which results in no comparison, expect false
     /// because the denier is a parent to the tested, even tho the scanned doesn't compare, it
     /// still returns true
+    ///
+    /// note: adding parse_url_with_raw_path changed the behavior of this test, it used to return
+    /// true, now it returns false. see my note in should_deny_absolute and the unreachable!
+    /// call block to see why
+    ///
+    /// leaving this test here to document the behavior change and to catch regressions in the
+    /// new expected behavior
     fn should_deny_url_doesnt_compare_non_domains_in_scanned() {
         let deny_url = "https://testdomain.com/";
         let scan_url = "unix:/run/foo.socket";
@@ -1017,8 +1030,7 @@
         let config = Arc::new(config);
         let handles = Arc::new(Handles::for_testing(Some(scans), Some(config)).0);
 
-        assert!(should_deny_url(&tested_url, handles).unwrap());
+        assert!(!should_deny_url(&tested_url, handles).unwrap());
     }
 
     #[test]