From 2f83d5f13639efebf5db819136f3aebca825fe78 Mon Sep 17 00:00:00 2001 From: JMARyA Date: Sun, 29 Dec 2024 17:22:57 +0100 Subject: [PATCH] internal urls --- Cargo.lock | 22 +++++++++++----------- Cargo.toml | 2 +- src/archive.rs | 33 +++++++++++++++++++++++++++++---- 3 files changed, 41 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0f322b6..b8c2028 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3082,7 +3082,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "943aab3fdaaa029a6e0271b35ea10b72b943135afe9bffca82384098ad0e06a6" [[package]] -name = "watchdogs" +name = "web-sys" +version = "0.3.76" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04dd7223427d52553d3702c004d3b2fe07c148165faa56313cb00211e31c12bc" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "webarc" version = "0.1.0" dependencies = [ "based", @@ -3106,16 +3116,6 @@ dependencies = [ "walkdir", ] -[[package]] -name = "web-sys" -version = "0.3.76" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04dd7223427d52553d3702c004d3b2fe07c148165faa56313cb00211e31c12bc" -dependencies = [ - "js-sys", - "wasm-bindgen", -] - [[package]] name = "whoami" version = "1.5.2" diff --git a/Cargo.toml b/Cargo.toml index 7caeefd..5dc3b24 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "watchdogs" +name = "webarc" version = "0.1.0" edition = "2021" diff --git a/src/archive.rs b/src/archive.rs index 88327ac..149372d 100644 --- a/src/archive.rs +++ b/src/archive.rs @@ -4,6 +4,20 @@ use std::{ path::{Path, PathBuf}, }; +fn internalize_urls(input: &str) -> String { + let url_pattern = r"https?://([a-zA-Z0-9.-]+)(/[\w./-]*)"; + let re = regex::Regex::new(url_pattern).unwrap(); + + re.replace_all(input, |caps: &regex::Captures| { + format!( + "/s/{}/{}", + &caps[1].trim_start_matches("www."), // Domain + &caps[2] // Path + ) + }) + .to_string() +} + pub struct 
WebsiteArchive { pub dir: PathBuf, } @@ -58,8 +72,13 @@ impl Document { file_path = file_path.join(latest_version); - // TODO : Replace links with local ones - return std::fs::read_to_string(file_path).ok(); + let content = std::fs::read_to_string(file_path).ok()?; + + if std::env::var("ROUTE_INTERNAL").unwrap_or("false".to_string()) == "true" { + Some(internalize_urls(&content)) + } else { + Some(content) + } } pub fn versions(path: &PathBuf) -> Vec { @@ -94,7 +113,7 @@ impl WebsiteArchive { pub fn archive_url(&self, url: &str) { let parsed_url = url::Url::parse(url).unwrap(); - let domain = parsed_url.domain().unwrap().trim_start_matches("www"); + let domain = parsed_url.domain().unwrap().trim_start_matches("www."); let path = parsed_url.path(); let mut folder_name = self.dir.join(&domain); @@ -110,6 +129,8 @@ impl WebsiteArchive { let timestamp = chrono::Utc::now().format("%Y-%m-%d").to_string(); let filename = folder_name.join(&format!("index_{timestamp}.html")); + log::info!("Archiving {url} to {}", filename.to_str().unwrap()); + run_command(&vec![ "monolith", "-I", @@ -127,7 +148,11 @@ impl WebsiteArchive { fn run_command(cmd: &[&str]) { let mut cmd_setup = std::process::Command::new(cmd[0].clone()); - let cmd_setup = cmd_setup.args(cmd.into_iter().skip(1).collect::<Vec<_>>()); + let cmd_setup = cmd_setup + .args(cmd.into_iter().skip(1).collect::<Vec<_>>()) + .stdout(std::process::Stdio::inherit()) + .stderr(std::process::Stdio::inherit()); + let child = cmd_setup.spawn().unwrap(); let status = child.wait_with_output().unwrap();