internal urls
Some checks failed
ci/woodpecker/push/build Pipeline failed

This commit is contained in:
JMARyA 2024-12-29 17:22:57 +01:00
parent d8ca94bd0b
commit 2f83d5f136
Signed by: jmarya
GPG key ID: 901B2ADDF27C2263
3 changed files with 41 additions and 16 deletions

22
Cargo.lock generated
View file

@ -3082,7 +3082,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "943aab3fdaaa029a6e0271b35ea10b72b943135afe9bffca82384098ad0e06a6"
[[package]]
name = "watchdogs"
name = "web-sys"
version = "0.3.76"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04dd7223427d52553d3702c004d3b2fe07c148165faa56313cb00211e31c12bc"
dependencies = [
"js-sys",
"wasm-bindgen",
]
[[package]]
name = "webarc"
version = "0.1.0"
dependencies = [
"based",
@ -3106,16 +3116,6 @@ dependencies = [
"walkdir",
]
[[package]]
name = "web-sys"
version = "0.3.76"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04dd7223427d52553d3702c004d3b2fe07c148165faa56313cb00211e31c12bc"
dependencies = [
"js-sys",
"wasm-bindgen",
]
[[package]]
name = "whoami"
version = "1.5.2"

View file

@ -1,5 +1,5 @@
[package]
name = "watchdogs"
name = "webarc"
version = "0.1.0"
edition = "2021"

View file

@ -4,6 +4,20 @@ use std::{
path::{Path, PathBuf},
};
/// Rewrites absolute `http`/`https` URLs found in `input` into
/// site-internal links of the form `/s/<domain>/<path>`.
///
/// Only URLs that carry a path component (at least a `/` after the host)
/// are matched; all other text is returned unchanged. Every leading `www.`
/// is trimmed from the host, the same trimming `archive_url` applies when it
/// derives the per-domain folder name.
///
/// # Panics
///
/// Panics if the hard-coded URL pattern fails to compile, which would be a
/// programming error rather than a runtime condition.
fn internalize_urls(input: &str) -> String {
    // Compile the pattern once and reuse it; this function runs on every
    // document read, and building a regex is comparatively expensive.
    static URL_RE: std::sync::OnceLock<regex::Regex> = std::sync::OnceLock::new();
    let re = URL_RE.get_or_init(|| {
        regex::Regex::new(r"https?://([a-zA-Z0-9.-]+)(/[\w./-]*)")
            .expect("hard-coded URL pattern is a valid regex")
    });

    re.replace_all(input, |caps: &regex::Captures| {
        let domain = caps[1].trim_start_matches("www.");
        // The captured path already starts with `/`, so it is appended
        // directly; inserting another separator would yield
        // `/s/<domain>//<path>`.
        let path = &caps[2];
        format!("/s/{domain}{path}")
    })
    .into_owned()
}
/// Handle to a website archive rooted at a directory.
pub struct WebsiteArchive {
/// Base directory of the archive; `archive_url` joins the URL's domain
/// onto this path to obtain the folder an archived page is written under.
pub dir: PathBuf,
}
@ -58,8 +72,13 @@ impl Document {
file_path = file_path.join(latest_version);
// TODO : Replace links with local ones
return std::fs::read_to_string(file_path).ok();
let content = std::fs::read_to_string(file_path).ok()?;
if std::env::var("ROUTE_INTERNAL").unwrap_or("false".to_string()) == "true" {
Some(internalize_urls(&content))
} else {
Some(content)
}
}
pub fn versions(path: &PathBuf) -> Vec<String> {
@ -94,7 +113,7 @@ impl WebsiteArchive {
pub fn archive_url(&self, url: &str) {
let parsed_url = url::Url::parse(url).unwrap();
let domain = parsed_url.domain().unwrap().trim_start_matches("www");
let domain = parsed_url.domain().unwrap().trim_start_matches("www.");
let path = parsed_url.path();
let mut folder_name = self.dir.join(&domain);
@ -110,6 +129,8 @@ impl WebsiteArchive {
let timestamp = chrono::Utc::now().format("%Y-%m-%d").to_string();
let filename = folder_name.join(&format!("index_{timestamp}.html"));
log::info!("Archiving {url} to {}", filename.to_str().unwrap());
run_command(&vec![
"monolith",
"-I",
@ -127,7 +148,11 @@ impl WebsiteArchive {
fn run_command(cmd: &[&str]) {
let mut cmd_setup = std::process::Command::new(cmd[0].clone());
let cmd_setup = cmd_setup.args(cmd.into_iter().skip(1).collect::<Vec<_>>());
let cmd_setup = cmd_setup
.args(cmd.into_iter().skip(1).collect::<Vec<_>>())
.stdout(std::process::Stdio::inherit())
.stderr(std::process::Stdio::inherit());
let child = cmd_setup.spawn().unwrap();
let status = child.wait_with_output().unwrap();