From 3696f61b02680e61baf78fab67e314d77daf4b34 Mon Sep 17 00:00:00 2001
From: JMARyA
Date: Sat, 11 Jan 2025 16:21:15 +0100
Subject: [PATCH] fix + refactor

---
 Cargo.lock              |  10 +
 Cargo.toml              |   1 +
 config.toml             |  17 +-
 src/archive.rs          | 441 ----------------------------------------
 src/archive/document.rs | 126 ++++++++++++
 src/archive/domain.rs   | 126 ++++++++++++
 src/archive/mod.rs      | 207 +++++++++++++++++++
 src/pages/mod.rs        |  42 +++-
 8 files changed, 524 insertions(+), 446 deletions(-)
 delete mode 100644 src/archive.rs
 create mode 100644 src/archive/document.rs
 create mode 100644 src/archive/domain.rs
 create mode 100644 src/archive/mod.rs

diff --git a/Cargo.lock b/Cargo.lock
index 05db4e7..e5ec9d4 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3569,6 +3569,15 @@ dependencies = [
  "percent-encoding",
 ]
 
+[[package]]
+name = "url-escape"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "44e0ce4d1246d075ca5abec4b41d33e87a6054d08e2366b63205665e950db218"
+dependencies = [
+ "percent-encoding",
+]
+
 [[package]]
 name = "utf-8"
 version = "0.7.6"
@@ -3752,6 +3761,7 @@ dependencies = [
  "tokio",
  "toml",
  "url",
+ "url-escape",
  "uuid",
 ]
 
diff --git a/Cargo.toml b/Cargo.toml
index cdb2de4..e7ae6d1 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -24,3 +24,4 @@ pgvector = { version = "0.4", features = ["sqlx"] }
 html2md = "0.2.14"
 clap = { version = "4.5.23", features = ["cargo", "derive"] }
 toml = "0.8.19"
+url-escape = "0.1.1"
diff --git a/config.toml b/config.toml
index 349ff95..ab8cedb 100644
--- a/config.toml
+++ b/config.toml
@@ -8,7 +8,8 @@ DOWNLOAD_ON_DEMAND=true
 [websites]
 # You can blacklist sites which wont work well
 BLACKLIST_DOMAINS = [
-    "^gitlab" # All domains starting with gitlab
+    "^gitlab", # All domains starting with gitlab
+    "youtube" # YouTube
 ]
 
 # Domain configuration (Example)
@@ -56,3 +57,17 @@ no_javascript = true
 [[websites.domains]]
 domain = "github.com"
 no_javascript = true
+
+[[websites.domains]]
+domain = "en.wikipedia.org"
+no_javascript = true
+
+[[websites.domains]]
+domain = "api.flutter.dev"
+no_javascript = true
+no_video = true
+
+[[websites.domains]]
+domain = "docs.flutter.dev"
+no_javascript = true
+no_video = true
\ No newline at end of file
diff --git a/src/archive.rs b/src/archive.rs
deleted file mode 100644
index cca93c3..0000000
--- a/src/archive.rs
+++ /dev/null
@@ -1,441 +0,0 @@
-use std::{collections::HashSet, io::Read, path::PathBuf};
-
-use based::{request::RequestContext, result::LogAndIgnore};
-use maud::html;
-
-use crate::{
-    blacklist::{check_blacklist, check_blacklist_path},
-    conf::get_config,
-    favicon::download_fav_for,
-    render_page,
-};
-
-/// Read directory entries into `Vec<String>`
-pub fn read_dir(dir: &PathBuf) -> Vec<String> {
-    let mut list = Vec::new();
-
-    if let Ok(entries) = std::fs::read_dir(dir) {
-        for entry in entries.flatten() {
-            if let Some(file_name) = entry.file_name().to_str() {
-                list.push(file_name.to_string());
-            }
-        }
-    }
-
-    list
-}
-
-/// Rewrite all URLs in `input` to the format `/s/<domain>/<path>`
-fn internalize_urls(input: &str) -> String {
-    let url_pattern = r"https?://([a-zA-Z0-9.-]+)(/[\w./-]*)";
-    let re = regex::Regex::new(url_pattern).unwrap();
-
-    re.replace_all(input, |caps: &regex::Captures| {
-        let domain = caps[1].trim_start_matches("www.");
-        let path = &caps[2];
-
-        // Dont transform if in blacklist
-        if check_blacklist(domain) {
-            return format!("https://{domain}/{path}");
-        }
-
-        format!("/s/{domain}/{path}")
-    })
-    .to_string()
-}
-
-/// Extract all domains
-pub fn extract_domains(input: &str) -> Vec<String> {
-    let url_pattern = r"https?://([a-zA-Z0-9.-]+)(/[\w./-]*)?";
-    let re = regex::Regex::new(url_pattern).unwrap();
-
-    let mut domains = HashSet::new();
-    for caps in re.captures_iter(input) {
-        let domain = caps[1].trim_start_matches("www.");
-        domains.insert(domain.to_string());
-    }
-
-    let mut domains: Vec<_> = domains.into_iter().collect();
-    domains.sort();
-
-    domains
-}
-
-/// Represents a directory containg archived websites
-#[derive(Debug, Clone)]
-pub struct WebsiteArchive {
-    pub dir: PathBuf,
-}
-
-/// Represents a domain within the website archive
-pub struct Domain {
-    /// Domain name
-    pub name: String,
-    dir: PathBuf,
-}
-
-impl Domain {
-    /// Creates a new `Domain` instance.
-    ///
-    /// If the domain name is not blacklisted, a directory is created.
-    ///
-    /// # Parameters
-    /// - `name`: The name of the domain.
-    /// - `dir`: The directory path for the domain.
-    ///
-    /// # Returns
-    /// A new `Domain` instance.
-    pub fn new(name: &str, dir: PathBuf) -> Self {
-        if !check_blacklist(name) {
-            std::fs::create_dir_all(&dir)
-                .log_err_and_ignore(&format!("Could not create domain dir {name}"));
-        }
-        Self {
-            name: name.to_string(),
-            dir,
-        }
-    }
-
-    /// Resolves a specific path within the domain and returns a `Document` representing it.
-    ///
-    /// # Parameters
-    /// - `path`: The path to resolve within the domain.
-    ///
-    /// # Returns
-    /// A `Document` instance corresponding to the given path.
-    pub fn path(&self, path: &str) -> Document {
-        Document::new(&self.name, path, self.dir.parent().unwrap().to_path_buf())
-    }
-
-    /// Get all paths associated with the domain
-    pub fn all_paths(&self) -> Vec<PathEntry> {
-        let mut queue = self.paths("/").0;
-
-        let mut ret = Vec::new();
-
-        ret.push(PathEntry(self.name.clone(), "/".to_string()));
-
-        while let Some(el) = queue.pop() {
-            ret.push(el.clone());
-            let paths = self.paths(&el.1).0;
-            queue.extend(paths);
-        }
-
-        ret
-    }
-
-    /// Retrieves entries and metadata for a given path within the domain.
-    ///
-    /// # Parameters
-    /// - `path`: The path to inspect.
-    ///
-    /// # Returns
-    /// A tuple containing:
-    /// - A vector of `PathEntry` instances representing the contents of the path.
-    /// - A boolean indicating whether the path is itself a `Document`
-    pub fn paths(&self, path: &str) -> (Vec<PathEntry>, bool) {
-        let mut base_path = self.dir.clone();
-
-        for p in path.split('/') {
-            base_path = base_path.join(p);
-        }
-
-        let path = path
-            .split("/")
-            .filter(|x| !x.is_empty())
-            .collect::<Vec<_>>()
-            .join("/");
-
-        let dir_content = read_dir(&base_path);
-
-        let mut ret = Vec::new();
-
-        let mut is_doc = false;
-
-        for entry in dir_content {
-            let url_path = format!("{path}/{entry}");
-            let url_path = url_path
-                .split("/")
-                .filter(|x| !x.is_empty())
-                .collect::<Vec<_>>()
-                .join("/");
-            if entry.starts_with("index_") && entry.ends_with(".html") {
-                is_doc = true;
-                continue;
-            }
-
-            ret.push(PathEntry(self.name.clone(), url_path));
-        }
-
-        (ret, is_doc)
-    }
-}
-
-/// Represents an entry within a domain's path, containing its name and URL path.
-#[derive(Debug, Clone)]
-pub struct PathEntry(String, String);
-
-impl PathEntry {
-    pub fn url(&self) -> String {
-        format!("/d/{}/{}", self.0, self.1)
-    }
-
-    pub fn path(&self) -> &String {
-        &self.1
-    }
-}
-
-/// Represents a document within a domain
-pub struct Document {
-    /// The domain associated with the document.
-    pub domain: String,
-    /// The path of the document within the domain.
-    pub path: String,
-    base_dir: PathBuf,
-}
-
-impl Document {
-    /// Creates a new `Document` instance.
-    ///
-    /// # Parameters
-    /// - `domain`: The domain to which the document belongs.
-    /// - `path`: The path of the document within the domain.
-    /// - `base_dir`: The base directory of the archive storage.
-    ///
-    /// # Returns
-    /// A new `Document` instance.
-    pub fn new(domain: &str, path: &str, base_dir: PathBuf) -> Self {
-        let split = path
-            .split('/')
-            .filter(|x| !x.is_empty())
-            .collect::<Vec<_>>();
-
-        Self {
-            domain: domain.to_string(),
-            path: if split.is_empty() {
-                "/".to_string()
-            } else {
-                split.join("/")
-            },
-            base_dir,
-        }
-    }
-
-    /// Renders the document, returning its content as a string.
-    ///
-    /// If the environment variable `$ROUTE_INTERNAL` is set to `true`, all links will be rewritten to point to internal archived routes.
-    ///
-    /// # Parameters
-    /// - `version`: An optional version of the document to render in the format `YYYY-MM-DD`.
-    ///
-    /// # Returns
-    /// An `Option` containing the rendered content as a string, or `None` if nothing could be rendered.
-    pub async fn render_local(&self, version: Option<String>) -> Option<String> {
-        if check_blacklist(&self.domain) {
-            let content = html! {
-                h3 { "This site is blacklisted" };
-            };
-            return Some(render_page(content, RequestContext::default()).await.1 .1);
-        }
-
-        let mut file_path = self.doc_dir();
-
-        let latest_version = if let Some(version) = version {
-            format!("index_{version}.html")
-        } else {
-            let versions = self.versions();
-            let version = versions.first().cloned()?;
-            format!("index_{version}.html")
-        };
-
-        file_path = file_path.join(latest_version);
-
-        let mut buf = Vec::new();
-        std::fs::File::open(file_path)
-            .ok()?
-            .read_to_end(&mut buf)
-            .unwrap();
-        let content = String::from_utf8_lossy(&buf);
-
-        if get_config().ROUTE_INTERNAL {
-            Some(internalize_urls(&content))
-        } else {
-            Some(content.to_string())
-        }
-    }
-
-    /// Determines the directory where the document is stored.
-    ///
-    /// # Returns
-    /// A `PathBuf` representing the document directory.
-    pub fn doc_dir(&self) -> PathBuf {
-        let mut file_path = self.base_dir.join(&self.domain);
-
-        for p in self.path.split('/').filter(|x| !x.is_empty()) {
-            file_path = file_path.join(p);
-        }
-
-        file_path
-    }
-
-    /// Retrieves available versions of the document.
-    ///
-    /// # Returns
-    /// A vector of strings representing the available versions of the document, sorted in descending order.
-    pub fn versions(&self) -> Vec<String> {
-        let mut res: Vec<String> = read_dir(&self.doc_dir())
-            .into_iter()
-            .filter_map(|x| {
-                if x.starts_with("index_") && x.ends_with(".html") {
-                    return Some(
-                        x.trim_start_matches("index_")
-                            .trim_end_matches(".html")
-                            .to_string(),
-                    );
-                }
-
-                None
-            })
-            .collect();
-        res.sort();
-        res.reverse();
-        res
-    }
-}
-
-impl WebsiteArchive {
-    /// Creates a new `WebsiteArchive` instance.
-    ///
-    /// # Parameters
-    /// - `dir`: The directory path where the archive will be stored.
-    ///
-    /// # Returns
-    /// A new `WebsiteArchive` instance.
-    pub fn new(dir: &str) -> Self {
-        Self {
-            dir: PathBuf::from(dir),
-        }
-    }
-
-    /// Retrieves the list of domain names stored in the archive.
-    ///
-    /// # Returns
-    /// A vector of domain names as strings.
-    pub fn domains(&self) -> Vec<String> {
-        read_dir(&self.dir)
-    }
-
-    /// Retrieves a `Domain` instance for a specified domain name.
-    ///
-    /// # Parameters
-    /// - `domain`: The name of the domain to retrieve.
-    ///
-    /// # Returns
-    /// A `Domain` instance corresponding to the specified domain.
-    pub fn get_domain(&self, domain: &str) -> Domain {
-        Domain::new(domain, self.dir.join(domain))
-    }
-
-    /// Archives a URL by downloading and storing its content.
-    ///
-    /// If the URL does not pass the blacklist check, it will not be archived.
-    ///
-    /// # Parameters
-    /// - `url`: The URL to archive.
-    ///
-    /// This function downloads the content of the URL, processes it, and saves it to the archive.
-    pub async fn archive_url(&self, url: &str) {
-        let parsed_url = url::Url::parse(url).unwrap();
-
-        let domain = parsed_url.domain().unwrap().trim_start_matches("www.");
-
-        // Deny blacklist
-        if check_blacklist(domain) {
-            return;
-        }
-
-        let path = parsed_url.path();
-
-        if check_blacklist_path(domain, path) {
-            return;
-        }
-
-        let mut folder_name = self.dir.join(domain);
-
-        download_fav_for(domain).await;
-
-        for paths in path.split('/') {
-            if !paths.is_empty() {
-                folder_name = folder_name.join(paths);
-            }
-        }
-
-        std::fs::create_dir_all(&folder_name).unwrap();
-
-        let timestamp = chrono::Utc::now().format("%Y-%m-%d").to_string();
-        let filename = folder_name.join(format!("index_{timestamp}.html"));
-
-        log::info!("Archiving {url} to {}", filename.to_str().unwrap());
-
-        let conf = get_config()
-            .get_domain_config(domain)
-            .cloned()
-            .unwrap_or_default();
-
-        let mut cmd = vec!["monolith", "--isolate", "-o", filename.to_str().unwrap()];
-
-        if conf.no_audio.unwrap_or_default() {
-            cmd.push("--no-audio");
-        }
-
-        if conf.no_css.unwrap_or_default() {
-            cmd.push("--no-css");
-        }
-
-        if conf.no_frames.unwrap_or_default() {
-            cmd.push("--no-frames");
-        }
-
-        if conf.no_fonts.unwrap_or_default() {
-            cmd.push("--no-frames");
-        }
-
-        if conf.no_image.unwrap_or_default() {
-            cmd.push("--no-images");
-        }
-
-        if conf.no_javascript.unwrap_or_default() {
-            cmd.push("--no-js");
-            cmd.push("--unwrap-noscript");
-        }
-
-        if conf.no_video.unwrap_or_default() {
-            cmd.push("--no-video");
-        }
-
-        if let Some(ua) = &conf.user_agent {
-            cmd.push("--user-agent");
-            cmd.push(ua.as_str());
-        }
-
-        let mut url = url::Url::parse(&format!("https://{domain}")).unwrap();
-        url = url.join(path).unwrap();
-        let url = url.to_string();
-        cmd.push(&url);
-
-        run_command(&cmd);
-    }
-}
-
-fn run_command(cmd: &[&str]) {
-    let mut cmd_setup = std::process::Command::new(cmd[0]);
-    let cmd_setup = cmd_setup
-        .args(cmd.iter().skip(1).collect::<Vec<_>>())
-        .stdout(std::process::Stdio::inherit())
-        .stderr(std::process::Stdio::inherit());
-
-    let child = cmd_setup.spawn().unwrap();
-
-    let status = child.wait_with_output().unwrap();
-    assert!(status.status.success());
-}
diff --git a/src/archive/document.rs b/src/archive/document.rs
new file mode 100644
index 0000000..aa08772
--- /dev/null
+++ b/src/archive/document.rs
@@ -0,0 +1,126 @@
+use std::{io::Read, path::PathBuf};
+
+use based::request::RequestContext;
+use maud::html;
+
+use crate::{blacklist::check_blacklist, conf::get_config, render_page};
+
+use super::{internalize_urls, read_dir};
+
+/// Represents a document within a domain
+pub struct Document {
+    /// The domain associated with the document.
+    pub domain: String,
+    /// The path of the document within the domain.
+    pub path: String,
+    base_dir: PathBuf,
+}
+
+impl Document {
+    /// Creates a new `Document` instance.
+    ///
+    /// # Parameters
+    /// - `domain`: The domain to which the document belongs.
+    /// - `path`: The path of the document within the domain.
+    /// - `base_dir`: The base directory of the archive storage.
+    ///
+    /// # Returns
+    /// A new `Document` instance.
+    pub fn new(domain: &str, path: &str, base_dir: PathBuf) -> Self {
+        let split = path
+            .split('/')
+            .filter(|x| !x.is_empty())
+            .collect::<Vec<_>>();
+
+        Self {
+            domain: domain.to_string(),
+            path: if split.is_empty() {
+                "/".to_string()
+            } else {
+                split.join("/")
+            },
+            base_dir,
+        }
+    }
+
+    /// Renders the document, returning its content as a string.
+    ///
+    /// If the environment variable `$ROUTE_INTERNAL` is set to `true`, all links will be rewritten to point to internal archived routes.
+    ///
+    /// # Parameters
+    /// - `version`: An optional version of the document to render in the format `YYYY-MM-DD`.
+    ///
+    /// # Returns
+    /// An `Option` containing the rendered content as a string, or `None` if nothing could be rendered.
+    pub async fn render_local(&self, version: Option<String>) -> Option<String> {
+        if check_blacklist(&self.domain) {
+            let content = html! {
+                h3 { "This site is blacklisted" };
+            };
+            return Some(render_page(content, RequestContext::default()).await.1 .1);
+        }
+
+        let mut file_path = self.doc_dir();
+
+        let latest_version = if let Some(version) = version {
+            format!("index_{version}.html")
+        } else {
+            let versions = self.versions();
+            let version = versions.first().cloned()?;
+            format!("index_{version}.html")
+        };
+
+        file_path = file_path.join(latest_version);
+
+        let mut buf = Vec::new();
+        std::fs::File::open(file_path)
+            .ok()?
+            .read_to_end(&mut buf)
+            .unwrap();
+        let content = String::from_utf8_lossy(&buf);
+
+        if get_config().ROUTE_INTERNAL {
+            Some(internalize_urls(&content))
+        } else {
+            Some(content.to_string())
+        }
+    }
+
+    /// Determines the directory where the document is stored.
+    ///
+    /// # Returns
+    /// A `PathBuf` representing the document directory.
+    pub fn doc_dir(&self) -> PathBuf {
+        let mut file_path = self.base_dir.join(&self.domain);
+
+        for p in self.path.split('/').filter(|x| !x.is_empty()) {
+            file_path = file_path.join(p);
+        }
+
+        file_path
+    }
+
+    /// Retrieves available versions of the document.
+    ///
+    /// # Returns
+    /// A vector of strings representing the available versions of the document, sorted in descending order.
+    pub fn versions(&self) -> Vec<String> {
+        let mut res: Vec<String> = read_dir(&self.doc_dir())
+            .into_iter()
+            .filter_map(|x| {
+                if x.starts_with("index_") && x.ends_with(".html") {
+                    return Some(
+                        x.trim_start_matches("index_")
+                            .trim_end_matches(".html")
+                            .to_string(),
+                    );
+                }
+
+                None
+            })
+            .collect();
+        res.sort();
+        res.reverse();
+        res
+    }
+}
\ No newline at end of file
diff --git a/src/archive/domain.rs b/src/archive/domain.rs
new file mode 100644
index 0000000..79239aa
--- /dev/null
+++ b/src/archive/domain.rs
@@ -0,0 +1,126 @@
+use std::path::PathBuf;
+
+use based::result::LogAndIgnore;
+
+use crate::blacklist::check_blacklist;
+
+use super::{read_dir, Document};
+
+
+/// Represents a domain within the website archive
+pub struct Domain {
+    /// Domain name
+    pub name: String,
+    dir: PathBuf,
+}
+
+impl Domain {
+    /// Creates a new `Domain` instance.
+    ///
+    /// If the domain name is not blacklisted, a directory is created.
+    ///
+    /// # Parameters
+    /// - `name`: The name of the domain.
+    /// - `dir`: The directory path for the domain.
+    ///
+    /// # Returns
+    /// A new `Domain` instance.
+    pub fn new(name: &str, dir: PathBuf) -> Self {
+        if !check_blacklist(name) {
+            std::fs::create_dir_all(&dir)
+                .log_err_and_ignore(&format!("Could not create domain dir {name}"));
+        }
+        Self {
+            name: name.to_string(),
+            dir,
+        }
+    }
+
+    /// Resolves a specific path within the domain and returns a `Document` representing it.
+    ///
+    /// # Parameters
+    /// - `path`: The path to resolve within the domain.
+    ///
+    /// # Returns
+    /// A `Document` instance corresponding to the given path.
+    pub fn path(&self, path: &str) -> Document {
+        Document::new(&self.name, path, self.dir.parent().unwrap().to_path_buf())
+    }
+
+    /// Get all paths associated with the domain
+    pub fn all_paths(&self) -> Vec<PathEntry> {
+        let mut queue = self.paths("/").0;
+
+        let mut ret = Vec::new();
+
+        ret.push(PathEntry(self.name.clone(), "/".to_string()));
+
+        while let Some(el) = queue.pop() {
+            ret.push(el.clone());
+            let paths = self.paths(&el.1).0;
+            queue.extend(paths);
+        }
+
+        ret
+    }
+
+    /// Retrieves entries and metadata for a given path within the domain.
+    ///
+    /// # Parameters
+    /// - `path`: The path to inspect.
+    ///
+    /// # Returns
+    /// A tuple containing:
+    /// - A vector of `PathEntry` instances representing the contents of the path.
+    /// - A boolean indicating whether the path is itself a `Document`
+    pub fn paths(&self, path: &str) -> (Vec<PathEntry>, bool) {
+        let mut base_path = self.dir.clone();
+
+        for p in path.split('/') {
+            base_path = base_path.join(p);
+        }
+
+        let path = path
+            .split("/")
+            .filter(|x| !x.is_empty())
+            .collect::<Vec<_>>()
+            .join("/");
+
+        let dir_content = read_dir(&base_path);
+
+        let mut ret = Vec::new();
+
+        let mut is_doc = false;
+
+        for entry in dir_content {
+            let url_path = format!("{path}/{entry}");
+            let url_path = url_path
+                .split("/")
+                .filter(|x| !x.is_empty())
+                .collect::<Vec<_>>()
+                .join("/");
+            if entry.starts_with("index_") && entry.ends_with(".html") {
+                is_doc = true;
+                continue;
+            }
+
+            ret.push(PathEntry(self.name.clone(), url_path));
+        }
+
+        (ret, is_doc)
+    }
+}
+
+/// Represents an entry within a domain's path, containing its name and URL path.
+#[derive(Debug, Clone)]
+pub struct PathEntry(String, String);
+
+impl PathEntry {
+    pub fn url(&self) -> String {
+        format!("/d/{}/{}", self.0, self.1)
+    }
+
+    pub fn path(&self) -> &String {
+        &self.1
+    }
+}
\ No newline at end of file
diff --git a/src/archive/mod.rs b/src/archive/mod.rs
new file mode 100644
index 0000000..7dcaa04
--- /dev/null
+++ b/src/archive/mod.rs
@@ -0,0 +1,207 @@
+use std::{collections::HashSet, path::PathBuf};
+
+use crate::{
+    blacklist::{check_blacklist, check_blacklist_path},
+    conf::get_config,
+    favicon::download_fav_for
+};
+
+mod document;
+mod domain;
+pub use document::Document;
+pub use domain::*;
+
+/// Read directory entries into `Vec<String>`
+pub fn read_dir(dir: &PathBuf) -> Vec<String> {
+    let mut list = Vec::new();
+
+    if let Ok(entries) = std::fs::read_dir(dir) {
+        for entry in entries.flatten() {
+            if let Some(file_name) = entry.file_name().to_str() {
+                list.push(file_name.to_string());
+            }
+        }
+    }
+
+    list
+}
+
+/// Rewrite all URLs in `input` to the format `/s/<domain>/<path>`
+fn internalize_urls(input: &str) -> String {
+    let url_pattern = r"https?://([a-zA-Z0-9.-]+)(/[\w./-]*)";
+    let re = regex::Regex::new(url_pattern).unwrap();
+
+    re.replace_all(input, |caps: &regex::Captures| {
+        let domain = caps[1].trim_start_matches("www.");
+        let path = &caps[2];
+
+        // Don't transform if in blacklist; `path` already starts with '/'
+        if check_blacklist(domain) {
+            return format!("https://{domain}{path}");
+        }
+
+        format!("/s/{domain}{path}")
+    })
+    .to_string()
+}
+
+/// Extract all domains
+pub fn extract_domains(input: &str) -> Vec<String> {
+    let url_pattern = r"https?://([a-zA-Z0-9.-]+)(/[\w./-]*)?";
+    let re = regex::Regex::new(url_pattern).unwrap();
+
+    let mut domains = HashSet::new();
+    for caps in re.captures_iter(input) {
+        let domain = caps[1].trim_start_matches("www.");
+        domains.insert(domain.to_string());
+    }
+
+    let mut domains: Vec<_> = domains.into_iter().collect();
+    domains.sort();
+
+    domains
+}
+
+/// Represents a directory containing archived websites
+#[derive(Debug, Clone)]
+pub struct WebsiteArchive {
+    pub dir: PathBuf,
+}
+
+impl WebsiteArchive {
+    /// Creates a new `WebsiteArchive` instance.
+    ///
+    /// # Parameters
+    /// - `dir`: The directory path where the archive will be stored.
+    ///
+    /// # Returns
+    /// A new `WebsiteArchive` instance.
+    pub fn new(dir: &str) -> Self {
+        Self {
+            dir: PathBuf::from(dir),
+        }
+    }
+
+    /// Retrieves the list of domain names stored in the archive.
+    ///
+    /// # Returns
+    /// A vector of domain names as strings.
+    pub fn domains(&self) -> Vec<String> {
+        read_dir(&self.dir)
+    }
+
+    /// Retrieves a `Domain` instance for a specified domain name.
+    ///
+    /// # Parameters
+    /// - `domain`: The name of the domain to retrieve.
+    ///
+    /// # Returns
+    /// A `Domain` instance corresponding to the specified domain.
+    pub fn get_domain(&self, domain: &str) -> Domain {
+        Domain::new(domain, self.dir.join(domain))
+    }
+
+    /// Archives a URL by downloading and storing its content.
+    ///
+    /// If the URL does not pass the blacklist check, it will not be archived.
+    ///
+    /// # Parameters
+    /// - `url`: The URL to archive.
+    ///
+    /// This function downloads the content of the URL, processes it, and saves it to the archive.
+    pub async fn archive_url(&self, url: &str) {
+        let parsed_url = url::Url::parse(url).unwrap();
+
+        let domain = parsed_url.domain().unwrap().trim_start_matches("www.");
+
+        // Deny blacklist
+        if check_blacklist(domain) {
+            return;
+        }
+
+        let path = parsed_url.path();
+
+        if check_blacklist_path(domain, path) {
+            return;
+        }
+
+        let mut folder_name = self.dir.join(domain);
+
+        download_fav_for(domain).await;
+
+        for paths in path.split('/') {
+            let paths = url_escape::decode(paths).to_string();
+            if !paths.is_empty() {
+                folder_name = folder_name.join(paths);
+            }
+        }
+
+        std::fs::create_dir_all(&folder_name).unwrap();
+
+        let timestamp = chrono::Utc::now().format("%Y-%m-%d").to_string();
+        let filename = folder_name.join(format!("index_{timestamp}.html"));
+
+        log::info!("Archiving {url} to {}", filename.to_str().unwrap());
+
+        let conf = get_config()
+            .get_domain_config(domain)
+            .cloned()
+            .unwrap_or_default();
+
+        let mut cmd = vec!["monolith", "--isolate", "-o", filename.to_str().unwrap()];
+
+        if conf.no_audio.unwrap_or_default() {
+            cmd.push("--no-audio");
+        }
+
+        if conf.no_css.unwrap_or_default() {
+            cmd.push("--no-css");
+        }
+
+        if conf.no_frames.unwrap_or_default() {
+            cmd.push("--no-frames");
+        }
+
+        if conf.no_fonts.unwrap_or_default() {
+            cmd.push("--no-fonts");
+        }
+
+        if conf.no_image.unwrap_or_default() {
+            cmd.push("--no-images");
+        }
+
+        if conf.no_javascript.unwrap_or_default() {
+            cmd.push("--no-js");
+            cmd.push("--unwrap-noscript");
+        }
+
+        if conf.no_video.unwrap_or_default() {
+            cmd.push("--no-video");
+        }
+
+        if let Some(ua) = &conf.user_agent {
+            cmd.push("--user-agent");
+            cmd.push(ua.as_str());
+        }
+
+        let mut url = url::Url::parse(&format!("https://{domain}")).unwrap();
+        url = url.join(path).unwrap();
+        let url = url.to_string();
+        cmd.push(&url);
+
+        run_command(&cmd);
+    }
+}
+
+fn run_command(cmd: &[&str]) {
+    let mut cmd_setup = std::process::Command::new(cmd[0]);
+    let cmd_setup = cmd_setup
+        .args(cmd.iter().skip(1).collect::<Vec<_>>())
+        .stdout(std::process::Stdio::inherit())
+        .stderr(std::process::Stdio::inherit());
+
+    let child = cmd_setup.spawn().unwrap();
+
+    let status = child.wait_with_output().unwrap();
+    assert!(status.status.success());
+}
diff --git a/src/pages/mod.rs b/src/pages/mod.rs
index 003325b..a0cc978 100644
--- a/src/pages/mod.rs
+++ b/src/pages/mod.rs
@@ -7,7 +7,7 @@ use based::{
     },
 };
 use maud::{html, PreEscaped};
-use rocket::{get, State};
+use rocket::{get, request::FromSegments, State};
 
 pub mod component;
 use component::*;
@@ -174,15 +174,49 @@ pub async fn render_txt_website(
     Some(html2md::parse_html(&content))
 }
 
+pub struct PathSegment {
+    segments: Vec<String>
+}
+
+impl PathSegment {
+    pub fn to_str(&self) -> String {
+        self.segments.join("/")
+    }
+}
+
+impl<'r> FromSegments<'r> for PathSegment {
+    type Error = ();
+
+    fn from_segments(segments: rocket::http::uri::Segments<'r, rocket::http::uri::fmt::Path>) -> Result<Self, Self::Error> {
+        let paths: Vec<_> = segments
+            .filter_map(|x| {
+                if x == "." {
+                    return None;
+                }
+
+                if x == ".." {
+                    return None
+                }
+
+                Some(x.to_string())
+            })
+            .collect();
+
+        Ok(PathSegment {
+            segments: paths
+        })
+    }
+}
+
 /// Return archived version of `domain` / `path` at `time`
 #[get("/s/<domain>/<path..>?
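
Usage sketch (not part of the commit): how the refactored archive API in src/archive/mod.rs fits together. Illustrative only; the crate path `webarc::archive`, the `./websites` directory, and the example URL are assumptions, and error handling is omitted.

    use webarc::archive::WebsiteArchive; // crate path assumed

    #[tokio::main]
    async fn main() {
        // Root directory of the archive (assumed path).
        let archive = WebsiteArchive::new("./websites");

        // Fetch and store a snapshot; this shells out to `monolith` and
        // writes `index_YYYY-MM-DD.html` under the domain/path directory.
        archive.archive_url("https://en.wikipedia.org/wiki/Rust").await;

        // Read it back: domain -> document -> snapshot (None = newest version).
        let doc = archive.get_domain("en.wikipedia.org").path("wiki/Rust");
        if let Some(html) = doc.render_local(None).await {
            println!("archived page is {} bytes", html.len());
        }
    }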
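
Likewise, a minimal sketch of how the new `PathSegment` guard from src/pages/mod.rs is consumed by a Rocket route. The route shape matches the `#[get]` attribute that is cut off above, but the handler name and body here are placeholders.

    use rocket::get;

    // `<path..>` invokes the `FromSegments` impl added in this patch, which
    // filters out "." and ".." segments so a request cannot traverse outside
    // the archive directory.
    #[get("/s/<domain>/<path..>")]
    async fn archived_site(domain: &str, path: PathSegment) -> Option<String> {
        let relative = path.to_str(); // e.g. "wiki/Rust"
        todo!("look up /{domain}/{relative} in the archive")
    }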