use std::path::PathBuf;

use based::request::RequestContext;
use maud::html;

use crate::{blacklist::check_blacklist, favicon::download_fav_for, pages::render_page};

/// List the file names inside a directory, skipping entries that cannot be read.
pub fn read_dir(dir: &PathBuf) -> Vec<String> {
    let mut list = Vec::new();

    if let Ok(entries) = std::fs::read_dir(dir) {
        for entry in entries {
            if let Ok(entry) = entry {
                if let Some(file_name) = entry.file_name().to_str() {
                    list.push(file_name.to_string());
                }
            }
        }
    }

    list
}

/// Rewrite absolute `http(s)://` URLs in a document so they point at the local
/// `/s/{domain}/{path}` route.
fn internalize_urls(input: &str) -> String {
    let url_pattern = r"https?://([a-zA-Z0-9.-]+)(/[\w./-]*)";
    let re = regex::Regex::new(url_pattern).unwrap();

    re.replace_all(input, |caps: &regex::Captures| {
        format!(
            "/s/{}/{}",
            &caps[1].trim_start_matches("www."), // Domain
            &caps[2]                             // Path
        )
    })
    .to_string()
}

/// Root of the on-disk website archive.
#[derive(Debug, Clone)]
pub struct WebsiteArchive {
    pub dir: PathBuf,
}

/// A single archived domain inside the archive directory.
pub struct Domain {
    pub name: String,
    dir: PathBuf,
}

impl Domain {
    pub fn new(name: &str, dir: PathBuf) -> Self {
        if !check_blacklist(name) {
            std::fs::create_dir_all(&dir).unwrap();
        }
        Self {
            name: name.to_string(),
            dir,
        }
    }

    /// Resolve a path inside this domain to a [`Document`].
    pub fn path(&self, path: &str) -> Document {
        Document::new(&self.name, path, self.dir.parent().unwrap().to_path_buf())
    }

    /// List the entries below `path`, distinguishing documents (directories that
    /// contain an `index_*.html`) from plain sub-paths.
    pub fn paths(&self, path: &str) -> Vec<PathEntry> {
        let mut base_path = self.dir.clone();

        for p in path.split('/') {
            base_path = base_path.join(p);
        }

        let dir_content = read_dir(&base_path);

        let mut ret = Vec::new();

        for entry in dir_content {
            let url_path = format!("{path}/{entry}");
            let is_doc = read_dir(&base_path.join(entry))
                .into_iter()
                .any(|x| x.starts_with("index_") && x.ends_with(".html"));

            if is_doc {
                ret.push(PathEntry::Document(Document::new(
                    &self.name,
                    &url_path,
                    self.dir.parent().unwrap().to_path_buf(),
                )));
            } else {
                ret.push(PathEntry::Path(self.name.clone(), url_path));
            }
        }

        ret
    }
}

/// Either a plain sub-path within a domain or a concrete archived document.
pub enum PathEntry {
    Path(String, String),
    Document(Document),
}

impl PathEntry {
    pub fn url(&self) -> String {
        match self {
            PathEntry::Path(domain, path) => format!("/d/{domain}/{path}"),
            PathEntry::Document(document) => document.url(),
        }
    }

    pub fn path(&self) -> String {
        match self {
            PathEntry::Path(_, path) => path.to_string(),
            PathEntry::Document(document) => document.path.clone(),
        }
    }
}

/// An archived page identified by domain and path.
pub struct Document {
    pub domain: String,
    pub path: String,
    base_dir: PathBuf,
}

impl Document {
    pub fn new(domain: &str, path: &str, base_dir: PathBuf) -> Self {
        Self {
            domain: domain.to_string(),
            path: path.to_string(),
            base_dir,
        }
    }

    pub fn url(&self) -> String {
        format!("/s/{}/{}", self.domain, self.path)
    }

    /// Return the archived HTML for this document, optionally for a specific version.
    pub async fn render_local(&self, version: Option<String>) -> Option<String> {
        if check_blacklist(&self.domain) {
            let content = html! {
                h3 { "This site is blacklisted" };
            };
            return Some(render_page(content, RequestContext::default()).await.1 .1);
        }

        let mut file_path = self.doc_dir();

        let latest_version = if let Some(version) = version {
            format!("index_{version}.html")
        } else {
            let versions = self.versions();
            versions.first().cloned()?
        };

        file_path = file_path.join(latest_version);

        let content = std::fs::read_to_string(file_path).ok()?;

        if std::env::var("ROUTE_INTERNAL").unwrap_or("false".to_string()) == "true" {
            Some(internalize_urls(&content))
        } else {
            Some(content)
        }
    }

    /// Directory on disk that holds this document's versions.
    pub fn doc_dir(&self) -> PathBuf {
        let mut file_path = self.base_dir.join(&self.domain);

        for p in self.path.split('/') {
            file_path = file_path.join(p);
        }

        file_path
    }

    /// All archived versions (file names) of this document.
    pub fn versions(&self) -> Vec<String> {
        read_dir(&self.doc_dir())
    }
}

impl WebsiteArchive {
    pub fn new(dir: &str) -> Self {
        Self {
            dir: PathBuf::from(dir),
        }
    }

    /// All domains currently present in the archive.
    pub fn domains(&self) -> Vec<String> {
        read_dir(&self.dir)
    }

    pub fn get_domain(&self, domain: &str) -> Domain {
        Domain::new(domain, self.dir.join(domain))
    }

    /// Archive a URL
    pub async fn archive_url(&self, url: &str) {
        let parsed_url = url::Url::parse(url).unwrap();

        let domain = parsed_url.domain().unwrap().trim_start_matches("www.");

        // Deny blacklist
        if check_blacklist(domain) {
            return;
        }

        let path = parsed_url.path();

        let mut folder_name = self.dir.join(domain);

        if !std::fs::exists(&folder_name).unwrap() {
            download_fav_for(domain).await;
        }

        for paths in path.split('/') {
            if !paths.is_empty() {
                folder_name = folder_name.join(paths);
            }
        }

        std::fs::create_dir_all(&folder_name).unwrap();

        let timestamp = chrono::Utc::now().format("%Y-%m-%d").to_string();
        let filename = folder_name.join(format!("index_{timestamp}.html"));

        log::info!("Archiving {url} to {}", filename.to_str().unwrap());

        run_command(&[
            "monolith",
            "-I",
            "-o",
            filename.to_str().unwrap(),
            &format!("https://{}/{}", domain, path),
        ]);
    }
}

// full text search
// add new sites?
// transparent auto page downloading
// redownload after threshold

/// Run a command, inheriting stdout/stderr, and panic if it exits unsuccessfully.
fn run_command(cmd: &[&str]) {
    let mut cmd_setup = std::process::Command::new(cmd[0]);
    let cmd_setup = cmd_setup
        .args(cmd.iter().skip(1).collect::<Vec<_>>())
        .stdout(std::process::Stdio::inherit())
        .stderr(std::process::Stdio::inherit());

    let child = cmd_setup.spawn().unwrap();
    let status = child.wait_with_output().unwrap();
    assert!(status.status.success());
}
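// A minimal usage sketch for `internalize_urls`, added here as an in-file test; the
// sample markup and expected output are assumptions, not taken from the archive itself.
// Note that the second capture group of the URL pattern already includes the leading
// slash, so the rewritten link keeps a double slash after the domain segment.
#[cfg(test)]
mod tests {
    use super::internalize_urls;

    #[test]
    fn internalize_urls_rewrites_absolute_links() {
        let input = r#"<a href="https://www.example.com/blog/post.html">post</a>"#;
        let rewritten = internalize_urls(input);
        assert_eq!(
            rewritten,
            r#"<a href="/s/example.com//blog/post.html">post</a>"#
        );
    }
}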