use std::{
    collections::{HashMap, HashSet},
    path::PathBuf,
};

use crate::{
    blacklist::{check_blacklist, check_blacklist_path},
    conf::get_config,
    extract_data_urls,
    favicon::download_fav_for,
    get_mime_type, sha256_hash,
};

mod document;
mod domain;
mod fragment;

use based::{
    get_pg,
    ui::{components::prelude::Shell, prelude::Nothing},
};
use chrono::NaiveDate;
pub use document::Document;
pub use domain::*;
pub use fragment::*;
use sqlx::prelude::FromRow;

/// Read directory entries into a `Vec<String>`
pub fn read_dir(dir: &PathBuf) -> Vec<String> {
    let mut list = Vec::new();

    if let Ok(entries) = std::fs::read_dir(dir) {
        for entry in entries.flatten() {
            if let Some(file_name) = entry.file_name().to_str() {
                list.push(file_name.to_string());
            }
        }
    }

    list
}

/// Rewrite all URLs in `input` to the format `/s/<domain>/<path>`.
///
/// Absolute and protocol-relative URLs keep their own domain; root-relative
/// URLs are anchored to `base`.
pub fn internalize_urls(input: &str, base: &str) -> String {
    // todo : fix regex, domains without path are not captured
    let url_pattern = r#"(\ |"|')((?:https?:)?//([a-zA-Z0-9.-]+))?(/[\w./-]*)"#;
    let re = regex::Regex::new(url_pattern).unwrap();

    re.replace_all(input, |caps: &regex::Captures| {
        let prefix = &caps[1];
        let path = &caps[4];

        if let Some(domain) = caps.get(3) {
            // Absolute or protocol-relative URL: /s/<domain>/<path>
            format!(
                "{prefix}/s/{}{path}",
                domain.as_str().trim_start_matches("www.")
            )
        } else {
            // Root-relative URL: /s/<base>/<path>
            format!("{prefix}/s/{base}{path}")
        }
    })
    .to_string()
}

/// Extract all unique domains mentioned in `input`, sorted alphabetically
pub fn extract_domains(input: &str) -> Vec<String> {
    let url_pattern = r"https?://([a-zA-Z0-9.-]+)(/[\w./-]*)?";
    let re = regex::Regex::new(url_pattern).unwrap();

    let mut domains = HashSet::new();
    for caps in re.captures_iter(input) {
        let domain = caps[1].trim_start_matches("www.");
        domains.insert(domain.to_string());
    }

    let mut domains: Vec<_> = domains.into_iter().collect();
    domains.sort();

    domains
}
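// A minimal sketch exercising `extract_domains`; the HTML snippet below is a
// hypothetical input, not taken from a real archive. It shows that `www.`
// prefixes are stripped and the result is deduplicated and sorted.
#[cfg(test)]
mod extract_domains_tests {
    use super::extract_domains;

    #[test]
    fn strips_www_dedupes_and_sorts() {
        let input = concat!(
            r#"<a href="https://www.example.com/a">a</a> "#,
            r#"<a href="https://example.com/b">b</a> "#,
            r#"<img src="https://cdn.example.org/logo.png">"#
        );

        assert_eq!(
            extract_domains(input),
            vec!["cdn.example.org".to_string(), "example.com".to_string()]
        );
    }
}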
// TODO : impl archive index to db

/// Represents a directory containing archived websites
#[derive(Debug, Clone)]
pub struct WebsiteArchive {
    pub dir: PathBuf,
}

impl WebsiteArchive {
    /// Creates a new `WebsiteArchive` instance.
    ///
    /// # Parameters
    /// - `dir`: The directory path where the archive will be stored.
    ///
    /// # Returns
    /// A new `WebsiteArchive` instance.
    pub fn new(dir: &str) -> Self {
        Self {
            dir: PathBuf::from(dir),
        }
    }

    /// Retrieves the list of domain names stored in the archive.
    ///
    /// # Returns
    /// A vector of domain names as strings.
    pub fn domains(&self) -> Vec<String> {
        read_dir(&self.dir)
    }

    /// Retrieves a `Domain` instance for a specified domain name.
    ///
    /// # Parameters
    /// - `domain`: The name of the domain to retrieve.
    ///
    /// # Returns
    /// A `Domain` instance corresponding to the specified domain.
    pub fn get_domain(&self, domain: &str) -> Domain {
        Domain::new(domain, self.dir.join(domain))
    }

    /// Archives a URL by downloading and storing its content.
    ///
    /// If the URL does not pass the blacklist check, it will not be archived.
    ///
    /// # Parameters
    /// - `url`: The URL to archive.
    ///
    /// This function downloads the content of the URL, processes it, and saves it to the archive.
    pub async fn archive_url(&self, url: &str) {
        let parsed_url = url::Url::parse(url).unwrap();
        let domain = parsed_url.domain().unwrap().trim_start_matches("www.");

        // Deny blacklisted domains and paths
        if check_blacklist(domain) {
            return;
        }

        let path = parsed_url.path();

        if check_blacklist_path(domain, path) {
            return;
        }

        let mut folder_name = self.dir.join(domain);

        download_fav_for(domain).await;

        for paths in path.split('/') {
            let paths = url_escape::decode(paths).to_string();
            if !paths.is_empty() {
                folder_name = folder_name.join(paths);
            }
        }

        std::fs::create_dir_all(&folder_name).unwrap();

        let timestamp = chrono::Utc::now().format("%Y-%m-%d").to_string();
        let filename = folder_name.join(format!("index_{timestamp}"));

        log::info!("Archiving {url} to {}", filename.to_str().unwrap());

        let conf = get_config()
            .get_domain_config(domain)
            .cloned()
            .unwrap_or_default();

        // Map the domain config onto monolith CLI flags
        let mut cmd = vec!["monolith", "--isolate", "-o", filename.to_str().unwrap()];

        if conf.no_audio.unwrap_or_default() {
            cmd.push("--no-audio");
        }

        if conf.no_css.unwrap_or_default() {
            cmd.push("--no-css");
        }

        if conf.no_frames.unwrap_or_default() {
            cmd.push("--no-frames");
        }

        if conf.no_fonts.unwrap_or_default() {
            cmd.push("--no-fonts");
        }

        if conf.no_image.unwrap_or_default() {
            cmd.push("--no-images");
        }

        if conf.no_javascript.unwrap_or_default() {
            cmd.push("--no-js");
            cmd.push("--unwrap-noscript");
        }

        if conf.no_video.unwrap_or_default() {
            cmd.push("--no-video");
        }

        if let Some(ua) = &conf.user_agent {
            cmd.push("--user-agent");
            cmd.push(ua.as_str());
        }

        let mut url = url::Url::parse(&format!("https://{domain}")).unwrap();
        url = url.join(path).unwrap();
        let url = url.to_string();

        cmd.push(&url);

        run_command(&cmd);

        index_path(&self.get_domain(domain), path).await;
    }
}

fn run_command(cmd: &[&str]) {
    let mut cmd_setup = std::process::Command::new(cmd[0]);
    let cmd_setup = cmd_setup
        .args(cmd.iter().skip(1).collect::<Vec<_>>())
        .stdout(std::process::Stdio::inherit())
        .stderr(std::process::Stdio::inherit());

    let child = cmd_setup.spawn().unwrap();
    let status = child.wait_with_output().unwrap();

    if !status.status.success() {
        log::warn!(
            "Command {cmd:?} exited with code {}",
            status.status.code().unwrap_or_default()
        )
    }
}

pub async fn index_archive_db(arc: &WebsiteArchive) {
    log::info!("Indexing archive");

    for dom in arc.domains() {
        let dom = arc.get_domain(&dom);
        index_path(&dom, "/").await;
    }

    log::info!("Done indexing archive");
}
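// A minimal, hedged demonstration of `run_command`; it assumes a Unix-like
// environment where the `true` and `false` binaries exist on PATH. It shows
// that a failing command only logs a warning and does not panic.
#[cfg(test)]
mod run_command_tests {
    use super::run_command;

    #[test]
    fn exit_status_is_logged_not_propagated() {
        // `true` exits 0; `false` exits 1 and takes the warning branch.
        run_command(&["true"]);
        run_command(&["false"]);
    }
}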
pub async fn index_path(dom: &Domain, path: &str) {
    let (paths, is_doc) = dom.paths(path);

    // If the path itself is a document, index the root document first.
    if is_doc {
        let doc = dom.path("/");
        index_document(&doc).await;
    }

    // Process paths iteratively with a queue instead of recursing
    let mut queue = std::collections::VecDeque::new();

    // Seed the queue with the initial paths
    queue.extend(paths);

    while let Some(next_path) = queue.pop_front() {
        let (next_paths, is_doc) = dom.paths(next_path.path());

        if is_doc {
            let doc = dom.path(next_path.path());
            log::info!(
                "Indexing {} / {} [{} queued]",
                doc.domain,
                doc.path,
                queue.len()
            );
            index_document(&doc).await;
        }

        queue.extend(next_paths);
    }
}

pub async fn index_document(doc: &Document) {
    for version_str in &doc.versions() {
        let domain = &doc.domain;
        let path = &doc.path;

        let version = if let Ok(version) =
            chrono::NaiveDate::parse_from_str(version_str, "%Y-%m-%d")
        {
            version
        } else {
            log::error!(
                "Could not parse version {version_str} as valid date for {} / {}",
                domain,
                path
            );
            continue;
        };

        if DocumentIndex::exists(domain, path, &version).await {
            log::info!(
                "Document {} / {} @ {} already indexed",
                domain,
                path,
                version
            );
            continue;
        }

        if let Ok(content) = doc
            .render_local(
                Some(version_str.to_string()),
                &Shell::new(Nothing(), Nothing(), Nothing()),
            )
            .await
        {
            let size = content.len();
            let mime = get_mime_type(&content).unwrap_or("text/html".to_string());

            if mime.as_str() == "text/html" {
                // TODO : domain links index

                let mut hashes = Vec::new();

                // Store every embedded data URL as a content-addressed fragment
                for (mime, data) in extract_data_urls(&String::from_utf8_lossy(&content)) {
                    let hash = sha256_hash(&data);
                    log::info!("{} / {}: Indexing fragment {hash}", doc.domain, doc.path);
                    hashes.push(hash.clone());

                    sqlx::query("INSERT INTO fragments (id, mime, blob) VALUES ($1, $2, $3) ON CONFLICT DO NOTHING")
                        .bind(&hash)
                        .bind(&mime)
                        .bind(data)
                        .execute(get_pg!())
                        .await
                        .unwrap();
                }

                // Link the fragments to this document version
                for hash in hashes {
                    sqlx::query("INSERT INTO document_fragments (domain, path, version, fragment) VALUES ($1, $2, $3, $4) ON CONFLICT DO NOTHING")
                        .bind(&doc.domain)
                        .bind(&doc.path)
                        .bind(version)
                        .bind(&hash)
                        .execute(get_pg!())
                        .await
                        .unwrap();
                }
            }

            sqlx::query(
                r#"
                INSERT INTO document_index (domain, path, version, size, mime)
                VALUES ($1, $2, $3, $4, $5)
                ON CONFLICT (domain, path, version) DO NOTHING
                "#,
            )
            .bind(&doc.domain)
            .bind(&doc.path)
            .bind(version)
            .bind(size as i64)
            .bind(mime)
            .execute(get_pg!())
            .await
            .unwrap();
        }
    }
}
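// Minimal check of the `/s/<domain>/<path>?time=<version>` URL scheme produced
// by `DocumentIndex::url`; all field values are hypothetical and no database
// access is involved.
#[cfg(test)]
mod document_index_url_tests {
    use super::DocumentIndex;

    #[test]
    fn url_embeds_domain_path_and_version() {
        let idx = DocumentIndex {
            domain: "example.com".to_string(),
            path: "blog/post".to_string(),
            version: chrono::NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(),
            size: 0,
            mime: "text/html".to_string(),
        };

        assert_eq!(idx.url(), "/s/example.com/blog/post?time=2024-01-01");
    }
}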
#[derive(Debug, Clone, FromRow)]
pub struct DocumentIndex {
    pub domain: String,
    pub path: String,
    pub version: chrono::NaiveDate,
    pub size: i64,
    pub mime: String,
}

impl DocumentIndex {
    pub async fn exists(domain: &str, path: &str, version: &chrono::NaiveDate) -> bool {
        let res: Option<DocumentIndex> = sqlx::query_as(
            "SELECT * FROM document_index WHERE domain = $1 AND path = $2 AND version = $3",
        )
        .bind(domain)
        .bind(path)
        .bind(version)
        .fetch_optional(get_pg!())
        .await
        .unwrap();

        res.is_some()
    }

    pub fn url(&self) -> String {
        format!("/s/{}/{}?time={}", self.domain, self.path, self.version)
    }

    pub async fn get_documents_of_day(
        day: NaiveDate,
        domain: Option<&str>,
    ) -> HashMap<String, Vec<String>> {
        let res: Vec<(String, String)> = if let Some(domain) = domain {
            sqlx::query_as(
                "SELECT domain, path FROM document_index WHERE version = $1 AND domain = $2",
            )
            .bind(day)
            .bind(domain)
            .fetch_all(get_pg!())
            .await
            .unwrap()
        } else {
            sqlx::query_as("SELECT domain, path FROM document_index WHERE version = $1")
                .bind(day)
                .fetch_all(get_pg!())
                .await
                .unwrap()
        };

        let mut ret: HashMap<String, Vec<String>> = HashMap::new();

        for (domain, path) in res {
            let d: &mut Vec<String> = ret.entry(domain).or_default();
            d.push(path);
        }

        ret
    }

    pub async fn get_documents_of_other_mime(domain: &str) -> Vec<DocumentIndex> {
        sqlx::query_as("SELECT * FROM document_index WHERE mime != 'text/html' AND domain = $1")
            .bind(domain)
            .fetch_all(get_pg!())
            .await
            .unwrap()
    }
}
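// A hedged filesystem-level sketch of `WebsiteArchive::new` + `domains`: the
// archive layout is one folder per domain, so listing the archive directory
// yields the archived domains. The temp directory name below is hypothetical.
#[cfg(test)]
mod website_archive_tests {
    use super::WebsiteArchive;

    #[test]
    fn domains_reflect_archive_folders() {
        let dir = std::env::temp_dir().join("webarc_domains_test");
        std::fs::create_dir_all(dir.join("example.com")).unwrap();

        let archive = WebsiteArchive::new(dir.to_str().unwrap());
        assert!(archive.domains().contains(&"example.com".to_string()));

        let _ = std::fs::remove_dir_all(&dir);
    }
}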