diff --git a/Cargo.lock b/Cargo.lock index 2699cb4..db10bb9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -202,7 +202,7 @@ checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" [[package]] name = "based" version = "0.1.0" -source = "git+https://git.hydrar.de/jmarya/based#9afe75bc8fd961f050ee31fc9e86e37eb6f8ffb6" +source = "git+https://git.hydrar.de/jmarya/based#696b34f2f17ef2d86f0bc77993f9b0b8b652c0f6" dependencies = [ "bcrypt", "chrono", @@ -307,9 +307,9 @@ checksum = "f61dac84819c6588b558454b194026eb1f09c293b9036ae9b159e74e73ab6cf9" [[package]] name = "cc" -version = "1.2.14" +version = "1.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c3d1b2e905a3a7b00a6141adb0e4c0bb941d11caf55349d863942a1cc44e3c9" +checksum = "c736e259eea577f443d5c86c304f9f4ae0295c43f3ba05c21f1d66b5f06001af" dependencies = [ "shlex", ] @@ -320,6 +320,17 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c" +[[package]] +name = "cfb" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d38f2da7a0a2c4ccf0065be06397cc26a81f4e528be095826eee9d4adbb8c60f" +dependencies = [ + "byteorder", + "fnv", + "uuid", +] + [[package]] name = "cfg-if" version = "1.0.0" @@ -353,9 +364,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.30" +version = "4.5.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92b7b18d71fad5313a1e320fa9897994228ce274b60faa4d694fe0ea89cd9e6d" +checksum = "027bb0d98429ae334a8698531da7077bdf906419543a35a55c2cb1b66437d767" dependencies = [ "clap_builder", "clap_derive", @@ -363,9 +374,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.30" +version = "4.5.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a35db2071778a7344791a4fb4f95308b5673d219dee3ae348b86642574ecc90c" +checksum = "5589e0cba072e0f3d23791efac0fd8627b49c829c196a492e88168e6a669d863" dependencies = [ "anstream", "anstyle", @@ -627,9 +638,9 @@ checksum = "feeef44e73baff3a26d371801df019877a9866a8c493d315ab00177843314f35" [[package]] name = "either" -version = "1.13.0" +version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" +checksum = "b7914353092ddf589ad78f25c5c1c21b7f80b0ff8621e7c814c3485b5306da9d" dependencies = [ "serde", ] @@ -950,9 +961,9 @@ dependencies = [ [[package]] name = "h2" -version = "0.4.7" +version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ccae279728d634d083c00f6099cb58f01cc99c145b84b8be2f6c74618d79922e" +checksum = "5017294ff4bb30944501348f6f8e42e6ad28f42c8bbef7a74029aff064a4e3c2" dependencies = [ "atomic-waker", "bytes", @@ -1185,7 +1196,7 @@ dependencies = [ "bytes", "futures-channel", "futures-util", - "h2 0.4.7", + "h2 0.4.8", "http 1.2.0", "http-body 1.0.1", "httparse", @@ -1445,6 +1456,15 @@ dependencies = [ "serde", ] +[[package]] +name = "infer" +version = "0.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a588916bfdfd92e71cacef98a63d9b1f0d74d6599980d11894290e7ddefffcf7" +dependencies = [ + "cfb", +] + [[package]] name = "inlinable_string" version = "0.1.15" @@ -1453,9 +1473,9 @@ checksum = "c8fae54786f62fb2918dcfae3d568594e50eb9b5c25bf04371af6fe7516452fb" [[package]] name = "inout" -version = "0.1.3" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0c10553d664a4d0bcff9f4215d0aac67a639cc68ef660840afe309b807bc9f5" +checksum = "879f10e63c20629ecabbb64a8010319738c66a5cd0c29b02d63d272b03751d01" dependencies = [ "generic-array", ] @@ -1530,9 +1550,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.169" +version = "0.2.170" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" +checksum = "875b3680cb2f8f71bdcf9a30f38d48282f5d3c95cbf9b3fa57269bb5d5c06828" [[package]] name = "libm" @@ -1574,9 +1594,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.25" +version = "0.4.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04cbf5b083de1c7e0222a7a51dbfdba1cbe1c6ab0b15e29fff3f6c077fd9cd9f" +checksum = "30bde2b3dc3671ae49d8e2e9f044c7c005836e7a023ee57cffa25ab82764bb9e" [[package]] name = "loom" @@ -1680,9 +1700,9 @@ checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" [[package]] name = "miniz_oxide" -version = "0.8.4" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3b1c9bd4fe1f0f8b387f6eb9eb3b4a1aa26185e5750efb9140301703f62cd1b" +checksum = "8e3e04debbb59698c15bacbb6d93584a8c0ca9cc3213cb423d31f760d8843ce5" dependencies = [ "adler2", ] @@ -1719,9 +1739,9 @@ dependencies = [ [[package]] name = "native-tls" -version = "0.2.13" +version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0dab59f8e050d5df8e4dd87d9206fb6f65a483e20ac9fda365ade4fab353196c" +checksum = "87de3442987e9dbec73158d5c715e7ad9072fda936bb03d19d7fa10e00520f0e" dependencies = [ "libc", "log", @@ -2175,9 +2195,9 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.5.8" +version = "0.5.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03a862b389f93e68874fbf580b9de08dd02facb9a788ebadaf4a3fd33cf58834" +checksum = "82b568323e98e49e2a0899dcee453dd679fae22d69adf9b11dd508d1549b7e2f" dependencies = [ "bitflags 2.8.0", ] @@ -2297,7 +2317,7 @@ dependencies = [ "encoding_rs", "futures-core", "futures-util", - "h2 0.4.7", + "h2 0.4.8", "http 1.2.0", "http-body 1.0.1", "http-body-util", @@ -2347,9 +2367,9 @@ dependencies = [ [[package]] name = "ring" -version = "0.17.9" +version = "0.17.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e75ec5e92c4d8aede845126adc388046234541629e76029599ed35a003c7ed24" +checksum = "da5349ae27d3887ca812fb375b45a4fbb36d8d12d2df394968cd86e35683fe73" dependencies = [ "cc", "cfg-if", @@ -2540,7 +2560,7 @@ version = "0.102.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9" dependencies = [ - "ring 0.17.9", + "ring 0.17.11", "rustls-pki-types", "untrusted 0.9.0", ] @@ -2637,18 +2657,18 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.217" +version = "1.0.218" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70" +checksum = "e8dfc9d19bdbf6d17e22319da49161d5d0108e4188e8b680aef6299eed22df60" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.217" +version = "1.0.218" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" +checksum = "f09503e191f4e797cb8aac08e9a4a4695c5edf6a2e70e376d961ddd5c969f82b" dependencies = [ "proc-macro2", "quote", @@ -2668,9 +2688,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.138" +version = "1.0.139" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d434192e7da787e94a6ea7e9670b26a036d0ca41e0b7efb2676dd32bae872949" +checksum = "44f86c3acccc9c65b153fe1b85a3be07fe5515274ec9f0653b4a0875731c72a6" dependencies = [ "itoa", "memchr", @@ -3055,9 +3075,9 @@ dependencies = [ [[package]] name = "string_cache_codegen" -version = "0.5.3" +version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "244292f3441c89febe5b5bdfbb6863aeaf4f64da810ea3050fd927b27b8d92ce" +checksum = "c711928715f1fe0fe509c53b43e993a9a557babc2d0a3567d0a3006f1ac931a0" dependencies = [ "phf_generator", "phf_shared", @@ -3558,9 +3578,9 @@ checksum = "5c1cb5db39152898a79168971543b1cb5020dff7fe43c8dc468b0885f5e29df5" [[package]] name = "unicode-ident" -version = "1.0.16" +version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a210d160f08b701c8721ba1c726c11662f877ea6b7094007e1ca9a1041945034" +checksum = "00e2473a93778eb0bad35909dff6a10d28e63f792f16ed15e404fca9d5eeedbe" [[package]] name = "unicode-normalization" @@ -3641,9 +3661,9 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.13.2" +version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c1f41ffb7cf259f1ecc2876861a17e7142e63ead296f671f81f6ae85903e0d6" +checksum = "93d59ca99a559661b96bf898d8fce28ed87935fd2bea9f05983c1464dd6c71b1" dependencies = [ "getrandom 0.3.1", "serde", @@ -3798,6 +3818,7 @@ dependencies = [ "env_logger", "futures", "html2md", + "infer", "log", "maud", "ollama-rs", @@ -4054,9 +4075,9 @@ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "winnow" -version = "0.7.2" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59690dea168f2198d1a3b0cac23b8063efcd11012f10ae4698f284808c8ef603" +checksum = "0e7f4ea97f6f78012141bcdb6a216b2609f0979ada50b20ca5b52dde2eac2bb1" dependencies = [ "memchr", ] diff --git a/Cargo.toml b/Cargo.toml index 1493c06..da4025e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -25,3 +25,4 @@ html2md = "0.2.14" clap = { version = "4.5.23", features = ["cargo", "derive"] } toml = "0.8.19" url-escape = "0.1.1" +infer = "0.19.0" diff --git a/README.md b/README.md index 784512c..7aa24d1 100644 --- a/README.md +++ b/README.md @@ -9,11 +9,11 @@ web_archive/ ├─ domain.com/ │ ├─ sub/ │ │ ├─ path/ -│ │ │ ├─ index_YYYY_MM_DD.html +│ │ │ ├─ index_YYYY_MM_DD ├─ sub.domain.com/ ``` -Every document of this web archive can then be found at `archive/domain/paths.../index_YYYY_MM_DD.html`. +Every document of this web archive can then be found at `archive/domain/paths.../index_YYYY_MM_DD`. ## Usage webarc provides a CLI tool to work with the archive structure. diff --git a/src/ai.rs b/src/ai.rs index 9739d4f..d782eb8 100644 --- a/src/ai.rs +++ b/src/ai.rs @@ -140,7 +140,8 @@ impl Embedding for Document { ver.as_ref().unwrap_or(&latest) ); - let content_html = self.render_local(ver.clone(), shell).await?; + let content_html = self.render_local(ver.clone(), shell).await.ok()?; + let content_html = String::from_utf8_lossy(&content_html); let content = remove_data_urls(&html2md::parse_html(&content_html)); let mut embeddings = Vec::new(); diff --git a/src/archive/document.rs b/src/archive/document.rs index bd6a136..1637762 100644 --- a/src/archive/document.rs +++ b/src/archive/document.rs @@ -3,9 +3,9 @@ use std::{io::Read, path::PathBuf}; use based::{request::RequestContext, ui::components::prelude::Shell}; use maud::html; -use crate::{blacklist::check_blacklist, conf::get_config, render_page}; +use crate::{blacklist::check_blacklist, render_page}; -use super::{internalize_urls, read_dir}; +use super::read_dir; /// Represents a document within a domain pub struct Document { @@ -52,43 +52,39 @@ impl Document { /// /// # Returns /// An `Option` containing the rendered content as a string, or `None` if nothing could be rendered. - pub async fn render_local(&self, version: Option, shell: &Shell) -> Option { + pub async fn render_local( + &self, + version: Option, + shell: &Shell, + ) -> Result, String> { if check_blacklist(&self.domain) { let content = html! { h3 { "This site is blacklisted" }; }; - return Some( - render_page(content, RequestContext::default(), shell) - .await - .1 - .1, - ); + return Err(render_page(content, RequestContext::default(), shell) + .await + .1 + .1); } let mut file_path = self.doc_dir(); let latest_version = if let Some(version) = version { - format!("index_{version}.html") + format!("index_{version}") } else { let versions = self.versions(); - let version = versions.first().cloned()?; - format!("index_{version}.html") + let version = versions.first().cloned().ok_or(String::new())?; + format!("index_{version}") }; file_path = file_path.join(latest_version); let mut buf = Vec::new(); std::fs::File::open(file_path) - .ok()? + .map_err(|_| String::new())? .read_to_end(&mut buf) .unwrap(); - let content = String::from_utf8_lossy(&buf); - - if get_config().ROUTE_INTERNAL { - Some(internalize_urls(&content, &self.domain)) - } else { - Some(content.to_string()) - } + Ok(buf) } /// Determines the directory where the document is stored. @@ -127,12 +123,8 @@ impl Document { let mut res: Vec = read_dir(&self.doc_dir()) .into_iter() .filter_map(|x| { - if x.starts_with("index_") && x.ends_with(".html") { - return Some( - x.trim_start_matches("index_") - .trim_end_matches(".html") - .to_string(), - ); + if x.starts_with("index_") { + return Some(x.trim_start_matches("index_").to_string()); } None diff --git a/src/archive/domain.rs b/src/archive/domain.rs index 98d6f16..94274f9 100644 --- a/src/archive/domain.rs +++ b/src/archive/domain.rs @@ -98,7 +98,7 @@ impl Domain { .filter(|x| !x.is_empty()) .collect::>() .join("/"); - if entry.starts_with("index_") && entry.ends_with(".html") { + if entry.starts_with("index_") { is_doc = true; continue; } diff --git a/src/archive/mod.rs b/src/archive/mod.rs index 0240ad8..c9bf183 100644 --- a/src/archive/mod.rs +++ b/src/archive/mod.rs @@ -32,7 +32,7 @@ pub fn read_dir(dir: &PathBuf) -> Vec { } /// Rewrite all URLs in `input` to the format `/s//` -fn internalize_urls(input: &str, base: &str) -> String { +pub fn internalize_urls(input: &str, base: &str) -> String { // todo : fix regex, domains without path are not captured let url_pattern = r#"(\ |"|')(?:( {} diff --git a/src/pages/mod.rs b/src/pages/mod.rs index 535b130..3da2127 100644 --- a/src/pages/mod.rs +++ b/src/pages/mod.rs @@ -18,7 +18,7 @@ pub mod component; use component::*; use serde_json::json; -use webarc::archive::{Document, DocumentIndex}; +use webarc::archive::{internalize_urls, Document, DocumentIndex}; use webarc::{ ai::{generate_embedding, remove_data_urls, EmbedStore, SearchResult}, archive::{extract_domains, WebsiteArchive}, @@ -26,6 +26,8 @@ use webarc::{ render_page, }; +// TODO : PDF view + const SEARCH_BAR_STYLE: &str = "w-full px-4 mb-4 py-2 text-white bg-black border-2 border-neon-blue placeholder-neon-blue focus:ring-2 focus:ring-neon-pink focus:outline-none font-mono text-lg"; #[allow(non_snake_case)] @@ -226,12 +228,12 @@ pub async fn domain_info_route( let (path_entries, is_doc) = domain.paths(paths.to_str().unwrap()); let path_seperations: Vec<&str> = paths.to_str().unwrap().split('/').collect(); - let domains = extract_domains( + let domains = extract_domains(&String::from_utf8_lossy( &document .render_local(None, &shell) .await .unwrap_or_default(), - ); + )); let content = html! { h2 class="text-xl font-bold mb-4 flex items-center w-fit mx-auto" { @@ -307,9 +309,12 @@ pub async fn render_txt_website( ) -> Option { let document = arc.get_domain(domain).path(path.to_str().unwrap()); - let mut content = document + let content = document .render_local(time.map(|time| time.to_string()), &shell) - .await?; + .await + .ok()?; + + let mut content = String::from_utf8_lossy(&content).to_string(); if no_data_urls.is_some() { content = remove_data_urls(&content); @@ -376,11 +381,23 @@ pub async fn redownload( arc.archive_url(&format!("https://{domain}/{}", path.to_str())) .await; - let content = document.render_local(None, &shell).await?; + let mut content = document.render_local(None, &shell).await.ok()?; + + let mime = infer::get(&content) + .map(|x| x.mime_type()) + .unwrap_or("text/html"); + + if mime == "text/html" { + if get_config().ROUTE_INTERNAL { + content = internalize_urls(&String::from_utf8_lossy(&content), &domain) + .as_bytes() + .to_vec(); + } + } return Some(DataResponse::new( - content.as_bytes().to_vec(), - "text/html".to_string(), + content.to_vec(), + mime.to_string(), Some(60 * 60 * 24), )); } @@ -414,10 +431,22 @@ pub async fn render_website( // TODO : keep n versions - if let Some(content) = content { + if let Ok(mut content) = content { + let mime = infer::get(&content) + .map(|x| x.mime_type()) + .unwrap_or("text/html"); + + if mime == "text/html" { + if get_config().ROUTE_INTERNAL { + content = internalize_urls(&String::from_utf8_lossy(&content), &domain) + .as_bytes() + .to_vec(); + } + } + return Some(DataResponse::new( - content.as_bytes().to_vec(), - "text/html".to_string(), + content, + mime.to_string(), Some(60 * 60 * 24), )); } else if get_config().DOWNLOAD_ON_DEMAND && time.is_none() {