diff --git a/Cargo.lock b/Cargo.lock
index db10bb9..d101835 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -320,17 +320,6 @@ version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c"
 
-[[package]]
-name = "cfb"
-version = "0.7.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d38f2da7a0a2c4ccf0065be06397cc26a81f4e528be095826eee9d4adbb8c60f"
-dependencies = [
- "byteorder",
- "fnv",
- "uuid",
-]
-
 [[package]]
 name = "cfg-if"
 version = "1.0.0"
@@ -1456,15 +1445,6 @@ dependencies = [
  "serde",
 ]
 
-[[package]]
-name = "infer"
-version = "0.19.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a588916bfdfd92e71cacef98a63d9b1f0d74d6599980d11894290e7ddefffcf7"
-dependencies = [
- "cfb",
-]
-
 [[package]]
 name = "inlinable_string"
 version = "0.1.15"
@@ -3818,7 +3798,6 @@ dependencies = [
  "env_logger",
  "futures",
  "html2md",
- "infer",
  "log",
  "maud",
  "ollama-rs",
diff --git a/Cargo.toml b/Cargo.toml
index da4025e..1493c06 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -25,4 +25,3 @@ html2md = "0.2.14"
 clap = { version = "4.5.23", features = ["cargo", "derive"] }
 toml = "0.8.19"
 url-escape = "0.1.1"
-infer = "0.19.0"
diff --git a/Dockerfile b/Dockerfile
index aa6b421..be93d43 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -13,7 +13,7 @@ RUN cargo build --release
 FROM debian:buster
 
 RUN apt-get update && apt-get upgrade -y
-RUN apt-get install -y ca-certificates openssl
+RUN apt-get install -y ca-certificates openssl file
 
 COPY --from=builder /app/target/release/webarc /webarc
 COPY --from=builder /monolith/target/release/monolith /usr/bin/monolith
diff --git a/migrations/0003_size_mime_index.sql b/migrations/0003_size_mime_index.sql
new file mode 100644
index 0000000..d71e2cf
--- /dev/null
+++ b/migrations/0003_size_mime_index.sql
@@ -0,0 +1,4 @@
+-- Record the rendered size in bytes and the detected MIME type of each indexed version.
+ALTER TABLE document_index
+ADD COLUMN size BIGINT,
+ADD COLUMN mime TEXT;
diff --git a/src/archive/mod.rs b/src/archive/mod.rs
index c9bf183..52b0f35 100644
--- a/src/archive/mod.rs
+++ b/src/archive/mod.rs
@@ -7,11 +7,15 @@ use crate::{
     blacklist::{check_blacklist, check_blacklist_path},
     conf::get_config,
     favicon::download_fav_for,
+    get_mime_type,
 };
 
 mod document;
 mod domain;
-use based::get_pg;
+use based::{
+    get_pg,
+    ui::{components::prelude::Shell, prelude::Nothing},
+};
 use chrono::NaiveDate;
 pub use document::Document;
 pub use domain::*;
@@ -288,20 +292,41 @@ pub async fn index_path(dom: &Domain, path: &str) {
 
+/// Insert a `document_index` row with size and MIME type for each version of `doc`.
+///
+/// Versions that fail to render or whose name is not a `%Y-%m-%d` date are skipped.
 pub async fn index_document(doc: &Document) {
     for version_str in &doc.versions() {
-        if let Ok(version) = chrono::NaiveDate::parse_from_str(version_str, "%Y-%m-%d") {
-            sqlx::query(
-                r#"
-            INSERT INTO document_index (domain, path, version)
-            VALUES ($1, $2, $3)
+        if let Ok(content) = doc
+            .render_local(
+                Some(version_str.to_string()),
+                &Shell::new(Nothing(), Nothing(), Nothing()),
+            )
+            .await
+        {
+            let size = content.len();
+            let mime = get_mime_type(&content).unwrap_or_default();
+
+            if mime.as_str() == "text/html" {
+                // TODO : domain links index
+                // TODO : data fragments
+            }
+
+            if let Ok(version) = chrono::NaiveDate::parse_from_str(version_str, "%Y-%m-%d") {
+                sqlx::query(
+                    r#"
+            INSERT INTO document_index (domain, path, version, size, mime)
+            VALUES ($1, $2, $3, $4, $5)
             ON CONFLICT (domain, path, version) DO NOTHING
         "#,
-            )
-            .bind(&doc.domain)
-            .bind(&doc.path)
-            .bind(version)
-            .execute(get_pg!())
-            .await
-            .unwrap();
+                )
+                .bind(&doc.domain)
+                .bind(&doc.path)
+                .bind(version)
+                .bind(size as i64)
+                .bind(mime)
+                .execute(get_pg!())
+                .await
+                .unwrap();
+            }
         }
     }
 }
diff --git a/src/lib.rs b/src/lib.rs
index 42087b4..912171e 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -10,6 +10,55 @@ pub mod blacklist;
 pub mod conf;
 pub mod favicon;
 
+use std::io::Write;
+use std::process::{Command, Stdio};
+
+/// Detect the MIME type of `content` by piping it through the external `file` utility.
+///
+/// Runs `file --mime-type --brief -` and returns its trimmed standard output.
+///
+/// # Errors
+///
+/// Returns an error if `file` cannot be spawned or waited on, if writing to its stdin
+/// fails for any reason other than a broken pipe, if it exits unsuccessfully, or if it
+/// prints nothing.
+pub fn get_mime_type(content: &[u8]) -> std::io::Result<String> {
+    use std::io::{Error, ErrorKind};
+
+    let mut child = Command::new("file")
+        .arg("--mime-type")
+        .arg("--brief")
+        .arg("-") // Read from stdin
+        .stdin(Stdio::piped())
+        .stdout(Stdio::piped())
+        .spawn()?;
+
+    // `file` may stop reading before all of `content` has been written; the resulting
+    // broken pipe is not a failure. Any other write error is returned only after the
+    // child has been waited on, so the process is not left unreaped.
+    let write_result = match child.stdin.take() {
+        Some(mut stdin) => match stdin.write_all(content) {
+            Err(err) if err.kind() != ErrorKind::BrokenPipe => Err(err),
+            _ => Ok(()),
+        },
+        None => Ok(()),
+    };
+
+    let output = child.wait_with_output()?;
+    write_result?;
+
+    if !output.status.success() {
+        return Err(Error::other(format!("`file` failed: {}", output.status)));
+    }
+
+    let mime = String::from_utf8_lossy(&output.stdout).trim().to_string();
+    if mime.is_empty() {
+        return Err(Error::new(ErrorKind::InvalidData, "`file` produced no output"));
+    }
+
+    Ok(mime)
+}
+
 pub async fn render_page(
     content: PreEscaped<String>,
     ctx: RequestContext,
diff --git a/src/pages/mod.rs b/src/pages/mod.rs
index 3da2127..d5a73cb 100644
--- a/src/pages/mod.rs
+++ b/src/pages/mod.rs
@@ -19,6 +19,7 @@ use component::*;
 use serde_json::json;
 
 use webarc::archive::{internalize_urls, Document, DocumentIndex};
+use webarc::get_mime_type;
 use webarc::{
     ai::{generate_embedding, remove_data_urls, EmbedStore, SearchResult},
     archive::{extract_domains, WebsiteArchive},
@@ -383,9 +384,7 @@ pub async fn redownload(
 
     let mut content = document.render_local(None, &shell).await.ok()?;
 
-    let mime = infer::get(&content)
-        .map(|x| x.mime_type())
-        .unwrap_or("text/html");
+    let mime = get_mime_type(&content).unwrap_or_else(|_| "text/html".to_string());
 
     if mime == "text/html" {
         if get_config().ROUTE_INTERNAL {
@@ -395,11 +394,7 @@ pub async fn redownload(
         }
     }
 
-    return Some(DataResponse::new(
-        content.to_vec(),
-        mime.to_string(),
-        Some(60 * 60 * 24),
-    ));
+    return Some(DataResponse::new(content, mime, Some(60 * 60 * 24)));
 }
 
 /// Return archived version of `domain` / `path` at `time`
@@ -432,9 +427,7 @@ pub async fn render_website(
     // TODO : keep n versions
 
     if let Ok(mut content) = content {
-        let mime = infer::get(&content)
-            .map(|x| x.mime_type())
-            .unwrap_or("text/html");
+        let mime = get_mime_type(&content).unwrap_or_else(|_| "text/html".to_string());
 
         if mime == "text/html" {
             if get_config().ROUTE_INTERNAL {
@@ -444,11 +437,7 @@ pub async fn render_website(
             }
         }
 
-        return Some(DataResponse::new(
-            content,
-            mime.to_string(),
-            Some(60 * 60 * 24),
-        ));
+        return Some(DataResponse::new(content, mime, Some(60 * 60 * 24)));
     } else if get_config().DOWNLOAD_ON_DEMAND && time.is_none() {
         return redownload(&arc, domain, path, &document, &shell).await;
     }