better mime + more index
Some checks are pending
ci/woodpecker/push/build Pipeline is pending

This commit is contained in:
JMARyA 2025-02-25 00:06:48 +01:00
parent 2e5b4fc3d2
commit aba031a047
Signed by: jmarya
GPG key ID: 901B2ADDF27C2263
7 changed files with 64 additions and 52 deletions

21
Cargo.lock generated
View file

@ -320,17 +320,6 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c"
[[package]]
name = "cfb"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d38f2da7a0a2c4ccf0065be06397cc26a81f4e528be095826eee9d4adbb8c60f"
dependencies = [
"byteorder",
"fnv",
"uuid",
]
[[package]]
name = "cfg-if"
version = "1.0.0"
@ -1456,15 +1445,6 @@ dependencies = [
"serde",
]
[[package]]
name = "infer"
version = "0.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a588916bfdfd92e71cacef98a63d9b1f0d74d6599980d11894290e7ddefffcf7"
dependencies = [
"cfb",
]
[[package]]
name = "inlinable_string"
version = "0.1.15"
@ -3818,7 +3798,6 @@ dependencies = [
"env_logger",
"futures",
"html2md",
"infer",
"log",
"maud",
"ollama-rs",

View file

@ -25,4 +25,3 @@ html2md = "0.2.14"
clap = { version = "4.5.23", features = ["cargo", "derive"] }
toml = "0.8.19"
url-escape = "0.1.1"
infer = "0.19.0"

View file

@ -13,7 +13,7 @@ RUN cargo build --release
FROM debian:buster
RUN apt-get update && apt-get upgrade -y
RUN apt-get install -y ca-certificates openssl
RUN apt-get install -y ca-certificates openssl file
COPY --from=builder /app/target/release/webarc /webarc
COPY --from=builder /monolith/target/release/monolith /usr/bin/monolith

View file

@ -0,0 +1,3 @@
-- Add two columns to document_index: `size` (BIGINT) and `mime` (TEXT).
-- Neither column is declared NOT NULL and neither is given a DEFAULT.
ALTER TABLE document_index
ADD COLUMN size BIGINT,
ADD COLUMN mime TEXT;

View file

@ -7,11 +7,15 @@ use crate::{
blacklist::{check_blacklist, check_blacklist_path},
conf::get_config,
favicon::download_fav_for,
get_mime_type,
};
mod document;
mod domain;
use based::get_pg;
use based::{
get_pg,
ui::{components::prelude::Shell, prelude::Nothing},
};
use chrono::NaiveDate;
pub use document::Document;
pub use domain::*;
@ -288,23 +292,41 @@ pub async fn index_path(dom: &Domain, path: &str) {
/// Records the stored versions of `doc` in the `document_index` table.
///
/// Every version string returned by `doc.versions()` is rendered locally; versions
/// whose render fails are skipped. For the rest, the rendered length and the MIME
/// type reported by `get_mime_type` are bound together with the document's domain,
/// path and version date.
///
/// # Panics
///
/// Panics if executing the insert fails, because the query result is unwrapped.
pub async fn index_document(doc: &Document) {
for version_str in &doc.versions() {
if let Ok(content) = doc
.render_local(
Some(version_str.to_string()),
&Shell::new(Nothing(), Nothing(), Nothing()),
)
.await
{
// Length of the rendered content; bound below as the `size` value.
let size = content.len();
// A failed MIME detection falls back to an empty string.
let mime = get_mime_type(&content).unwrap_or_default();
if mime.as_str() == "text/html" {
// TODO : domain links index
// TODO : data fragments
}
// Only version strings in `%Y-%m-%d` form are indexed; others are ignored.
if let Ok(version) = chrono::NaiveDate::parse_from_str(version_str, "%Y-%m-%d") {
// NOTE(review): the SQL text below shows both a three-column and a five-column
// INSERT/VALUES pair. Five values are bound, so presumably only the five-column
// form is meant to be live — confirm against the actual file.
// `ON CONFLICT ... DO NOTHING` leaves an existing (domain, path, version) row untouched.
sqlx::query(
r#"
INSERT INTO document_index (domain, path, version)
VALUES ($1, $2, $3)
INSERT INTO document_index (domain, path, version, size, mime)
VALUES ($1, $2, $3, $4, $5)
ON CONFLICT (domain, path, version) DO NOTHING
"#,
)
.bind(&doc.domain)
.bind(&doc.path)
.bind(version)
.bind(size as i64)
.bind(mime)
.execute(get_pg!())
.await
.unwrap();
}
}
}
}
/// Type with no fields.
///
/// NOTE(review): presumably a namespace for queries against the `document_index`
/// table; its methods are not visible here — confirm against its `impl` block.
pub struct DocumentIndex {}

View file

@ -10,6 +10,26 @@ pub mod blacklist;
pub mod conf;
pub mod favicon;
use std::io::Write;
use std::process::{Command, Stdio};
/// Detects the MIME type of `content` by piping it to the external `file` utility.
///
/// Runs `file --mime-type --brief -`, writes `content` to its standard input and
/// returns the trimmed text the tool prints on standard output.
///
/// # Errors
///
/// Returns an error when `file` cannot be spawned, when writing to or waiting on
/// the child fails, when the child exits with a non-zero status, or when it
/// prints nothing. A `BrokenPipe` while writing is not treated as an error.
pub fn get_mime_type(content: &[u8]) -> std::io::Result<String> {
    let mut child = Command::new("file")
        .arg("--mime-type")
        .arg("--brief")
        .arg("-") // Read from stdin
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
        .spawn()?;

    // The handle is moved into the match arm and dropped when the arm ends, which
    // closes the pipe so the child sees end-of-input. The write result is kept
    // instead of propagated so the child is always waited on below.
    let feed = match child.stdin.take() {
        Some(mut stdin) => stdin.write_all(content),
        None => Ok(()),
    };

    // Reap the child even if feeding it failed, so no unwaited process is left.
    let output = child.wait_with_output()?;

    // If the child closes its stdin before everything was written, the write ends
    // with `BrokenPipe`; whatever it printed is still usable. Other write errors
    // are real failures.
    match feed {
        Err(err) if err.kind() != std::io::ErrorKind::BrokenPipe => return Err(err),
        _ => {}
    }

    if !output.status.success() {
        return Err(std::io::Error::new(
            std::io::ErrorKind::Other,
            format!("`file` exited with {}", output.status),
        ));
    }

    let mime = String::from_utf8_lossy(&output.stdout).trim().to_string();
    if mime.is_empty() {
        // An empty answer is reported as an error so callers apply their fallback
        // instead of using a blank content type.
        return Err(std::io::Error::new(
            std::io::ErrorKind::InvalidData,
            "`file` produced no output",
        ));
    }
    Ok(mime)
}
pub async fn render_page(
content: PreEscaped<String>,
ctx: RequestContext,

View file

@ -19,6 +19,7 @@ use component::*;
use serde_json::json;
use webarc::archive::{internalize_urls, Document, DocumentIndex};
use webarc::get_mime_type;
use webarc::{
ai::{generate_embedding, remove_data_urls, EmbedStore, SearchResult},
archive::{extract_domains, WebsiteArchive},
@ -383,9 +384,7 @@ pub async fn redownload(
let mut content = document.render_local(None, &shell).await.ok()?;
let mime = infer::get(&content)
.map(|x| x.mime_type())
.unwrap_or("text/html");
let mime = get_mime_type(&content).unwrap_or("text/html".to_string());
if mime == "text/html" {
if get_config().ROUTE_INTERNAL {
@ -395,11 +394,7 @@ pub async fn redownload(
}
}
return Some(DataResponse::new(
content.to_vec(),
mime.to_string(),
Some(60 * 60 * 24),
));
return Some(DataResponse::new(content, mime, Some(60 * 60 * 24)));
}
/// Return archived version of `domain` / `path` at `time`
@ -432,9 +427,7 @@ pub async fn render_website(
// TODO : keep n versions
if let Ok(mut content) = content {
let mime = infer::get(&content)
.map(|x| x.mime_type())
.unwrap_or("text/html");
let mime = get_mime_type(&content).unwrap_or("text/html".to_string());
if mime == "text/html" {
if get_config().ROUTE_INTERNAL {
@ -444,11 +437,7 @@ pub async fn render_website(
}
}
return Some(DataResponse::new(
content,
mime.to_string(),
Some(60 * 60 * 24),
));
return Some(DataResponse::new(content, mime, Some(60 * 60 * 24)));
} else if get_config().DOWNLOAD_ON_DEMAND && time.is_none() {
return redownload(&arc, domain, path, &document, &shell).await;
}