better mime + more index
Some checks are pending
ci/woodpecker/push/build Pipeline is pending

This commit is contained in:
JMARyA 2025-02-25 00:06:48 +01:00
parent 2e5b4fc3d2
commit aba031a047
Signed by: jmarya
GPG key ID: 901B2ADDF27C2263
7 changed files with 64 additions and 52 deletions

View file

@ -7,11 +7,15 @@ use crate::{
blacklist::{check_blacklist, check_blacklist_path},
conf::get_config,
favicon::download_fav_for,
get_mime_type,
};
mod document;
mod domain;
use based::get_pg;
use based::{
get_pg,
ui::{components::prelude::Shell, prelude::Nothing},
};
use chrono::NaiveDate;
pub use document::Document;
pub use domain::*;
@ -288,20 +292,38 @@ pub async fn index_path(dom: &Domain, path: &str) {
pub async fn index_document(doc: &Document) {
for version_str in &doc.versions() {
if let Ok(version) = chrono::NaiveDate::parse_from_str(version_str, "%Y-%m-%d") {
sqlx::query(
r#"
INSERT INTO document_index (domain, path, version)
VALUES ($1, $2, $3)
if let Ok(content) = doc
.render_local(
Some(version_str.to_string()),
&Shell::new(Nothing(), Nothing(), Nothing()),
)
.await
{
let size = content.len();
let mime = get_mime_type(&content).unwrap_or_default();
if mime.as_str() == "text/html" {
// TODO : domain links index
// TODO : data fragments
}
if let Ok(version) = chrono::NaiveDate::parse_from_str(version_str, "%Y-%m-%d") {
sqlx::query(
r#"
INSERT INTO document_index (domain, path, version, size, mime)
VALUES ($1, $2, $3, $4, $5)
ON CONFLICT (domain, path, version) DO NOTHING
"#,
)
.bind(&doc.domain)
.bind(&doc.path)
.bind(version)
.execute(get_pg!())
.await
.unwrap();
)
.bind(&doc.domain)
.bind(&doc.path)
.bind(version)
.bind(size as i64)
.bind(mime)
.execute(get_pg!())
.await
.unwrap();
}
}
}
}

View file

@ -10,6 +10,26 @@ pub mod blacklist;
pub mod conf;
pub mod favicon;
use std::io::Write;
use std::process::{Command, Stdio};
/// Detects the MIME type of `content` by piping it to the external `file` utility.
///
/// Runs `file --mime-type --brief -`, feeds `content` to the child's stdin and
/// returns the trimmed text the child printed on stdout (e.g. `text/html`).
///
/// This call blocks the current thread until the child process has exited.
///
/// # Errors
///
/// Returns an error if the `file` process cannot be spawned, if writing to its
/// stdin fails for a reason other than the child closing the pipe early, if
/// waiting for the child fails, or if the child exits with a non-zero status.
pub fn get_mime_type(content: &[u8]) -> std::io::Result<String> {
    let mut child = Command::new("file")
        .arg("--mime-type")
        .arg("--brief")
        .arg("-") // Read from stdin
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
        .spawn()?;

    // The stdin handle is dropped at the end of the match arm, which closes the
    // pipe so the child sees EOF. The write result is inspected only after the
    // child has been reaped, so an I/O error never leaves a zombie behind.
    let write_result = match child.stdin.take() {
        Some(mut stdin) => stdin.write_all(content),
        None => Ok(()),
    };

    let output = child.wait_with_output()?;

    // `file` only inspects a bounded prefix of its input and may exit before
    // everything has been written; the resulting broken pipe is not a failure,
    // the detection result is still available on stdout.
    if let Err(err) = write_result {
        if err.kind() != std::io::ErrorKind::BrokenPipe {
            return Err(err);
        }
    }

    if !output.status.success() {
        return Err(std::io::Error::new(
            std::io::ErrorKind::Other,
            format!("`file` exited with {}", output.status),
        ));
    }

    Ok(String::from_utf8_lossy(&output.stdout).trim().to_string())
}
pub async fn render_page(
content: PreEscaped<String>,
ctx: RequestContext,

View file

@ -19,6 +19,7 @@ use component::*;
use serde_json::json;
use webarc::archive::{internalize_urls, Document, DocumentIndex};
use webarc::get_mime_type;
use webarc::{
ai::{generate_embedding, remove_data_urls, EmbedStore, SearchResult},
archive::{extract_domains, WebsiteArchive},
@ -383,9 +384,7 @@ pub async fn redownload(
let mut content = document.render_local(None, &shell).await.ok()?;
let mime = infer::get(&content)
.map(|x| x.mime_type())
.unwrap_or("text/html");
let mime = get_mime_type(&content).unwrap_or("text/html".to_string());
if mime == "text/html" {
if get_config().ROUTE_INTERNAL {
@ -395,11 +394,7 @@ pub async fn redownload(
}
}
return Some(DataResponse::new(
content.to_vec(),
mime.to_string(),
Some(60 * 60 * 24),
));
return Some(DataResponse::new(content, mime, Some(60 * 60 * 24)));
}
/// Return archived version of `domain` / `path` at `time`
@ -432,9 +427,7 @@ pub async fn render_website(
// TODO : keep n versions
if let Ok(mut content) = content {
let mime = infer::get(&content)
.map(|x| x.mime_type())
.unwrap_or("text/html");
let mime = get_mime_type(&content).unwrap_or("text/html".to_string());
if mime == "text/html" {
if get_config().ROUTE_INTERNAL {
@ -444,11 +437,7 @@ pub async fn render_website(
}
}
return Some(DataResponse::new(
content,
mime.to_string(),
Some(60 * 60 * 24),
));
return Some(DataResponse::new(content, mime, Some(60 * 60 * 24)));
} else if get_config().DOWNLOAD_ON_DEMAND && time.is_none() {
return redownload(&arc, domain, path, &document, &shell).await;
}