This commit is contained in:
parent
2e5b4fc3d2
commit
aba031a047
7 changed files with 64 additions and 52 deletions
21
Cargo.lock
generated
21
Cargo.lock
generated
|
@ -320,17 +320,6 @@ version = "1.1.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c"
|
checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "cfb"
|
|
||||||
version = "0.7.3"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "d38f2da7a0a2c4ccf0065be06397cc26a81f4e528be095826eee9d4adbb8c60f"
|
|
||||||
dependencies = [
|
|
||||||
"byteorder",
|
|
||||||
"fnv",
|
|
||||||
"uuid",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "cfg-if"
|
name = "cfg-if"
|
||||||
version = "1.0.0"
|
version = "1.0.0"
|
||||||
|
@ -1456,15 +1445,6 @@ dependencies = [
|
||||||
"serde",
|
"serde",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "infer"
|
|
||||||
version = "0.19.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "a588916bfdfd92e71cacef98a63d9b1f0d74d6599980d11894290e7ddefffcf7"
|
|
||||||
dependencies = [
|
|
||||||
"cfb",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "inlinable_string"
|
name = "inlinable_string"
|
||||||
version = "0.1.15"
|
version = "0.1.15"
|
||||||
|
@ -3818,7 +3798,6 @@ dependencies = [
|
||||||
"env_logger",
|
"env_logger",
|
||||||
"futures",
|
"futures",
|
||||||
"html2md",
|
"html2md",
|
||||||
"infer",
|
|
||||||
"log",
|
"log",
|
||||||
"maud",
|
"maud",
|
||||||
"ollama-rs",
|
"ollama-rs",
|
||||||
|
|
|
@ -25,4 +25,3 @@ html2md = "0.2.14"
|
||||||
clap = { version = "4.5.23", features = ["cargo", "derive"] }
|
clap = { version = "4.5.23", features = ["cargo", "derive"] }
|
||||||
toml = "0.8.19"
|
toml = "0.8.19"
|
||||||
url-escape = "0.1.1"
|
url-escape = "0.1.1"
|
||||||
infer = "0.19.0"
|
|
||||||
|
|
|
@ -13,7 +13,7 @@ RUN cargo build --release
|
||||||
FROM debian:buster
|
FROM debian:buster
|
||||||
|
|
||||||
RUN apt-get update && apt-get upgrade -y
|
RUN apt-get update && apt-get upgrade -y
|
||||||
RUN apt-get install -y ca-certificates openssl
|
RUN apt-get install -y ca-certificates openssl file
|
||||||
|
|
||||||
COPY --from=builder /app/target/release/webarc /webarc
|
COPY --from=builder /app/target/release/webarc /webarc
|
||||||
COPY --from=builder /monolith/target/release/monolith /usr/bin/monolith
|
COPY --from=builder /monolith/target/release/monolith /usr/bin/monolith
|
||||||
|
|
3
migrations/0003_size_mime_index.sql
Normal file
3
migrations/0003_size_mime_index.sql
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
ALTER TABLE document_index
|
||||||
|
ADD COLUMN size BIGINT,
|
||||||
|
ADD COLUMN mime TEXT;
|
|
@ -7,11 +7,15 @@ use crate::{
|
||||||
blacklist::{check_blacklist, check_blacklist_path},
|
blacklist::{check_blacklist, check_blacklist_path},
|
||||||
conf::get_config,
|
conf::get_config,
|
||||||
favicon::download_fav_for,
|
favicon::download_fav_for,
|
||||||
|
get_mime_type,
|
||||||
};
|
};
|
||||||
|
|
||||||
mod document;
|
mod document;
|
||||||
mod domain;
|
mod domain;
|
||||||
use based::get_pg;
|
use based::{
|
||||||
|
get_pg,
|
||||||
|
ui::{components::prelude::Shell, prelude::Nothing},
|
||||||
|
};
|
||||||
use chrono::NaiveDate;
|
use chrono::NaiveDate;
|
||||||
pub use document::Document;
|
pub use document::Document;
|
||||||
pub use domain::*;
|
pub use domain::*;
|
||||||
|
@ -288,20 +292,38 @@ pub async fn index_path(dom: &Domain, path: &str) {
|
||||||
|
|
||||||
pub async fn index_document(doc: &Document) {
|
pub async fn index_document(doc: &Document) {
|
||||||
for version_str in &doc.versions() {
|
for version_str in &doc.versions() {
|
||||||
if let Ok(version) = chrono::NaiveDate::parse_from_str(version_str, "%Y-%m-%d") {
|
if let Ok(content) = doc
|
||||||
sqlx::query(
|
.render_local(
|
||||||
r#"
|
Some(version_str.to_string()),
|
||||||
INSERT INTO document_index (domain, path, version)
|
&Shell::new(Nothing(), Nothing(), Nothing()),
|
||||||
VALUES ($1, $2, $3)
|
)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
let size = content.len();
|
||||||
|
let mime = get_mime_type(&content).unwrap_or_default();
|
||||||
|
|
||||||
|
if mime.as_str() == "text/html" {
|
||||||
|
// TODO : domain links index
|
||||||
|
// TODO : data fragments
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Ok(version) = chrono::NaiveDate::parse_from_str(version_str, "%Y-%m-%d") {
|
||||||
|
sqlx::query(
|
||||||
|
r#"
|
||||||
|
INSERT INTO document_index (domain, path, version, size, mime)
|
||||||
|
VALUES ($1, $2, $3, $4, $5)
|
||||||
ON CONFLICT (domain, path, version) DO NOTHING
|
ON CONFLICT (domain, path, version) DO NOTHING
|
||||||
"#,
|
"#,
|
||||||
)
|
)
|
||||||
.bind(&doc.domain)
|
.bind(&doc.domain)
|
||||||
.bind(&doc.path)
|
.bind(&doc.path)
|
||||||
.bind(version)
|
.bind(version)
|
||||||
.execute(get_pg!())
|
.bind(size as i64)
|
||||||
.await
|
.bind(mime)
|
||||||
.unwrap();
|
.execute(get_pg!())
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
20
src/lib.rs
20
src/lib.rs
|
@ -10,6 +10,26 @@ pub mod blacklist;
|
||||||
pub mod conf;
|
pub mod conf;
|
||||||
pub mod favicon;
|
pub mod favicon;
|
||||||
|
|
||||||
|
use std::io::Write;
|
||||||
|
use std::process::{Command, Stdio};
|
||||||
|
|
||||||
|
/// Detects the MIME type of `content` by piping it to the external `file` utility.
///
/// Spawns `file --mime-type --brief -`, writes `content` to the child's stdin and
/// returns the child's stdout with surrounding whitespace trimmed
/// (for example `text/html`).
///
/// # Errors
///
/// Returns an error if the `file` process cannot be spawned, if writing to its
/// stdin fails for any reason other than the child closing the pipe early, if
/// waiting for the child fails, or if the child exits with a non-success status.
pub fn get_mime_type(content: &[u8]) -> std::io::Result<String> {
    let mut child = Command::new("file")
        .arg("--mime-type")
        .arg("--brief")
        .arg("-") // Read from stdin
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
        .spawn()?;

    // Keep the write result instead of returning right away: the child must be
    // reaped in every case. The stdin handle is dropped when the closure ends,
    // which signals EOF to the child.
    let write_result = child
        .stdin
        .take()
        .map_or(Ok(()), |mut stdin| stdin.write_all(content));

    let output = child.wait_with_output()?;

    // The child may stop reading before all of a large input has been written;
    // that shows up here as `BrokenPipe` and does not invalidate the answer it
    // already produced. Any other write error is a real failure.
    if let Err(err) = write_result {
        if err.kind() != std::io::ErrorKind::BrokenPipe {
            return Err(err);
        }
    }

    if !output.status.success() {
        return Err(std::io::Error::new(
            std::io::ErrorKind::Other,
            format!("`file` exited with {}", output.status),
        ));
    }

    Ok(String::from_utf8_lossy(&output.stdout).trim().to_string())
}
|
||||||
|
|
||||||
pub async fn render_page(
|
pub async fn render_page(
|
||||||
content: PreEscaped<String>,
|
content: PreEscaped<String>,
|
||||||
ctx: RequestContext,
|
ctx: RequestContext,
|
||||||
|
|
|
@ -19,6 +19,7 @@ use component::*;
|
||||||
use serde_json::json;
|
use serde_json::json;
|
||||||
|
|
||||||
use webarc::archive::{internalize_urls, Document, DocumentIndex};
|
use webarc::archive::{internalize_urls, Document, DocumentIndex};
|
||||||
|
use webarc::get_mime_type;
|
||||||
use webarc::{
|
use webarc::{
|
||||||
ai::{generate_embedding, remove_data_urls, EmbedStore, SearchResult},
|
ai::{generate_embedding, remove_data_urls, EmbedStore, SearchResult},
|
||||||
archive::{extract_domains, WebsiteArchive},
|
archive::{extract_domains, WebsiteArchive},
|
||||||
|
@ -383,9 +384,7 @@ pub async fn redownload(
|
||||||
|
|
||||||
let mut content = document.render_local(None, &shell).await.ok()?;
|
let mut content = document.render_local(None, &shell).await.ok()?;
|
||||||
|
|
||||||
let mime = infer::get(&content)
|
let mime = get_mime_type(&content).unwrap_or("text/html".to_string());
|
||||||
.map(|x| x.mime_type())
|
|
||||||
.unwrap_or("text/html");
|
|
||||||
|
|
||||||
if mime == "text/html" {
|
if mime == "text/html" {
|
||||||
if get_config().ROUTE_INTERNAL {
|
if get_config().ROUTE_INTERNAL {
|
||||||
|
@ -395,11 +394,7 @@ pub async fn redownload(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return Some(DataResponse::new(
|
return Some(DataResponse::new(content, mime, Some(60 * 60 * 24)));
|
||||||
content.to_vec(),
|
|
||||||
mime.to_string(),
|
|
||||||
Some(60 * 60 * 24),
|
|
||||||
));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Return archived version of `domain` / `path` at `time`
|
/// Return archived version of `domain` / `path` at `time`
|
||||||
|
@ -432,9 +427,7 @@ pub async fn render_website(
|
||||||
// TODO : keep n versions
|
// TODO : keep n versions
|
||||||
|
|
||||||
if let Ok(mut content) = content {
|
if let Ok(mut content) = content {
|
||||||
let mime = infer::get(&content)
|
let mime = get_mime_type(&content).unwrap_or("text/html".to_string());
|
||||||
.map(|x| x.mime_type())
|
|
||||||
.unwrap_or("text/html");
|
|
||||||
|
|
||||||
if mime == "text/html" {
|
if mime == "text/html" {
|
||||||
if get_config().ROUTE_INTERNAL {
|
if get_config().ROUTE_INTERNAL {
|
||||||
|
@ -444,11 +437,7 @@ pub async fn render_website(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return Some(DataResponse::new(
|
return Some(DataResponse::new(content, mime, Some(60 * 60 * 24)));
|
||||||
content,
|
|
||||||
mime.to_string(),
|
|
||||||
Some(60 * 60 * 24),
|
|
||||||
));
|
|
||||||
} else if get_config().DOWNLOAD_ON_DEMAND && time.is_none() {
|
} else if get_config().DOWNLOAD_ON_DEMAND && time.is_none() {
|
||||||
return redownload(&arc, domain, path, &document, &shell).await;
|
return redownload(&arc, domain, path, &document, &shell).await;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Reference in a new issue