This commit is contained in:
parent
2e5b4fc3d2
commit
aba031a047
7 changed files with 64 additions and 52 deletions
21
Cargo.lock
generated
21
Cargo.lock
generated
|
@ -320,17 +320,6 @@ version = "1.1.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c"
|
||||
|
||||
[[package]]
|
||||
name = "cfb"
|
||||
version = "0.7.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d38f2da7a0a2c4ccf0065be06397cc26a81f4e528be095826eee9d4adbb8c60f"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
"fnv",
|
||||
"uuid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cfg-if"
|
||||
version = "1.0.0"
|
||||
|
@ -1456,15 +1445,6 @@ dependencies = [
|
|||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "infer"
|
||||
version = "0.19.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a588916bfdfd92e71cacef98a63d9b1f0d74d6599980d11894290e7ddefffcf7"
|
||||
dependencies = [
|
||||
"cfb",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "inlinable_string"
|
||||
version = "0.1.15"
|
||||
|
@ -3818,7 +3798,6 @@ dependencies = [
|
|||
"env_logger",
|
||||
"futures",
|
||||
"html2md",
|
||||
"infer",
|
||||
"log",
|
||||
"maud",
|
||||
"ollama-rs",
|
||||
|
|
|
@ -25,4 +25,3 @@ html2md = "0.2.14"
|
|||
clap = { version = "4.5.23", features = ["cargo", "derive"] }
|
||||
toml = "0.8.19"
|
||||
url-escape = "0.1.1"
|
||||
infer = "0.19.0"
|
||||
|
|
|
@ -13,7 +13,7 @@ RUN cargo build --release
|
|||
FROM debian:buster
|
||||
|
||||
RUN apt-get update && apt-get upgrade -y
|
||||
RUN apt-get install -y ca-certificates openssl
|
||||
RUN apt-get install -y ca-certificates openssl file
|
||||
|
||||
COPY --from=builder /app/target/release/webarc /webarc
|
||||
COPY --from=builder /monolith/target/release/monolith /usr/bin/monolith
|
||||
|
|
3
migrations/0003_size_mime_index.sql
Normal file
3
migrations/0003_size_mime_index.sql
Normal file
|
@ -0,0 +1,3 @@
|
|||
-- Record the size (in bytes) and the detected MIME type of each indexed
-- document version. Neither column has a NOT NULL constraint or a default,
-- so rows that already exist keep NULL in both until they are written again.
ALTER TABLE document_index ADD COLUMN size BIGINT;
ALTER TABLE document_index ADD COLUMN mime TEXT;
|
|
@ -7,11 +7,15 @@ use crate::{
|
|||
blacklist::{check_blacklist, check_blacklist_path},
|
||||
conf::get_config,
|
||||
favicon::download_fav_for,
|
||||
get_mime_type,
|
||||
};
|
||||
|
||||
mod document;
|
||||
mod domain;
|
||||
use based::get_pg;
|
||||
use based::{
|
||||
get_pg,
|
||||
ui::{components::prelude::Shell, prelude::Nothing},
|
||||
};
|
||||
use chrono::NaiveDate;
|
||||
pub use document::Document;
|
||||
pub use domain::*;
|
||||
|
@ -288,23 +292,41 @@ pub async fn index_path(dom: &Domain, path: &str) {
|
|||
|
||||
pub async fn index_document(doc: &Document) {
|
||||
for version_str in &doc.versions() {
|
||||
if let Ok(content) = doc
|
||||
.render_local(
|
||||
Some(version_str.to_string()),
|
||||
&Shell::new(Nothing(), Nothing(), Nothing()),
|
||||
)
|
||||
.await
|
||||
{
|
||||
let size = content.len();
|
||||
let mime = get_mime_type(&content).unwrap_or_default();
|
||||
|
||||
if mime.as_str() == "text/html" {
|
||||
// TODO : domain links index
|
||||
// TODO : data fragments
|
||||
}
|
||||
|
||||
if let Ok(version) = chrono::NaiveDate::parse_from_str(version_str, "%Y-%m-%d") {
|
||||
sqlx::query(
|
||||
r#"
|
||||
INSERT INTO document_index (domain, path, version)
|
||||
VALUES ($1, $2, $3)
|
||||
INSERT INTO document_index (domain, path, version, size, mime)
|
||||
VALUES ($1, $2, $3, $4, $5)
|
||||
ON CONFLICT (domain, path, version) DO NOTHING
|
||||
"#,
|
||||
)
|
||||
.bind(&doc.domain)
|
||||
.bind(&doc.path)
|
||||
.bind(version)
|
||||
.bind(size as i64)
|
||||
.bind(mime)
|
||||
.execute(get_pg!())
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct DocumentIndex {}
|
||||
|
||||
|
|
20
src/lib.rs
20
src/lib.rs
|
@ -10,6 +10,26 @@ pub mod blacklist;
|
|||
pub mod conf;
|
||||
pub mod favicon;
|
||||
|
||||
use std::io::Write;
|
||||
use std::process::{Command, Stdio};
|
||||
|
||||
/// Detects the MIME type of `content` by piping it to the external `file`
/// utility.
///
/// Runs `file --mime-type --brief -`, writes `content` to the child's standard
/// input and returns the trimmed text the child printed on standard output.
///
/// This call blocks until the child process has exited.
///
/// # Errors
///
/// Returns an error when:
/// - the `file` command cannot be spawned,
/// - writing to its stdin fails for any reason other than a closed pipe,
/// - waiting for the child fails,
/// - the child exits with a non-zero status, or
/// - the child prints nothing on stdout.
pub fn get_mime_type(content: &[u8]) -> std::io::Result<String> {
    let mut child = Command::new("file")
        .arg("--mime-type")
        .arg("--brief")
        .arg("-") // Read from stdin
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
        .spawn()?;

    // Keep the write result instead of returning early with `?`, so the child
    // is always waited on below. The stdin handle is dropped at the end of the
    // match arm, which closes the pipe and signals end-of-input to the child.
    let write_result = match child.stdin.take() {
        Some(mut stdin) => stdin.write_all(content),
        None => Ok(()),
    };

    let output = child.wait_with_output()?;

    // The child may close its end of the pipe before it has consumed all of
    // `content`; the write then fails with `BrokenPipe` even though the child
    // produced an answer. Only other write errors are real failures.
    if let Err(err) = write_result {
        if err.kind() != std::io::ErrorKind::BrokenPipe {
            return Err(err);
        }
    }

    if !output.status.success() {
        return Err(std::io::Error::new(
            std::io::ErrorKind::Other,
            format!("`file` exited with {}", output.status),
        ));
    }

    let mime = String::from_utf8_lossy(&output.stdout).trim().to_string();
    if mime.is_empty() {
        return Err(std::io::Error::new(
            std::io::ErrorKind::InvalidData,
            "`file` produced no output",
        ));
    }

    Ok(mime)
}
|
||||
|
||||
pub async fn render_page(
|
||||
content: PreEscaped<String>,
|
||||
ctx: RequestContext,
|
||||
|
|
|
@ -19,6 +19,7 @@ use component::*;
|
|||
use serde_json::json;
|
||||
|
||||
use webarc::archive::{internalize_urls, Document, DocumentIndex};
|
||||
use webarc::get_mime_type;
|
||||
use webarc::{
|
||||
ai::{generate_embedding, remove_data_urls, EmbedStore, SearchResult},
|
||||
archive::{extract_domains, WebsiteArchive},
|
||||
|
@ -383,9 +384,7 @@ pub async fn redownload(
|
|||
|
||||
let mut content = document.render_local(None, &shell).await.ok()?;
|
||||
|
||||
let mime = infer::get(&content)
|
||||
.map(|x| x.mime_type())
|
||||
.unwrap_or("text/html");
|
||||
let mime = get_mime_type(&content).unwrap_or("text/html".to_string());
|
||||
|
||||
if mime == "text/html" {
|
||||
if get_config().ROUTE_INTERNAL {
|
||||
|
@ -395,11 +394,7 @@ pub async fn redownload(
|
|||
}
|
||||
}
|
||||
|
||||
return Some(DataResponse::new(
|
||||
content.to_vec(),
|
||||
mime.to_string(),
|
||||
Some(60 * 60 * 24),
|
||||
));
|
||||
return Some(DataResponse::new(content, mime, Some(60 * 60 * 24)));
|
||||
}
|
||||
|
||||
/// Return archived version of `domain` / `path` at `time`
|
||||
|
@ -432,9 +427,7 @@ pub async fn render_website(
|
|||
// TODO : keep n versions
|
||||
|
||||
if let Ok(mut content) = content {
|
||||
let mime = infer::get(&content)
|
||||
.map(|x| x.mime_type())
|
||||
.unwrap_or("text/html");
|
||||
let mime = get_mime_type(&content).unwrap_or("text/html".to_string());
|
||||
|
||||
if mime == "text/html" {
|
||||
if get_config().ROUTE_INTERNAL {
|
||||
|
@ -444,11 +437,7 @@ pub async fn render_website(
|
|||
}
|
||||
}
|
||||
|
||||
return Some(DataResponse::new(
|
||||
content,
|
||||
mime.to_string(),
|
||||
Some(60 * 60 * 24),
|
||||
));
|
||||
return Some(DataResponse::new(content, mime, Some(60 * 60 * 24)));
|
||||
} else if get_config().DOWNLOAD_ON_DEMAND && time.is_none() {
|
||||
return redownload(&arc, domain, path, &document, &shell).await;
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue