any mime
Some checks are pending
ci/woodpecker/push/build Pipeline is pending

This commit is contained in:
JMARyA 2025-02-24 19:30:20 +01:00
parent a9f758cd9b
commit 2e5b4fc3d2
Signed by: jmarya
GPG key ID: 901B2ADDF27C2263
9 changed files with 141 additions and 89 deletions

105
Cargo.lock generated
View file

@ -202,7 +202,7 @@ checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"
[[package]]
name = "based"
version = "0.1.0"
source = "git+https://git.hydrar.de/jmarya/based#9afe75bc8fd961f050ee31fc9e86e37eb6f8ffb6"
source = "git+https://git.hydrar.de/jmarya/based#696b34f2f17ef2d86f0bc77993f9b0b8b652c0f6"
dependencies = [
"bcrypt",
"chrono",
@ -307,9 +307,9 @@ checksum = "f61dac84819c6588b558454b194026eb1f09c293b9036ae9b159e74e73ab6cf9"
[[package]]
name = "cc"
version = "1.2.14"
version = "1.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c3d1b2e905a3a7b00a6141adb0e4c0bb941d11caf55349d863942a1cc44e3c9"
checksum = "c736e259eea577f443d5c86c304f9f4ae0295c43f3ba05c21f1d66b5f06001af"
dependencies = [
"shlex",
]
@ -320,6 +320,17 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c"
[[package]]
name = "cfb"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d38f2da7a0a2c4ccf0065be06397cc26a81f4e528be095826eee9d4adbb8c60f"
dependencies = [
"byteorder",
"fnv",
"uuid",
]
[[package]]
name = "cfg-if"
version = "1.0.0"
@ -353,9 +364,9 @@ dependencies = [
[[package]]
name = "clap"
version = "4.5.30"
version = "4.5.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "92b7b18d71fad5313a1e320fa9897994228ce274b60faa4d694fe0ea89cd9e6d"
checksum = "027bb0d98429ae334a8698531da7077bdf906419543a35a55c2cb1b66437d767"
dependencies = [
"clap_builder",
"clap_derive",
@ -363,9 +374,9 @@ dependencies = [
[[package]]
name = "clap_builder"
version = "4.5.30"
version = "4.5.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a35db2071778a7344791a4fb4f95308b5673d219dee3ae348b86642574ecc90c"
checksum = "5589e0cba072e0f3d23791efac0fd8627b49c829c196a492e88168e6a669d863"
dependencies = [
"anstream",
"anstyle",
@ -627,9 +638,9 @@ checksum = "feeef44e73baff3a26d371801df019877a9866a8c493d315ab00177843314f35"
[[package]]
name = "either"
version = "1.13.0"
version = "1.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
checksum = "b7914353092ddf589ad78f25c5c1c21b7f80b0ff8621e7c814c3485b5306da9d"
dependencies = [
"serde",
]
@ -950,9 +961,9 @@ dependencies = [
[[package]]
name = "h2"
version = "0.4.7"
version = "0.4.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ccae279728d634d083c00f6099cb58f01cc99c145b84b8be2f6c74618d79922e"
checksum = "5017294ff4bb30944501348f6f8e42e6ad28f42c8bbef7a74029aff064a4e3c2"
dependencies = [
"atomic-waker",
"bytes",
@ -1185,7 +1196,7 @@ dependencies = [
"bytes",
"futures-channel",
"futures-util",
"h2 0.4.7",
"h2 0.4.8",
"http 1.2.0",
"http-body 1.0.1",
"httparse",
@ -1445,6 +1456,15 @@ dependencies = [
"serde",
]
[[package]]
name = "infer"
version = "0.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a588916bfdfd92e71cacef98a63d9b1f0d74d6599980d11894290e7ddefffcf7"
dependencies = [
"cfb",
]
[[package]]
name = "inlinable_string"
version = "0.1.15"
@ -1453,9 +1473,9 @@ checksum = "c8fae54786f62fb2918dcfae3d568594e50eb9b5c25bf04371af6fe7516452fb"
[[package]]
name = "inout"
version = "0.1.3"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a0c10553d664a4d0bcff9f4215d0aac67a639cc68ef660840afe309b807bc9f5"
checksum = "879f10e63c20629ecabbb64a8010319738c66a5cd0c29b02d63d272b03751d01"
dependencies = [
"generic-array",
]
@ -1530,9 +1550,9 @@ dependencies = [
[[package]]
name = "libc"
version = "0.2.169"
version = "0.2.170"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a"
checksum = "875b3680cb2f8f71bdcf9a30f38d48282f5d3c95cbf9b3fa57269bb5d5c06828"
[[package]]
name = "libm"
@ -1574,9 +1594,9 @@ dependencies = [
[[package]]
name = "log"
version = "0.4.25"
version = "0.4.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04cbf5b083de1c7e0222a7a51dbfdba1cbe1c6ab0b15e29fff3f6c077fd9cd9f"
checksum = "30bde2b3dc3671ae49d8e2e9f044c7c005836e7a023ee57cffa25ab82764bb9e"
[[package]]
name = "loom"
@ -1680,9 +1700,9 @@ checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a"
[[package]]
name = "miniz_oxide"
version = "0.8.4"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b3b1c9bd4fe1f0f8b387f6eb9eb3b4a1aa26185e5750efb9140301703f62cd1b"
checksum = "8e3e04debbb59698c15bacbb6d93584a8c0ca9cc3213cb423d31f760d8843ce5"
dependencies = [
"adler2",
]
@ -1719,9 +1739,9 @@ dependencies = [
[[package]]
name = "native-tls"
version = "0.2.13"
version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0dab59f8e050d5df8e4dd87d9206fb6f65a483e20ac9fda365ade4fab353196c"
checksum = "87de3442987e9dbec73158d5c715e7ad9072fda936bb03d19d7fa10e00520f0e"
dependencies = [
"libc",
"log",
@ -2175,9 +2195,9 @@ dependencies = [
[[package]]
name = "redox_syscall"
version = "0.5.8"
version = "0.5.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "03a862b389f93e68874fbf580b9de08dd02facb9a788ebadaf4a3fd33cf58834"
checksum = "82b568323e98e49e2a0899dcee453dd679fae22d69adf9b11dd508d1549b7e2f"
dependencies = [
"bitflags 2.8.0",
]
@ -2297,7 +2317,7 @@ dependencies = [
"encoding_rs",
"futures-core",
"futures-util",
"h2 0.4.7",
"h2 0.4.8",
"http 1.2.0",
"http-body 1.0.1",
"http-body-util",
@ -2347,9 +2367,9 @@ dependencies = [
[[package]]
name = "ring"
version = "0.17.9"
version = "0.17.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e75ec5e92c4d8aede845126adc388046234541629e76029599ed35a003c7ed24"
checksum = "da5349ae27d3887ca812fb375b45a4fbb36d8d12d2df394968cd86e35683fe73"
dependencies = [
"cc",
"cfg-if",
@ -2540,7 +2560,7 @@ version = "0.102.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9"
dependencies = [
"ring 0.17.9",
"ring 0.17.11",
"rustls-pki-types",
"untrusted 0.9.0",
]
@ -2637,18 +2657,18 @@ dependencies = [
[[package]]
name = "serde"
version = "1.0.217"
version = "1.0.218"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70"
checksum = "e8dfc9d19bdbf6d17e22319da49161d5d0108e4188e8b680aef6299eed22df60"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.217"
version = "1.0.218"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0"
checksum = "f09503e191f4e797cb8aac08e9a4a4695c5edf6a2e70e376d961ddd5c969f82b"
dependencies = [
"proc-macro2",
"quote",
@ -2668,9 +2688,9 @@ dependencies = [
[[package]]
name = "serde_json"
version = "1.0.138"
version = "1.0.139"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d434192e7da787e94a6ea7e9670b26a036d0ca41e0b7efb2676dd32bae872949"
checksum = "44f86c3acccc9c65b153fe1b85a3be07fe5515274ec9f0653b4a0875731c72a6"
dependencies = [
"itoa",
"memchr",
@ -3055,9 +3075,9 @@ dependencies = [
[[package]]
name = "string_cache_codegen"
version = "0.5.3"
version = "0.5.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "244292f3441c89febe5b5bdfbb6863aeaf4f64da810ea3050fd927b27b8d92ce"
checksum = "c711928715f1fe0fe509c53b43e993a9a557babc2d0a3567d0a3006f1ac931a0"
dependencies = [
"phf_generator",
"phf_shared",
@ -3558,9 +3578,9 @@ checksum = "5c1cb5db39152898a79168971543b1cb5020dff7fe43c8dc468b0885f5e29df5"
[[package]]
name = "unicode-ident"
version = "1.0.16"
version = "1.0.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a210d160f08b701c8721ba1c726c11662f877ea6b7094007e1ca9a1041945034"
checksum = "00e2473a93778eb0bad35909dff6a10d28e63f792f16ed15e404fca9d5eeedbe"
[[package]]
name = "unicode-normalization"
@ -3641,9 +3661,9 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
[[package]]
name = "uuid"
version = "1.13.2"
version = "1.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8c1f41ffb7cf259f1ecc2876861a17e7142e63ead296f671f81f6ae85903e0d6"
checksum = "93d59ca99a559661b96bf898d8fce28ed87935fd2bea9f05983c1464dd6c71b1"
dependencies = [
"getrandom 0.3.1",
"serde",
@ -3798,6 +3818,7 @@ dependencies = [
"env_logger",
"futures",
"html2md",
"infer",
"log",
"maud",
"ollama-rs",
@ -4054,9 +4075,9 @@ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
[[package]]
name = "winnow"
version = "0.7.2"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "59690dea168f2198d1a3b0cac23b8063efcd11012f10ae4698f284808c8ef603"
checksum = "0e7f4ea97f6f78012141bcdb6a216b2609f0979ada50b20ca5b52dde2eac2bb1"
dependencies = [
"memchr",
]

View file

@ -25,3 +25,4 @@ html2md = "0.2.14"
clap = { version = "4.5.23", features = ["cargo", "derive"] }
toml = "0.8.19"
url-escape = "0.1.1"
infer = "0.19.0"

View file

@ -9,11 +9,11 @@ web_archive/
├─ domain.com/
│ ├─ sub/
│ │ ├─ path/
│ │ │ ├─ index_YYYY_MM_DD.html
│ │ │ ├─ index_YYYY_MM_DD
├─ sub.domain.com/
```
Every document of this web archive can then be found at `archive/domain/paths.../index_YYYY_MM_DD.html`.
Every document of this web archive can then be found at `archive/domain/paths.../index_YYYY_MM_DD`.
## Usage
webarc provides a CLI tool to work with the archive structure.

View file

@ -140,7 +140,8 @@ impl Embedding for Document {
ver.as_ref().unwrap_or(&latest)
);
let content_html = self.render_local(ver.clone(), shell).await?;
let content_html = self.render_local(ver.clone(), shell).await.ok()?;
let content_html = String::from_utf8_lossy(&content_html);
let content = remove_data_urls(&html2md::parse_html(&content_html));
let mut embeddings = Vec::new();

View file

@ -3,9 +3,9 @@ use std::{io::Read, path::PathBuf};
use based::{request::RequestContext, ui::components::prelude::Shell};
use maud::html;
use crate::{blacklist::check_blacklist, conf::get_config, render_page};
use crate::{blacklist::check_blacklist, render_page};
use super::{internalize_urls, read_dir};
use super::read_dir;
/// Represents a document within a domain
pub struct Document {
@ -52,43 +52,39 @@ impl Document {
///
/// # Returns
/// An `Option` containing the rendered content as a string, or `None` if nothing could be rendered.
pub async fn render_local(&self, version: Option<String>, shell: &Shell) -> Option<String> {
pub async fn render_local(
&self,
version: Option<String>,
shell: &Shell,
) -> Result<Vec<u8>, String> {
if check_blacklist(&self.domain) {
let content = html! {
h3 { "This site is blacklisted" };
};
return Some(
render_page(content, RequestContext::default(), shell)
.await
.1
.1,
);
return Err(render_page(content, RequestContext::default(), shell)
.await
.1
.1);
}
let mut file_path = self.doc_dir();
let latest_version = if let Some(version) = version {
format!("index_{version}.html")
format!("index_{version}")
} else {
let versions = self.versions();
let version = versions.first().cloned()?;
format!("index_{version}.html")
let version = versions.first().cloned().ok_or(String::new())?;
format!("index_{version}")
};
file_path = file_path.join(latest_version);
let mut buf = Vec::new();
std::fs::File::open(file_path)
.ok()?
.map_err(|_| String::new())?
.read_to_end(&mut buf)
.unwrap();
let content = String::from_utf8_lossy(&buf);
if get_config().ROUTE_INTERNAL {
Some(internalize_urls(&content, &self.domain))
} else {
Some(content.to_string())
}
Ok(buf)
}
/// Determines the directory where the document is stored.
@ -127,12 +123,8 @@ impl Document {
let mut res: Vec<String> = read_dir(&self.doc_dir())
.into_iter()
.filter_map(|x| {
if x.starts_with("index_") && x.ends_with(".html") {
return Some(
x.trim_start_matches("index_")
.trim_end_matches(".html")
.to_string(),
);
if x.starts_with("index_") {
return Some(x.trim_start_matches("index_").to_string());
}
None

View file

@ -98,7 +98,7 @@ impl Domain {
.filter(|x| !x.is_empty())
.collect::<Vec<&str>>()
.join("/");
if entry.starts_with("index_") && entry.ends_with(".html") {
if entry.starts_with("index_") {
is_doc = true;
continue;
}

View file

@ -32,7 +32,7 @@ pub fn read_dir(dir: &PathBuf) -> Vec<String> {
}
/// Rewrite all URLs in `input` to the format `/s/<domain>/<path..>`
fn internalize_urls(input: &str, base: &str) -> String {
pub fn internalize_urls(input: &str, base: &str) -> String {
// todo : fix regex, domains without path are not captured
let url_pattern = r#"(\ |"|')(?:(<?)(https?:\/\/([a-zA-Z0-9.-]+))?(\/[\w./-]*))"#;
let re = regex::Regex::new(url_pattern).unwrap();
@ -172,7 +172,7 @@ impl WebsiteArchive {
std::fs::create_dir_all(&folder_name).unwrap();
let timestamp = chrono::Utc::now().format("%Y-%m-%d").to_string();
let filename = folder_name.join(format!("index_{timestamp}.html"));
let filename = folder_name.join(format!("index_{timestamp}"));
log::info!("Archiving {url} to {}", filename.to_str().unwrap());
@ -238,10 +238,17 @@ fn run_command(cmd: &[&str]) {
let child = cmd_setup.spawn().unwrap();
let status = child.wait_with_output().unwrap();
assert!(status.status.success());
if !status.status.success() {
log::warn!(
"Command {cmd:?} exited with code {}",
status.status.code().unwrap_or_default()
)
}
}
pub async fn index_archive_db(arc: &WebsiteArchive) {
// TODO : more index attrs size,mime
log::info!("Indexing archive");
for dom in arc.domains() {

View file

@ -158,16 +158,17 @@ async fn main() {
let content = doc.render_local(Some(ver), &shell).await;
if content.is_none() {
if content.is_err() {
println!("No document found");
std::process::exit(1);
}
if md {
let markdown = html2md::parse_html(&content.unwrap());
let markdown =
html2md::parse_html(&String::from_utf8_lossy(&content.unwrap()));
println!("{markdown}");
} else {
println!("{}", content.unwrap());
println!("{}", String::from_utf8_lossy(&content.unwrap()));
}
}
Some((&_, _)) => {}

View file

@ -18,7 +18,7 @@ pub mod component;
use component::*;
use serde_json::json;
use webarc::archive::{Document, DocumentIndex};
use webarc::archive::{internalize_urls, Document, DocumentIndex};
use webarc::{
ai::{generate_embedding, remove_data_urls, EmbedStore, SearchResult},
archive::{extract_domains, WebsiteArchive},
@ -26,6 +26,8 @@ use webarc::{
render_page,
};
// TODO : PDF view
const SEARCH_BAR_STYLE: &str = "w-full px-4 mb-4 py-2 text-white bg-black border-2 border-neon-blue placeholder-neon-blue focus:ring-2 focus:ring-neon-pink focus:outline-none font-mono text-lg";
#[allow(non_snake_case)]
@ -226,12 +228,12 @@ pub async fn domain_info_route(
let (path_entries, is_doc) = domain.paths(paths.to_str().unwrap());
let path_seperations: Vec<&str> = paths.to_str().unwrap().split('/').collect();
let domains = extract_domains(
let domains = extract_domains(&String::from_utf8_lossy(
&document
.render_local(None, &shell)
.await
.unwrap_or_default(),
);
));
let content = html! {
h2 class="text-xl font-bold mb-4 flex items-center w-fit mx-auto" {
@ -307,9 +309,12 @@ pub async fn render_txt_website(
) -> Option<String> {
let document = arc.get_domain(domain).path(path.to_str().unwrap());
let mut content = document
let content = document
.render_local(time.map(|time| time.to_string()), &shell)
.await?;
.await
.ok()?;
let mut content = String::from_utf8_lossy(&content).to_string();
if no_data_urls.is_some() {
content = remove_data_urls(&content);
@ -376,11 +381,23 @@ pub async fn redownload(
arc.archive_url(&format!("https://{domain}/{}", path.to_str()))
.await;
let content = document.render_local(None, &shell).await?;
let mut content = document.render_local(None, &shell).await.ok()?;
let mime = infer::get(&content)
.map(|x| x.mime_type())
.unwrap_or("text/html");
if mime == "text/html" {
if get_config().ROUTE_INTERNAL {
content = internalize_urls(&String::from_utf8_lossy(&content), &domain)
.as_bytes()
.to_vec();
}
}
return Some(DataResponse::new(
content.as_bytes().to_vec(),
"text/html".to_string(),
content.to_vec(),
mime.to_string(),
Some(60 * 60 * 24),
));
}
@ -414,10 +431,22 @@ pub async fn render_website(
// TODO : keep n versions
if let Some(content) = content {
if let Ok(mut content) = content {
let mime = infer::get(&content)
.map(|x| x.mime_type())
.unwrap_or("text/html");
if mime == "text/html" {
if get_config().ROUTE_INTERNAL {
content = internalize_urls(&String::from_utf8_lossy(&content), &domain)
.as_bytes()
.to_vec();
}
}
return Some(DataResponse::new(
content.as_bytes().to_vec(),
"text/html".to_string(),
content,
mime.to_string(),
Some(60 * 60 * 24),
));
} else if get_config().DOWNLOAD_ON_DEMAND && time.is_none() {