diff --git a/Cargo.lock b/Cargo.lock
index d101835..6f66167 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -202,7 +202,7 @@ checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"
 [[package]]
 name = "based"
 version = "0.1.0"
-source = "git+https://git.hydrar.de/jmarya/based#696b34f2f17ef2d86f0bc77993f9b0b8b652c0f6"
+source = "git+https://git.hydrar.de/jmarya/based#4688968a32a6812d4751a013ad1b605232fe12f8"
 dependencies = [
  "bcrypt",
  "chrono",
@@ -1483,6 +1483,15 @@ version = "1.70.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
 
+[[package]]
+name = "itertools"
+version = "0.14.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285"
+dependencies = [
+ "either",
+]
+
 [[package]]
 name = "itoa"
 version = "1.0.14"
@@ -3792,12 +3801,14 @@ dependencies = [
 name = "webarc"
 version = "0.1.0"
 dependencies = [
+ "base64 0.22.1",
  "based",
  "chrono",
  "clap",
  "env_logger",
  "futures",
  "html2md",
+ "itertools",
  "log",
  "maud",
  "ollama-rs",
@@ -3807,6 +3818,7 @@ dependencies = [
  "rocket",
  "serde",
  "serde_json",
+ "sha2",
  "sqlx",
  "tokio",
  "toml",
diff --git a/Cargo.toml b/Cargo.toml
index 1493c06..e08aee1 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -25,3 +25,6 @@ html2md = "0.2.14"
 clap = { version = "4.5.23", features = ["cargo", "derive"] }
 toml = "0.8.19"
 url-escape = "0.1.1"
+base64 = "0.22.1"
+sha2 = "0.10.8"
+itertools = "0.14.0"
diff --git a/config.toml b/config.toml
index de3dc5b..871059d 100644
--- a/config.toml
+++ b/config.toml
@@ -56,9 +56,9 @@ no_frames = false
 # User Agent
 user_agent = "Safari"
 
-[ai]
+#[ai]
 # Ollama URL (Enables vector search)
-OLLAMA_URL="127.0.0.1:11434"
+#OLLAMA_URL="127.0.0.1:11434"
 
 # --- Website Config
diff --git a/migrations/0004_fragments.sql b/migrations/0004_fragments.sql
new file mode 100644
index 0000000..15e6064
--- /dev/null
+++ b/migrations/0004_fragments.sql
@@ -0,0 +1,13 @@
+CREATE TABLE IF NOT EXISTS fragments (
+    id TEXT NOT NULL PRIMARY KEY,
+    mime TEXT NOT NULL,
+    blob BYTEA NOT NULL
+);
+
+CREATE TABLE IF NOT EXISTS document_fragments (
+    domain TEXT NOT NULL,
+    path TEXT NOT NULL,
+    version DATE NOT NULL,
+    fragment TEXT NOT NULL,
+    PRIMARY KEY (domain, path, version, fragment)
+);
diff --git a/src/archive/fragment.rs b/src/archive/fragment.rs
new file mode 100644
index 0000000..c7589c4
--- /dev/null
+++ b/src/archive/fragment.rs
@@ -0,0 +1,37 @@
+use based::get_pg;
+
+pub async fn get_fragment(hash: &str) -> Option<(Vec<u8>, String)> {
+    sqlx::query_as("SELECT blob, mime FROM fragments WHERE id = $1")
+        .bind(hash)
+        .fetch_optional(get_pg!())
+        .await
+        .unwrap()
+}
+
+pub async fn get_random_fragment_id() -> String {
+    let random_id: (String,) = sqlx::query_as("SELECT id FROM fragments ORDER BY RANDOM() LIMIT 1")
+        .fetch_one(get_pg!())
+        .await
+        .unwrap();
+
+    random_id.0
+}
+
+pub async fn domain_has_fragments(domain: &str) -> bool {
+    let exists: (bool,) =
+        sqlx::query_as("SELECT EXISTS(SELECT 1 FROM document_fragments WHERE domain = $1)")
+            .bind(domain)
+            .fetch_one(get_pg!())
+            .await
+            .unwrap();
+
+    exists.0
+}
+
+pub async fn get_fragments_of_domain(domain: &str) -> Vec<(String, String)> {
+    let res: Vec<(String, String)> = sqlx::query_as("SELECT df.fragment, f.mime FROM document_fragments df JOIN fragments f ON df.fragment = f.id WHERE df.domain = $1")
+        .bind(domain)
+        .fetch_all(get_pg!()).await.unwrap();
+
+    res
+}
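Together, the migration and `fragment.rs` form a content-addressed blob store: `fragments.id` is the hex SHA-256 digest of `blob`, so a data URL that appears on many pages is stored exactly once, while `document_fragments` links each (domain, path, version) to the hashes it contains. A minimal sketch of the write path, mirroring the `INSERT` that `index_document` performs further down in this diff (the `store_fragment` helper name is hypothetical, not part of the patch):

```rust
use based::get_pg;

use crate::sha256_hash;

/// Hypothetical helper: persist a blob under its SHA-256 hex digest.
/// Re-inserting identical bytes is a no-op via ON CONFLICT DO NOTHING,
/// which is what gives the store its deduplication.
pub async fn store_fragment(mime: &str, data: Vec<u8>) -> String {
    let hash = sha256_hash(&data);
    sqlx::query("INSERT INTO fragments (id, mime, blob) VALUES ($1, $2, $3) ON CONFLICT DO NOTHING")
        .bind(&hash)
        .bind(mime)
        .bind(data)
        .execute(get_pg!())
        .await
        .unwrap();
    hash
}
```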
diff --git a/src/archive/mod.rs b/src/archive/mod.rs
index 52b0f35..d0d3be9 100644
--- a/src/archive/mod.rs
+++ b/src/archive/mod.rs
@@ -6,12 +6,15 @@ use std::{
 use crate::{
     blacklist::{check_blacklist, check_blacklist_path},
     conf::get_config,
+    extract_data_urls,
     favicon::download_fav_for,
-    get_mime_type,
+    get_mime_type, sha256_hash,
 };
 
 mod document;
 mod domain;
+mod fragment;
+
 use based::{
     get_pg,
     ui::{components::prelude::Shell, prelude::Nothing},
@@ -19,6 +22,8 @@ use based::{
 use chrono::NaiveDate;
 pub use document::Document;
 pub use domain::*;
+pub use fragment::*;
+use sqlx::prelude::FromRow;
 
 /// Read directory entries into `Vec<String>`
 pub fn read_dir(dir: &PathBuf) -> Vec<String> {
@@ -251,8 +256,6 @@ fn run_command(cmd: &[&str]) {
 }
 
 pub async fn index_archive_db(arc: &WebsiteArchive) {
-    // TODO : more index attrs size,mime
-
     log::info!("Indexing archive");
 
     for dom in arc.domains() {
@@ -304,7 +307,27 @@ pub async fn index_document(doc: &Document) {
 
     if mime.as_str() == "text/html" {
         // TODO : domain links index
-        // TODO : data fragments
+
+        let mut hashes = Vec::new();
+
+        for (mime, data) in extract_data_urls(&String::from_utf8_lossy(&content)) {
+            let hash = sha256_hash(&data);
+            hashes.push(hash.clone());
+            sqlx::query("INSERT INTO fragments (id, mime, blob) VALUES ($1, $2, $3) ON CONFLICT DO NOTHING")
+                .bind(&hash)
+                .bind(&mime)
+                .bind(data)
+                .execute(get_pg!()).await.unwrap();
+        }
+
+        for hash in hashes {
+            sqlx::query("INSERT INTO document_fragments (domain, path, version, fragment) VALUES ($1, $2, $3, $4) ON CONFLICT DO NOTHING")
+                .bind(&doc.domain)
+                .bind(&doc.path)
+                .bind(chrono::NaiveDate::parse_from_str(version_str, "%Y-%m-%d").unwrap())
+                .bind(&hash)
+                .execute(get_pg!()).await.unwrap();
+        }
     }
 
     if let Ok(version) = chrono::NaiveDate::parse_from_str(version_str, "%Y-%m-%d") {
@@ -328,9 +351,25 @@ pub async fn index_document(doc: &Document) {
     }
 }
 
-pub struct DocumentIndex {}
+#[derive(Debug, Clone, FromRow)]
+pub struct DocumentIndex {
+    pub domain: String,
+    pub path: String,
+    pub version: chrono::NaiveDate,
+    pub size: i64,
+    pub mime: String,
+}
 
 impl DocumentIndex {
+    pub fn url(&self) -> String {
+        format!(
+            "/s/{}/{}?time={}",
+            self.domain,
+            self.path,
+            self.version
+        )
+    }
+
     pub async fn get_documents_of_day(
         day: NaiveDate,
         domain: Option<&str>,
@@ -361,4 +400,12 @@ impl DocumentIndex {
 
         ret
     }
+
+    pub async fn get_documents_of_other_mime(domain: &str) -> Vec<DocumentIndex> {
+        sqlx::query_as("SELECT * FROM document_index WHERE mime != 'text/html' AND domain = $1")
+            .bind(domain)
+            .fetch_all(get_pg!())
+            .await
+            .unwrap()
+    }
 }
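`DocumentIndex` is now a real row type, and `url()` rebuilds the archive viewer link from its columns. A hedged usage sketch (the domain and output values are illustrative; it assumes the pool behind `get_pg!()` has been initialized, as elsewhere in the crate):

```rust
use webarc::archive::DocumentIndex;

// Illustrative only: list a domain's non-HTML documents with their viewer URLs.
async fn print_media(domain: &str) {
    for doc in DocumentIndex::get_documents_of_other_mime(domain).await {
        // e.g. "/s/example.com/report.pdf?time=2025-01-03 application/pdf (13402 bytes)"
        println!("{} {} ({} bytes)", doc.url(), doc.mime, doc.size);
    }
}
```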
diff --git a/src/lib.rs b/src/lib.rs
index 912171e..0f94acf 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -10,6 +10,8 @@ pub mod blacklist;
 pub mod conf;
 pub mod favicon;
 
+use base64::prelude::*;
+use sha2::{Digest, Sha256};
 use std::io::Write;
 use std::process::{Command, Stdio};
 
@@ -30,6 +32,33 @@ pub fn get_mime_type(content: &[u8]) -> std::io::Result<String> {
     Ok(String::from_utf8_lossy(&output.stdout).trim().to_string())
 }
 
+pub fn extract_data_urls(input: &str) -> Vec<(String, Vec<u8>)> {
+    let data_url_regex =
+        regex::Regex::new(r"data:([a-zA-Z0-9/+.-]+);base64,([a-zA-Z0-9+/=]+)").unwrap();
+
+    let mut results = Vec::new();
+
+    for cap in data_url_regex.captures_iter(input) {
+        if let (Some(mime), Some(encoded)) = (cap.get(1), cap.get(2)) {
+            if let Ok(decoded) = BASE64_STANDARD.decode(encoded.as_str()) {
+                results.push((mime.as_str().to_string(), decoded));
+            }
+        }
+    }
+
+    results
+}
+
+pub fn from_base64(input: &str) -> Vec<u8> {
+    BASE64_STANDARD.decode(input).unwrap()
+}
+
+pub fn sha256_hash(input: &[u8]) -> String {
+    let mut hasher = Sha256::new();
+    hasher.update(input);
+    format!("{:x}", hasher.finalize())
+}
+
 pub async fn render_page(
     content: PreEscaped<String>,
     ctx: RequestContext,
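For reference, a round-trip through the two new helpers: `extract_data_urls` captures the MIME type and decoded bytes of every base64 data URL in a string, and `sha256_hash` turns those bytes into the fragment id. A small self-contained check (the input string is made up; the digest is the well-known SHA-256 of "hello"):

```rust
use webarc::{extract_data_urls, sha256_hash};

fn main() {
    // "aGVsbG8=" is base64 for "hello".
    let html = r#"<img src="data:text/plain;base64,aGVsbG8=">"#;
    let fragments = extract_data_urls(html);
    assert_eq!(fragments, vec![("text/plain".to_string(), b"hello".to_vec())]);

    // The fragment id is the hex-encoded SHA-256 of the decoded bytes.
    assert_eq!(
        sha256_hash(&fragments[0].1),
        "2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824"
    );
}
```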
diff --git a/src/main.rs b/src/main.rs
index b6f9b20..220c940 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -58,7 +58,11 @@ async fn main() {
             pages::favicon_route,
             pages::vector_search,
             pages::render_txt_website,
-            pages::timeline_route
+            pages::timeline_route,
+            pages::fragment_get,
+            pages::mime_overview,
+            pages::fragments_overview,
+            pages::fragments_domain_overview
         ],
     )
     .manage(arc)
@@ -187,7 +191,42 @@ pub fn get_shell() -> Shell {
         Background(MinHeight(ScreenValue::screen, Text("").white())).color(Zinc::_950),
     )
     .use_ui()
-    .with_navbar(NavBar("Web Archive"))
+    .with_navbar(
+        NavBar("Web Archive").menu(
+            Div()
+                .vanish()
+                .push(
+                    Link(
+                        "/",
+                        Row(vec![MaterialIcon("language"), Text("Websites").render()])
+                            .full_center()
+                            .gap(ScreenValue::_2),
+                    )
+                    .use_htmx(),
+                )
+                .push(
+                    Link(
+                        "/mime",
+                        Row(vec![
+                            MaterialIcon("description"),
+                            Text("Media Documents").render(),
+                        ])
+                        .full_center()
+                        .gap(ScreenValue::_2),
+                    )
+                    .use_htmx(),
+                )
+                .push(
+                    Link(
+                        "/fragments",
+                        Row(vec![MaterialIcon("database"), Text("Fragments").render()])
+                            .full_center()
+                            .gap(ScreenValue::_2),
+                    )
+                    .use_htmx(),
+                ),
+        ),
+    )
 }
 
 // TODO : archive cleanup code
diff --git a/src/pages/mod.rs b/src/pages/mod.rs
index d5a73cb..8422291 100644
--- a/src/pages/mod.rs
+++ b/src/pages/mod.rs
@@ -1,3 +1,4 @@
+use std::vec;
 use std::{collections::HashMap, io::Read, path::PathBuf, sync::Arc};
 
 use based::ui::components::prelude::*;
@@ -11,14 +12,19 @@ use based::{
     ui::primitives::flex::Column,
 };
 use chrono::NaiveDate;
+use itertools::Itertools;
 use maud::{html, PreEscaped};
+use rocket::response::Redirect;
 use rocket::{get, request::FromSegments, State};
 
 pub mod component;
 use component::*;
 use serde_json::json;
 
-use webarc::archive::{internalize_urls, Document, DocumentIndex};
+use webarc::archive::{
+    domain_has_fragments, get_fragment, get_fragments_of_domain, get_random_fragment_id,
+    internalize_urls, Document, DocumentIndex,
+};
 use webarc::get_mime_type;
 use webarc::{
     ai::{generate_embedding, remove_data_urls, EmbedStore, SearchResult},
@@ -27,8 +33,6 @@ use webarc::{
     render_page,
 };
 
-// TODO : PDF view
-
 const SEARCH_BAR_STYLE: &str = "w-full px-4 mb-4 py-2 text-white bg-black border-2 border-neon-blue placeholder-neon-blue focus:ring-2 focus:ring-neon-pink focus:outline-none font-mono text-lg";
 
 #[allow(non_snake_case)]
@@ -46,7 +50,12 @@ fn time_ago(naive_datetime: &chrono::NaiveDate) -> String {
     let duration = now.signed_duration_since(*naive_datetime);
 
     if duration.num_seconds() < 60 {
-        format!("{} seconds ago", duration.num_seconds())
+        let s = duration.num_seconds();
+        if s == 0 {
+            "today".to_string()
+        } else {
+            format!("{} seconds ago", s)
+        }
     } else if duration.num_minutes() < 60 {
         format!("{} minutes ago", duration.num_minutes())
     } else if duration.num_hours() < 24 {
@@ -236,6 +245,8 @@ pub async fn domain_info_route(
             .unwrap_or_default(),
     ));
 
+    // TODO : refactor
+
     let content = html! {
         h2 class="text-xl font-bold mb-4 flex items-center w-fit mx-auto" {
             img class="p-2" src=(format!("/favicon/{}", &domain.name)) {};
@@ -430,6 +441,8 @@ pub async fn render_website(
     let mime = get_mime_type(&content).unwrap_or("text/html".to_string());
 
     if mime == "text/html" {
+        // TODO : fragments url rewrite
+
         if get_config().ROUTE_INTERNAL {
             content = internalize_urls(&String::from_utf8_lossy(&content), &domain)
                 .as_bytes()
@@ -460,6 +473,18 @@ pub fn gen_search_element(x: &SearchResult) -> PreEscaped<String> {
     }
 }
 
+#[get("/f/<hash>")]
+pub async fn fragment_get(hash: &str) -> Option<DataResponse> {
+    if hash == "random" {
+        let random_hash = get_random_fragment_id().await;
+        let (data, mime) = get_fragment(&random_hash).await?;
+        return Some(DataResponse::new(data, mime, None));
+    }
+
+    let (data, mime) = get_fragment(hash).await?;
+    Some(DataResponse::new(data, mime, Some(60 * 60 * 24)))
+}
+
 #[get("/vector_search?<query>&<page>")]
 pub async fn vector_search(
     query: Option<&str>,
@@ -510,3 +535,217 @@ pub async fn vector_search(
     let content = search.build("", html! {});
     Some(render_page(content, ctx, &shell).await)
 }
+
+// TODO : better mime awareness in ui
+
+#[allow(non_snake_case)]
+pub fn MIMEIcon(mime: &str) -> PreEscaped<String> {
+    if mime.starts_with("application") {
+        return MaterialIcon("description");
+    } else if mime.starts_with("audio") {
+        return MaterialIcon("play_circle");
+    } else if mime.starts_with("image") {
+        return MaterialIcon("photo");
+    } else if mime.starts_with("text") {
+        return MaterialIcon("description");
+    } else if mime.starts_with("video") {
+        return MaterialIcon("movie");
+    }
+
+    MaterialIcon("description")
+}
+
+pub async fn mime_docs_of_domain(domain: &str) -> PreEscaped<String> {
+    let other_mimes = DocumentIndex::get_documents_of_other_mime(domain).await;
+    let by_mime = other_mimes
+        .into_iter()
+        .filter(|x| !x.mime.is_empty())
+        .into_group_map_by(|x| x.mime.clone());
+
+    if by_mime.is_empty() {
+        return Nothing();
+    }
+
+    let first = by_mime.keys().next().unwrap();
+
+    let mut t = Tabs().active(&format!("domain-{domain}-{first}"));
+
+    for (mime, docs) in by_mime {
+        t = t.add_tab(
+            &format!("domain-{}-{mime}", domain),
+            Row(vec![MIMEIcon(&mime), Text(&mime).render()])
+                .items_center()
+                .gap(ScreenValue::_2),
+            UnorderedList().push_for_each(&docs, |doc: &_| {
+                Link(
+                    &doc.url(),
+                    Context(
+                        Hover(Background(Nothing()).color(Gray::_800)).on(Background(
+                            Rounded(
+                                Padding(
+                                    Margin(
+                                        Column(vec![
+                                            Text(&doc.domain),
+                                            Text(&doc.path),
+                                            Text(&doc.version.to_string()),
+                                        ])
+                                        .gap(ScreenValue::_2),
+                                    )
+                                    .all(ScreenValue::_2),
+                                )
+                                .all(ScreenValue::_4),
+                            )
+                            .size(Size::Large),
+                        )
+                        .color(Gray::_700)),
+                    ),
+                )
+            }),
+        );
+    }
+
+    Column(vec![
+        Margin(
+            Row(vec![favicon(domain), Text(domain)._3xl().bold().render()]).gap(ScreenValue::_2),
+        )
+        .x(ScreenValue::_2)
+        .top(ScreenValue::_4)
+        .render(),
+        t.render(),
+    ])
+    .gap(ScreenValue::_2)
+    .render()
+}
+
+#[get("/mime")]
+pub async fn mime_overview(
+    ctx: RequestContext,
+    arc: &State<WebsiteArchive>,
+    shell: &State<Shell>,
+) -> StringResponse {
+    let domains = arc.domains();
+    let mut d_html = Vec::new();
+
+    for d in domains {
+        d_html.push(mime_docs_of_domain(&d).await);
+    }
+
+    page!(
+        shell,
+        ctx,
+        "Media Items",
+        Margin(
+            Div()
+                .vanish()
+                .push(Text("Media Items")._2xl().bold())
+                .push_for_each(&d_html, |x: &_| x.render())
+        )
+        .all(ScreenValue::_4)
+    )
+}
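`mime_docs_of_domain` relies on itertools' `into_group_map_by`, which drains an iterator into a `HashMap<K, Vec<T>>` keyed by the closure's result; here that yields one tab per MIME type. Its behavior in isolation:

```rust
use std::collections::HashMap;

use itertools::Itertools;

fn main() {
    let mimes = vec!["image/png", "image/gif", "text/css"];

    // Group by the top-level MIME type; values keep iteration order.
    let by_type: HashMap<String, Vec<&str>> = mimes
        .into_iter()
        .into_group_map_by(|m| m.split('/').next().unwrap().to_string());

    assert_eq!(by_type["image"], vec!["image/png", "image/gif"]);
    assert_eq!(by_type["text"], vec!["text/css"]);
}
```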
+
+#[get("/fragments/<domain>")]
+pub async fn fragments_domain_overview(
+    domain: &str,
+    ctx: RequestContext,
+    arc: &State<WebsiteArchive>,
+    shell: &State<Shell>,
+) -> StringResponse {
+    let fragments = get_fragments_of_domain(domain).await;
+
+    let d_html: Vec<_> = fragments
+        .into_iter()
+        .map(|(fragment, mime)| {
+            if mime.starts_with("image") {
+                return Card(Image(&format!("/f/{fragment}")).height(128).width(128));
+            }
+
+            Card(
+                Tooltip(
+                    Link(&format!("/f/{fragment}"), Text(&fragment)),
+                    Text(&mime),
+                )
+                .white(),
+            )
+        })
+        .collect();
+
+    page!(
+        shell,
+        ctx,
+        "Fragments",
+        Margin(
+            Div()
+                .vanish()
+                .push(
+                    Margin(Text(&format!("Fragments of {domain}"))._2xl().bold())
+                        .bottom(ScreenValue::_4)
+                )
+                .push(
+                    Flex(Div().vanish().push_for_each(&d_html, |x: &_| x.render()))
+                        .wrap(Wrap::Wrap)
+                        .gap(ScreenValue::_3)
+                        .justify(Justify::Around)
+                )
+        )
+        .all(ScreenValue::_4)
+    )
+}
+
+#[get("/fragments")]
+pub async fn fragments_overview(
+    ctx: RequestContext,
+    arc: &State<WebsiteArchive>,
+    shell: &State<Shell>,
+) -> StringResponse {
+    let domains = arc.domains();
+    let mut d_html = Vec::new();
+
+    for x in domains {
+        if domain_has_fragments(&x).await {
+            d_html.push(
+                Link(
+                    &format!("/fragments/{x}"),
+                    Row(vec![favicon(&x), Text(&x).render()])
+                        .items_center()
+                        .render(),
+                )
+                .use_htmx()
+                .render(),
+            );
+        }
+    }
+
+    page!(
+        shell,
+        ctx,
+        "Fragments",
+        Margin(
+            Div()
+                .vanish()
+                .push(
+                    Row(vec![
+                        Margin(Text("Fragments")._2xl().bold())
+                            .bottom(ScreenValue::_4)
+                            .render(),
+                        Margin(Button(
+                            Row(vec![
+                                MaterialIcon("casino"),
+                                Link("/f/random", "Random Fragment").render()
+                            ])
+                            .items_center()
+                            .gap(ScreenValue::_2)
+                        ))
+                        .bottom(ScreenValue::_6)
+                        .render()
+                    ])
+                    .gap(ScreenValue::_4)
+                    .items_center()
+                )
+                .push_for_each(&d_html, |x: &_| x.render())
+        )
+        .all(ScreenValue::_4)
+    )
+}
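If the Rocket instance were factored out of `main` into a builder (this diff keeps it inline), the new fragment route could be exercised with Rocket's local client. A test sketch under that assumption; the `rocket()` builder name and the seeded test data are hypothetical:

```rust
#[cfg(test)]
mod tests {
    use rocket::http::Status;
    use rocket::local::asynchronous::Client;

    #[rocket::async_test]
    async fn fragment_route() {
        // Hypothetical: crate::rocket() returns the configured Rocket<Build>.
        let client = Client::tracked(crate::rocket()).await.expect("valid rocket");

        // An unknown hash makes get_fragment() return None, which Rocket maps to 404.
        let res = client.get("/f/does-not-exist").dispatch().await;
        assert_eq!(res.status(), Status::NotFound);

        // "random" is special-cased and served without a cache lifetime, so
        // repeated requests can yield different fragments. This assumes the
        // test database holds at least one fragment.
        let res = client.get("/f/random").dispatch().await;
        assert_eq!(res.status(), Status::Ok);
    }
}
```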