fragments

JMARyA 2025-02-25 12:39:05 +01:00
parent aba031a047
commit dc4fb4079e
Signed by: jmarya
GPG key ID: 901B2ADDF27C2263
9 changed files with 433 additions and 14 deletions

Cargo.lock (generated)

@@ -202,7 +202,7 @@ checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"
[[package]]
name = "based"
version = "0.1.0"
source = "git+https://git.hydrar.de/jmarya/based#696b34f2f17ef2d86f0bc77993f9b0b8b652c0f6"
source = "git+https://git.hydrar.de/jmarya/based#4688968a32a6812d4751a013ad1b605232fe12f8"
dependencies = [
"bcrypt",
"chrono",
@@ -1483,6 +1483,15 @@ version = "1.70.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
[[package]]
name = "itertools"
version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285"
dependencies = [
"either",
]
[[package]]
name = "itoa"
version = "1.0.14"
@@ -3792,12 +3801,14 @@ dependencies = [
name = "webarc"
version = "0.1.0"
dependencies = [
"base64 0.22.1",
"based",
"chrono",
"clap",
"env_logger",
"futures",
"html2md",
"itertools",
"log",
"maud",
"ollama-rs",
@@ -3807,6 +3818,7 @@ dependencies = [
"rocket",
"serde",
"serde_json",
"sha2",
"sqlx",
"tokio",
"toml",


@@ -25,3 +25,6 @@ html2md = "0.2.14"
clap = { version = "4.5.23", features = ["cargo", "derive"] }
toml = "0.8.19"
url-escape = "0.1.1"
base64 = "0.22.1"
sha2 = "0.10.8"
itertools = "0.14.0"
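Of the three new dependencies, base64 and sha2 drive the fragment extraction and hashing below, while itertools is pulled in for into_group_map_by, which the new MIME overview uses to bucket index rows by MIME type. A minimal, self-contained illustration of that grouping (sample values made up):

use itertools::Itertools;

fn main() {
    let mimes = vec!["image/png", "text/css", "image/gif"];
    // Group full MIME types by their top-level family.
    let by_family = mimes
        .into_iter()
        .into_group_map_by(|m| m.split('/').next().unwrap().to_string());
    assert_eq!(by_family["image"].len(), 2);
}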


@@ -56,9 +56,9 @@ no_frames = false
# User Agent
user_agent = "Safari"
[ai]
#[ai]
# Ollama URL (Enables vector search)
OLLAMA_URL="127.0.0.1:11434"
#OLLAMA_URL="127.0.0.1:11434"
# --- Website Config


@@ -0,0 +1,13 @@
CREATE TABLE IF NOT EXISTS fragments (
id TEXT NOT NULL PRIMARY KEY,
mime TEXT NOT NULL,
blob BYTEA NOT NULL
);
CREATE TABLE IF NOT EXISTS document_fragments (
domain TEXT NOT NULL,
path TEXT NOT NULL,
version DATE NOT NULL,
fragment TEXT NOT NULL,
PRIMARY KEY (domain, path, version, fragment)
);
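The fragments table stores each blob exactly once, content-addressed by its SHA-256 id, while document_fragments ties a blob to a concrete (domain, path, version) capture. A sketch of the per-document lookup this schema supports (the fragments_of_version helper and the pool parameter are illustrative; the commit itself only ships a per-domain query):

use sqlx::PgPool;

// Hypothetical helper: list (fragment id, MIME type) pairs embedded in one
// archived page version. Binding a chrono::NaiveDate to the DATE column
// assumes sqlx's "chrono" feature.
async fn fragments_of_version(
    pool: &PgPool,
    domain: &str,
    path: &str,
    version: chrono::NaiveDate,
) -> sqlx::Result<Vec<(String, String)>> {
    sqlx::query_as(
        "SELECT df.fragment, f.mime
         FROM document_fragments df
         JOIN fragments f ON f.id = df.fragment
         WHERE df.domain = $1 AND df.path = $2 AND df.version = $3",
    )
    .bind(domain)
    .bind(path)
    .bind(version)
    .fetch_all(pool)
    .await
}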

src/archive/fragment.rs (new file)

@@ -0,0 +1,37 @@
use based::get_pg;

/// Look up a fragment by its SHA-256 id, returning its bytes and MIME type.
pub async fn get_fragment(hash: &str) -> Option<(Vec<u8>, String)> {
    sqlx::query_as("SELECT blob, mime FROM fragments WHERE id = $1")
        .bind(hash)
        .fetch_optional(get_pg!())
        .await
        .unwrap()
}

/// Pick a uniformly random fragment id from the store.
pub async fn get_random_fragment_id() -> String {
    let random_id: (String,) =
        sqlx::query_as("SELECT id FROM fragments ORDER BY RANDOM() LIMIT 1")
            .fetch_one(get_pg!())
            .await
            .unwrap();
    random_id.0
}

/// Check whether any archived document of `domain` references a fragment.
pub async fn domain_has_fragments(domain: &str) -> bool {
    let exists: (bool,) =
        sqlx::query_as("SELECT EXISTS(SELECT 1 FROM document_fragments WHERE domain = $1)")
            .bind(domain)
            .fetch_one(get_pg!())
            .await
            .unwrap();
    exists.0
}

/// List every (fragment id, MIME type) pair referenced by `domain`.
pub async fn get_fragments_of_domain(domain: &str) -> Vec<(String, String)> {
    sqlx::query_as("SELECT df.fragment, f.mime FROM document_fragments df JOIN fragments f ON df.fragment = f.id WHERE df.domain = $1")
        .bind(domain)
        .fetch_all(get_pg!())
        .await
        .unwrap()
}
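Together these helpers make the store content-addressed: any caller holding a fragment id can resolve it to bytes plus a MIME type. A hedged usage sketch (the tokio main wrapper is illustrative and assumes the based Postgres pool is initialized):

#[tokio::main]
async fn main() {
    // Ids normally come from document_fragments rows or the UI routes.
    let id = get_random_fragment_id().await;
    if let Some((bytes, mime)) = get_fragment(&id).await {
        println!("fragment {id}: {} bytes of {mime}", bytes.len());
    }
}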


@@ -6,12 +6,15 @@ use std::{
use crate::{
blacklist::{check_blacklist, check_blacklist_path},
conf::get_config,
extract_data_urls,
favicon::download_fav_for,
get_mime_type,
get_mime_type, sha256_hash,
};
mod document;
mod domain;
mod fragment;
use based::{
get_pg,
ui::{components::prelude::Shell, prelude::Nothing},
@@ -19,6 +22,8 @@ use based::{
use chrono::NaiveDate;
pub use document::Document;
pub use domain::*;
pub use fragment::*;
use sqlx::prelude::FromRow;
/// Read directory entries into `Vec<String>`
pub fn read_dir(dir: &PathBuf) -> Vec<String> {
@@ -251,8 +256,6 @@ fn run_command(cmd: &[&str]) {
}
pub async fn index_archive_db(arc: &WebsiteArchive) {
// TODO : more index attrs size,mime
log::info!("Indexing archive");
for dom in arc.domains() {
@@ -304,7 +307,27 @@ pub async fn index_document(doc: &Document) {
if mime.as_str() == "text/html" {
// TODO : domain links index
// TODO : data fragments
// Pull every inline base64 data: URL out of the page and store each blob
// content-addressed by its SHA-256 digest; ON CONFLICT makes re-indexing
// and cross-page duplicates a no-op.
let mut hashes = Vec::new();

for (mime, data) in extract_data_urls(&String::from_utf8_lossy(&content)) {
    let hash = sha256_hash(&data);
    hashes.push(hash.clone());
    sqlx::query("INSERT INTO fragments (id, mime, blob) VALUES ($1, $2, $3) ON CONFLICT DO NOTHING")
        .bind(&hash)
        .bind(&mime)
        .bind(data)
        .execute(get_pg!()).await.unwrap();
}

// Link each stored fragment to this concrete (domain, path, version) capture.
let version = chrono::NaiveDate::parse_from_str(version_str, "%Y-%m-%d").unwrap();
for hash in hashes {
    sqlx::query("INSERT INTO document_fragments (domain, path, version, fragment) VALUES ($1, $2, $3, $4) ON CONFLICT DO NOTHING")
        .bind(&doc.domain)
        .bind(&doc.path)
        .bind(version)
        .bind(&hash)
        .execute(get_pg!()).await.unwrap();
}
}
if let Ok(version) = chrono::NaiveDate::parse_from_str(version_str, "%Y-%m-%d") {
@@ -328,9 +351,25 @@ pub async fn index_document(doc: &Document) {
}
}
pub struct DocumentIndex {}
#[derive(Debug, Clone, FromRow)]
pub struct DocumentIndex {
pub domain: String,
pub path: String,
pub version: chrono::NaiveDate,
pub size: i64,
pub mime: String,
}
impl DocumentIndex {
    /// Link to this capture in the archive's /s/ viewer.
    pub fn url(&self) -> String {
        format!("/s/{}/{}?time={}", self.domain, self.path, self.version)
    }
pub async fn get_documents_of_day(
day: NaiveDate,
domain: Option<&str>,
@@ -361,4 +400,12 @@ impl DocumentIndex {
ret
}
    /// Every capture of `domain` whose MIME type is not text/html.
    pub async fn get_documents_of_other_mime(domain: &str) -> Vec<DocumentIndex> {
sqlx::query_as("SELECT * FROM document_index WHERE mime != 'text/html' AND domain = $1")
.bind(domain)
.fetch_all(get_pg!())
.await
.unwrap()
}
}
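url() reassembles the archive's /s/ viewer route from an index row, which is what lets the new MIME overview link straight to versioned captures. A quick check with made-up values, written as it could appear in a test inside this module:

#[test]
fn url_points_at_versioned_capture() {
    let idx = DocumentIndex {
        domain: "example.com".to_string(),
        path: "index.html".to_string(),
        version: chrono::NaiveDate::from_ymd_opt(2025, 2, 25).unwrap(),
        size: 0,
        mime: "application/pdf".to_string(),
    };
    // NaiveDate displays as ISO 8601, which gives ?time= its format.
    assert_eq!(idx.url(), "/s/example.com/index.html?time=2025-02-25");
}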


@@ -10,6 +10,8 @@ pub mod blacklist;
pub mod conf;
pub mod favicon;
use base64::prelude::*;
use sha2::{Digest, Sha256};
use std::io::Write;
use std::process::{Command, Stdio};
@@ -30,6 +32,33 @@ pub fn get_mime_type(content: &[u8]) -> std::io::Result<String> {
Ok(String::from_utf8_lossy(&output.stdout).trim().to_string())
}
/// Extract every base64 `data:` URL in `input` as a (MIME type, decoded bytes) pair.
/// Payloads that fail to decode are skipped.
pub fn extract_data_urls(input: &str) -> Vec<(String, Vec<u8>)> {
    let data_url_regex =
        regex::Regex::new(r"data:([a-zA-Z0-9/+.-]+);base64,([a-zA-Z0-9+/=]+)").unwrap();
    let mut results = Vec::new();
    for cap in data_url_regex.captures_iter(input) {
        if let (Some(mime), Some(encoded)) = (cap.get(1), cap.get(2)) {
            if let Ok(decoded) = BASE64_STANDARD.decode(encoded.as_str()) {
                results.push((mime.as_str().to_string(), decoded));
            }
        }
    }
    results
}

/// Decode a base64 string, panicking on invalid input.
pub fn from_base64(input: &str) -> Vec<u8> {
    BASE64_STANDARD.decode(input).unwrap()
}

/// Hex-encoded SHA-256 digest of `input`; fragments use this as their content address.
pub fn sha256_hash(input: &[u8]) -> String {
    let mut hasher = Sha256::new();
    hasher.update(input);
    format!("{:x}", hasher.finalize())
}
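extract_data_urls and sha256_hash compose into the indexer's dedup pipeline: decode every inline data: URL, then key the blob by its digest. A small round-trip check (the HTML snippet is made up; the payload is the well-known 1x1 transparent GIF):

fn main() {
    let html = r#"<img src="data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7">"#;
    for (mime, bytes) in extract_data_urls(html) {
        // Identical bytes always hash to the same id, which is what lets
        // INSERT ... ON CONFLICT DO NOTHING deduplicate fragments.
        println!("{mime}: {} bytes, id {}", bytes.len(), sha256_hash(&bytes));
    }
}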
pub async fn render_page(
content: PreEscaped<String>,
ctx: RequestContext,


@@ -58,7 +58,11 @@ async fn main() {
pages::favicon_route,
pages::vector_search,
pages::render_txt_website,
pages::timeline_route
pages::timeline_route,
pages::fragment_get,
pages::mime_overview,
pages::fragments_overview,
pages::fragments_domain_overview
],
)
.manage(arc)
@@ -187,7 +191,42 @@ pub fn get_shell() -> Shell {
Background(MinHeight(ScreenValue::screen, Text("").white())).color(Zinc::_950),
)
.use_ui()
.with_navbar(NavBar("Web Archive"))
.with_navbar(
NavBar("Web Archive").menu(
Div()
.vanish()
.push(
Link(
"/",
Row(vec![MaterialIcon("language"), Text("Websites").render()])
.full_center()
.gap(ScreenValue::_2),
)
.use_htmx(),
)
.push(
Link(
"/mime",
Row(vec![
MaterialIcon("description"),
Text("Media Documents").render(),
])
.full_center()
.gap(ScreenValue::_2),
)
.use_htmx(),
)
.push(
Link(
"/fragments",
Row(vec![MaterialIcon("database"), Text("Fragments").render()])
.full_center()
.gap(ScreenValue::_2),
)
.use_htmx(),
),
),
)
}
// TODO : archive cleanup code


@@ -1,3 +1,4 @@
use std::vec;
use std::{collections::HashMap, io::Read, path::PathBuf, sync::Arc};
use based::ui::components::prelude::*;
@@ -11,14 +12,19 @@ use based::{
ui::primitives::flex::Column,
};
use chrono::NaiveDate;
use itertools::Itertools;
use maud::{html, PreEscaped};
use rocket::response::Redirect;
use rocket::{get, request::FromSegments, State};
pub mod component;
use component::*;
use serde_json::json;
use webarc::archive::{internalize_urls, Document, DocumentIndex};
use webarc::archive::{
domain_has_fragments, get_fragment, get_fragments_of_domain, get_random_fragment_id,
internalize_urls, Document, DocumentIndex,
};
use webarc::get_mime_type;
use webarc::{
ai::{generate_embedding, remove_data_urls, EmbedStore, SearchResult},
@@ -27,8 +33,6 @@ use webarc::{
render_page,
};
// TODO : PDF view
const SEARCH_BAR_STYLE: &str = "w-full px-4 mb-4 py-2 text-white bg-black border-2 border-neon-blue placeholder-neon-blue focus:ring-2 focus:ring-neon-pink focus:outline-none font-mono text-lg";
#[allow(non_snake_case)]
@@ -46,7 +50,12 @@ fn time_ago(naive_datetime: &chrono::NaiveDate) -> String {
let duration = now.signed_duration_since(*naive_datetime);
if duration.num_seconds() < 60 {
format!("{} seconds ago", duration.num_seconds())
let s = duration.num_seconds();
if s == 0 {
    // Versions are whole dates, so a zero-length duration means "captured today".
    "today".to_string()
} else {
    format!("{s} seconds ago")
}
} else if duration.num_minutes() < 60 {
format!("{} minutes ago", duration.num_minutes())
} else if duration.num_hours() < 24 {
@@ -236,6 +245,8 @@ pub async fn domain_info_route(
.unwrap_or_default(),
));
// TODO : refactor
let content = html! {
h2 class="text-xl font-bold mb-4 flex items-center w-fit mx-auto" {
img class="p-2" src=(format!("/favicon/{}", &domain.name)) {};
@@ -430,6 +441,8 @@ pub async fn render_website(
let mime = get_mime_type(&content).unwrap_or("text/html".to_string());
if mime == "text/html" {
// TODO : fragments url rewrite
if get_config().ROUTE_INTERNAL {
content = internalize_urls(&String::from_utf8_lossy(&content), &domain)
.as_bytes()
@@ -460,6 +473,18 @@ pub fn gen_search_element(x: &SearchResult) -> PreEscaped<String> {
}
}
#[get("/f/<hash>")]
pub async fn fragment_get(hash: &str) -> Option<DataResponse> {
if hash == "random" {
let random_hash = get_random_fragment_id().await;
let (data, mime) = get_fragment(&random_hash).await?;
return Some(DataResponse::new(data, mime, None));
}
let (data, mime) = get_fragment(hash).await?;
Some(DataResponse::new(data, mime, Some(60 * 60 * 24)))
}
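Because /f/random resolves the id server-side, even the random endpoint returns fragment bytes directly; only stable hash lookups get the 24-hour cache header. A hedged smoke-test sketch using Rocket's local client (the rocket() factory is assumed; this commit builds its instance inline in main):

#[cfg(test)]
mod fragment_route_tests {
    use rocket::local::blocking::Client;

    #[test]
    fn random_fragment_responds() {
        // Assumes a rocket() -> rocket::Rocket<rocket::Build> constructor
        // that mounts fragment_get, and at least one stored fragment
        // (an empty store would panic inside get_random_fragment_id).
        let client = Client::tracked(crate::rocket()).unwrap();
        let response = client.get("/f/random").dispatch();
        assert_eq!(response.status(), rocket::http::Status::Ok);
    }
}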
#[get("/vector_search?<query>&<page>")]
pub async fn vector_search(
query: Option<&str>,
@@ -510,3 +535,217 @@
let content = search.build("", html! {});
Some(render_page(content, ctx, &shell).await)
}
// TODO : better MIME awareness in UI

#[allow(non_snake_case)]
pub fn MIMEIcon(mime: &str) -> PreEscaped<String> {
    // Map the MIME family to a Material icon; application/*, text/* and
    // anything unrecognized share the default document icon.
    if mime.starts_with("audio") {
        MaterialIcon("play_circle")
    } else if mime.starts_with("image") {
        MaterialIcon("photo")
    } else if mime.starts_with("video") {
        MaterialIcon("movie")
    } else {
        MaterialIcon("description")
    }
}
pub async fn mime_docs_of_domain(domain: &str) -> PreEscaped<String> {
let other_mimes = DocumentIndex::get_documents_of_other_mime(domain).await;
let by_mime = other_mimes
.into_iter()
.filter(|x| !x.mime.is_empty())
.into_group_map_by(|x| x.mime.clone());
// Debug tracing of the per-domain grouping; not user-facing output.
log::debug!("{domain} -> {by_mime:?}");
if by_mime.is_empty() {
    return Nothing();
}
let first = by_mime.keys().next().unwrap();
let mut t = Tabs().active(&format!("domain-{domain}-{first}"));
for (mime, docs) in by_mime {
t = t.add_tab(
&format!("domain-{}-{mime}", domain),
Row(vec![MIMEIcon(&mime), Text(&mime).render()])
.items_center()
.gap(ScreenValue::_2),
UnorderedList().push_for_each(&docs, |doc: &_| {
Link(
&doc.url(),
Context(
Hover(Background(Nothing()).color(Gray::_800)).on(Background(
Rounded(
Padding(
Margin(
Column(vec![
Text(&doc.domain),
Text(&doc.path),
Text(&doc.version.to_string()),
])
.gap(ScreenValue::_2),
)
.all(ScreenValue::_2),
)
.all(ScreenValue::_4),
)
.size(Size::Large),
)
.color(Gray::_700)),
),
)
}),
);
}
Column(vec![
Margin(
Row(vec![favicon(domain), Text(domain)._3xl().bold().render()]).gap(ScreenValue::_2),
)
.x(ScreenValue::_2)
.top(ScreenValue::_4)
.render(),
t.render(),
])
.gap(ScreenValue::_2)
.render()
}
#[get("/mime")]
pub async fn mime_overview(
ctx: RequestContext,
arc: &State<WebsiteArchive>,
shell: &State<Shell>,
) -> StringResponse {
let domains = arc.domains();
let mut d_html = Vec::new();
for d in domains {
d_html.push(mime_docs_of_domain(&d).await);
}
page!(
shell,
ctx,
"Media Items",
Margin(
Div()
.vanish()
.push(Text("Media Items")._2xl().bold())
.push_for_each(&d_html, |x: &_| x.render())
)
.all(ScreenValue::_4)
)
}
#[get("/fragments/<domain>")]
pub async fn fragments_domain_overview(
domain: &str,
ctx: RequestContext,
arc: &State<WebsiteArchive>,
shell: &State<Shell>,
) -> StringResponse {
let fragments = get_fragments_of_domain(domain).await;
let d_html: Vec<_> = fragments
.into_iter()
.map(|(fragment, mime)| {
if mime.starts_with("image") {
return Card(Image(&format!("/f/{fragment}")).height(128).width(128));
}
Card(
Tooltip(
Link(&format!("/f/{fragment}"), Text(&fragment)),
Text(&mime),
)
.white(),
)
})
.collect();
page!(
shell,
ctx,
"Media Items",
Margin(
Div()
.vanish()
.push(
Margin(Text(&format!("Fragments of {domain}"))._2xl().bold())
.bottom(ScreenValue::_4)
)
.push(
Flex(Div().vanish().push_for_each(&d_html, |x: &_| x.render()))
.wrap(Wrap::Wrap)
.gap(ScreenValue::_3)
.justify(Justify::Around)
)
)
.all(ScreenValue::_4)
)
}
#[get("/fragments")]
pub async fn fragments_overview(
ctx: RequestContext,
arc: &State<WebsiteArchive>,
shell: &State<Shell>,
) -> StringResponse {
let domains = arc.domains();
let mut d_html = Vec::new();
for x in domains {
if domain_has_fragments(&x).await {
d_html.push(
Link(
&format!("/fragments/{x}"),
Row(vec![favicon(&x), Text(&x).render()])
.items_center()
.render(),
)
.use_htmx()
.render(),
)
}
}
page!(
shell,
ctx,
"Media Items",
Margin(
Div()
.vanish()
.push(
Row(vec![
Margin(Text("Fragments")._2xl().bold())
.bottom(ScreenValue::_4)
.render(),
Margin(Button(
Row(vec![
MaterialIcon("casino"),
Link("/f/random", "Random Fragment").render()
])
.items_center()
.gap(ScreenValue::_2)
))
.bottom(ScreenValue::_6)
.render()
])
.gap(ScreenValue::_4)
.items_center()
)
.push_for_each(&d_html, |x: &_| x.render())
)
.all(ScreenValue::_4)
)
}