From 8df8edeeca1a4253938148c804096041dc68d616 Mon Sep 17 00:00:00 2001 From: JMARyA Date: Thu, 2 Jan 2025 19:00:47 +0100 Subject: [PATCH] update --- Cargo.lock | 115 ++++++++++++++++++++ Cargo.toml | 2 + README.md | 43 +++++++- env => config.toml | 15 +-- docker-compose.yml | 7 +- migrations/0001_embedding.sql | 4 +- src/ai.rs | 64 +++++++---- src/archive.rs | 49 +++++++-- src/args.rs | 61 +++++++++++ src/blacklist.rs | 16 +-- src/conf.rs | 70 ++++++++++++ src/lib.rs | 29 +++++ src/main.rs | 199 +++++++++++++++++++++++++++------- src/pages/component.rs | 22 ---- src/pages/mod.rs | 19 ++-- 15 files changed, 591 insertions(+), 124 deletions(-) rename env => config.toml (51%) create mode 100644 src/args.rs create mode 100644 src/conf.rs create mode 100644 src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 8e06abb..05db4e7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -59,6 +59,55 @@ dependencies = [ "libc", ] +[[package]] +name = "anstream" +version = "0.6.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" + +[[package]] +name = "anstyle-parse" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" +dependencies = [ + "windows-sys 0.59.0", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125" +dependencies = [ + "anstyle", + "windows-sys 0.59.0", +] + [[package]] name = "async-stream" version = "0.3.6" @@ -313,6 +362,52 @@ dependencies = [ "inout", ] +[[package]] +name = "clap" +version = "4.5.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3135e7ec2ef7b10c6ed8950f0f792ed96ee093fa088608f1c76e569722700c84" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.5.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30582fc632330df2bd26877bde0c1f4470d57c582bbc070376afcd04d8cb4838" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.5.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.93", +] + +[[package]] +name = "clap_lex" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" + +[[package]] +name = "colorchoice" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" + [[package]] name = "combine" 
version = "4.6.7" @@ -1351,6 +1446,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "is_terminal_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" + [[package]] name = "itoa" version = "1.0.14" @@ -2942,6 +3043,12 @@ dependencies = [ "unicode-properties", ] +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + [[package]] name = "subtle" version = "2.6.1" @@ -3480,6 +3587,12 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + [[package]] name = "uuid" version = "1.11.0" @@ -3622,6 +3735,7 @@ version = "0.1.0" dependencies = [ "based", "chrono", + "clap", "env_logger", "futures", "html2md", @@ -3636,6 +3750,7 @@ dependencies = [ "serde_json", "sqlx", "tokio", + "toml", "url", "uuid", ] diff --git a/Cargo.toml b/Cargo.toml index 1385b30..cdb2de4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,3 +22,5 @@ reqwest = "0.12.11" ollama-rs = "0.2.2" pgvector = { version = "0.4", features = ["sqlx"] } html2md = "0.2.14" +clap = { version = "4.5.23", features = ["cargo", "derive"] } +toml = "0.8.19" diff --git a/README.md b/README.md index 95400d7..784512c 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,47 @@ # WebArc `webarc` is a local website archive based on [monolith](https://github.com/Y2Z/monolith). -## Configuration -You can configure the application using environment variables: +## Archive Format +A web archive is defined as a directory containing domains in this structure: -- `$ROUTE_INTERNAL` : Rewrite links to point back to the archive itself -- `$DOWNLOAD_ON_DEMAND` : Download missing routes with monolith on demand -- `$BLACKLIST_DOMAINS` : Blacklisted domains (Comma-seperated regex, example: `google.com,.*.youtube.com`) +``` +web_archive/ +├─ domain.com/ +│ ├─ sub/ +│ │ ├─ path/ +│ │ │ ├─ index_YYYY_MM_DD.html +├─ sub.domain.com/ +``` + +Every document of this web archive can then be found at `archive/domain/paths.../index_YYYY_MM_DD.html`. ## Usage +webarc provides a CLI tool to work with the archive structure. + +```sh +# List domains in archive +webarc [--dir ARCHIVE] archive list [-j, --json] + +# List all paths on a domain +webarc [--dir ARCHIVE] archive list [-j, --json] [DOMAIN] + +# List all versions of a document +webarc [--dir ARCHIVE] archive versions [-j, --json] [DOMAIN] [PATH] + +# Get a document +# `--md` will return a markdown version +webarc [--dir ARCHIVE] archive get [--md] [DOMAIN] [PATH] [VERSION] + +# Archive a website +webarc [--dir ARCHIVE] archive download [URL] +``` + +## Configuration +You can configure the application using a config file. Look at the [config.toml](config.toml) file for more information. + +## Web Server +You can start a webserver serving an archive with `webarc serve`. + Archived pages can be viewed at `/s//`. For example, `/s/en.wikipedia.org/wiki/Website` will serve `en.wikipedia.org` at `/wiki/Website`. 
diff --git a/env b/config.toml similarity index 51% rename from env rename to config.toml index f0a8e54..6349bc0 100644 --- a/env +++ b/config.toml @@ -1,6 +1,3 @@ -# Logging -RUST_LOG=info -ROCKET_ADDRESS=0.0.0.0 # Rewrite links to point back to the archive itself ROUTE_INTERNAL=true @@ -8,12 +5,12 @@ ROUTE_INTERNAL=true # Download missing routes on demand DOWNLOAD_ON_DEMAND=true -# Blacklisted domains (Comma-seperated regex) +[websites] # You can blacklist sites which wont work well -BLACKLIST_DOMAINS="^gitlab" - -# Database -DATABASE_URL=postgres://user:pass@postgres/webarc +BLACKLIST_DOMAINS = [ + "^gitlab" # All domains starting with gitlab +] +[ai] # Ollama URL (Enables vector search) -OLLAMA_URL=127.0.0.1:11434 +OLLAMA_URL="127.0.0.1:11434" diff --git a/docker-compose.yml b/docker-compose.yml index 00690e8..a3858b1 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -6,7 +6,12 @@ services: volumes: - ./websites:/websites - ./favicon:/favicon - env_file: env + - ./config.toml:/config.toml + environment: + - "RUST_LOG=info" + - "ROCKET_ADDRESS=0.0.0.0" + - "DATABASE_URL=postgres://user:pass@postgres/webarc" + command: "webarc serve" postgres: # Any Postgres with support for pgvector diff --git a/migrations/0001_embedding.sql b/migrations/0001_embedding.sql index 15f027e..602f9cb 100644 --- a/migrations/0001_embedding.sql +++ b/migrations/0001_embedding.sql @@ -8,4 +8,6 @@ CREATE TABLE doc_embedding ( chunk INTEGER NOT NULL, embed_mxbai_embed_large vector(1024) NOT NULL, PRIMARY KEY (domain, path, ver, chunk) -) +); + +CREATE INDEX ON doc_embedding USING ivfflat (embed_mxbai_embed_large vector_cosine_ops) WITH (lists = 200); diff --git a/src/ai.rs b/src/ai.rs index a0d72d6..32069ae 100644 --- a/src/ai.rs +++ b/src/ai.rs @@ -6,10 +6,12 @@ use serde::Serialize; use serde_json::json; use sqlx::FromRow; -use crate::archive::{Document, Domain, WebsiteArchive}; +use crate::{ + archive::{Document, Domain, WebsiteArchive}, + conf::get_config, +}; -// TODO : Chunked embeddings + better search + ranking -// TODO : Real citese embeddings + search +// TODO : Cite found chunks in search res? 
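As an aside on the configuration format introduced above: the new `config.toml` layout deserializes into plain `serde` structs (the actual definitions are the `Config`, `WebsiteConfig`, and `AIConfig` structs added in `src/conf.rs` later in this patch). The following is a minimal standalone sketch of that mapping, trimmed to the fields shown in `config.toml` and not part of the patch itself; it assumes `serde` (with the `derive` feature) and the `toml` crate added by this change:

```rust
// Standalone sketch, not part of the patch: parse the new config layout with
// `toml` + `serde`, mirroring (in trimmed form) the structs added in src/conf.rs.
use serde::Deserialize;

#[allow(non_snake_case)]
#[derive(Debug, Deserialize)]
struct Config {
    ROUTE_INTERNAL: bool,
    DOWNLOAD_ON_DEMAND: bool,
    websites: Option<WebsiteConfig>, // [websites] section is optional
    ai: Option<AIConfig>,            // [ai] section enables vector search
}

#[allow(non_snake_case)]
#[derive(Debug, Deserialize)]
struct WebsiteConfig {
    BLACKLIST_DOMAINS: Option<Vec<String>>, // regexes matched against domains
}

#[allow(non_snake_case)]
#[derive(Debug, Deserialize)]
struct AIConfig {
    OLLAMA_URL: String, // "host:port" of the Ollama instance
}

fn main() {
    let raw = r#"
        ROUTE_INTERNAL = true
        DOWNLOAD_ON_DEMAND = true

        [websites]
        BLACKLIST_DOMAINS = ["^gitlab"]

        [ai]
        OLLAMA_URL = "127.0.0.1:11434"
    "#;

    let conf: Config = toml::from_str(raw).expect("invalid config");
    assert!(conf.ai.is_some());
    println!("{conf:?}");
}
```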
#[derive(Debug, Clone, FromRow, Serialize)] pub struct DocEmbedding { @@ -18,6 +20,7 @@ pub struct DocEmbedding { pub ver: String, pub chunk: i32, + #[allow(dead_code)] #[serde(skip)] embed_mxbai_embed_large: pgvector::Vector, @@ -25,24 +28,42 @@ pub struct DocEmbedding { pub similarity: f64, } +impl DocEmbedding { + pub async fn total_chunks(&self) -> i64 { + let res: (i64,) = sqlx::query_as( + "SELECT MAX(chunk) FROM doc_embedding WHERE domain = $1 AND path = $2 AND ver = $3", + ) + .bind(&self.domain) + .bind(&self.path) + .bind(&self.ver) + .fetch_one(get_pg!()) + .await + .unwrap(); + + res.0 + } +} + #[derive(Debug, Clone, Serialize)] pub struct SearchResult { pub domain: String, pub path: String, + pub total_chunks: i64, pub chunks: Vec, } impl SearchResult { - pub fn new(domain: String, path: String) -> Self { + pub fn new(domain: String, path: String, total_chunks: i64) -> Self { Self { domain, path, + total_chunks, chunks: vec![], } } pub fn similarity(&self) -> f64 { - total_score(&self.chunks) + total_score(&self.chunks) * (self.chunks.len() as f64 / self.total_chunks as f64) } } @@ -99,6 +120,13 @@ pub fn chunked(s: &str) -> Vec { .collect() } +fn remove_data_urls(input: &str) -> String { + let re = regex::Regex::new("data:(.*?)(;base64)?,(.*)").unwrap(); + + // Replace all occurrences of data URLs with an empty string + re.replace_all(input, "").to_string() +} + impl Embedding for Document { async fn embedding(&self, ver: Option) -> Option>> { let latest = "latest".to_string(); @@ -110,7 +138,7 @@ impl Embedding for Document { ); let content_html = self.render_local(ver.clone()).await?; - let content = html2md::parse_html(&content_html); + let content = remove_data_urls(&html2md::parse_html(&content_html)); let mut embeddings = Vec::new(); let content = chunked(&content); @@ -133,7 +161,8 @@ impl Embedding for Document { } pub async fn generate_embedding(mut input: String) -> Option> { - if let Ok(ollama_url) = std::env::var("OLLAMA_URL") { + // TODO : Ollama load balancing + if let Some(ollama_url) = get_config().ai.as_ref().map(|x| x.OLLAMA_URL.clone()) { let (host, port) = ollama_url.split_once(':')?; let ollama = ollama_rs::Ollama::new(format!("http://{host}"), port.parse().ok()?); @@ -231,13 +260,10 @@ impl EmbedStore { } pub async fn search_vector(v: &pgvector::Vector, limit: i64, offset: i64) -> Vec { - // TODO : fix search - // + new ranked algorithm - // + better repr // limit should cover SearchResults not the query -> rework let results: Vec = sqlx::query_as( - "SELECT *, 1 / (1 + (embed_mxbai_embed_large <-> $1)) AS similarity FROM doc_embedding ORDER BY embed_mxbai_embed_large <-> $1 LIMIT $2 OFFSET $3", + "SELECT *, 1 / (1 + (embed_mxbai_embed_large <-> $1)) AS similarity FROM doc_embedding ORDER BY embed_mxbai_embed_large <=> $1 LIMIT $2 OFFSET $3", ) .bind(v) .bind(limit) @@ -249,26 +275,24 @@ impl EmbedStore { let mut search_res: HashMap> = HashMap::new(); for res in results { - let domain = search_res - .entry(res.domain.clone()) - .or_insert(HashMap::new()); - let doc = domain - .entry(res.path.clone()) - .or_insert(SearchResult::new(res.domain.clone(), res.path.clone())); + let domain = search_res.entry(res.domain.clone()).or_default(); + let doc = domain.entry(res.path.clone()).or_insert(SearchResult::new( + res.domain.clone(), + res.path.clone(), + res.total_chunks().await, + )); doc.chunks.push(res); } let mut flat = search_res .into_values() - .map(|x| x.into_values().collect::>()) - .flatten() + .flat_map(|x| x.into_values().collect::>()) 
.collect::>(); flat.sort_by(|a, b| { b.similarity() .partial_cmp(&a.similarity()) .unwrap_or(std::cmp::Ordering::Equal) - .then(b.chunks.len().cmp(&a.chunks.len())) }); flat } diff --git a/src/archive.rs b/src/archive.rs index 4dea32f..9c03191 100644 --- a/src/archive.rs +++ b/src/archive.rs @@ -3,7 +3,7 @@ use std::{io::Read, path::PathBuf}; use based::{request::RequestContext, result::LogAndIgnore}; use maud::html; -use crate::{blacklist::check_blacklist, favicon::download_fav_for, pages::component::render_page}; +use crate::{blacklist::check_blacklist, conf::get_config, favicon::download_fav_for, render_page}; /// Read directory entries into `Vec` pub fn read_dir(dir: &PathBuf) -> Vec { @@ -22,16 +22,19 @@ pub fn read_dir(dir: &PathBuf) -> Vec { /// Rewrite all URLs in `input` to the format `/s//` fn internalize_urls(input: &str) -> String { - // TODO : Ignore blacklisted urls let url_pattern = r"https?://([a-zA-Z0-9.-]+)(/[\w./-]*)"; let re = regex::Regex::new(url_pattern).unwrap(); re.replace_all(input, |caps: ®ex::Captures| { - format!( - "/s/{}/{}", - &caps[1].trim_start_matches("www."), // Domain - &caps[2] // Path - ) + let domain = caps[1].trim_start_matches("www."); + let path = &caps[2]; + + // Dont transform if in blacklist + if check_blacklist(domain) { + return format!("https://{domain}/{path}"); + } + + format!("/s/{domain}/{path}") }) .to_string() } @@ -82,6 +85,23 @@ impl Domain { Document::new(&self.name, path, self.dir.parent().unwrap().to_path_buf()) } + /// Get all paths associated with the domain + pub fn all_paths(&self) -> Vec { + let mut queue = self.paths("/").0; + + let mut ret = Vec::new(); + + ret.push(PathEntry(self.name.clone(), "/".to_string())); + + while let Some(el) = queue.pop() { + ret.push(el.clone()); + let paths = self.paths(&el.1).0; + queue.extend(paths); + } + + ret + } + /// Retrieves entries and metadata for a given path within the domain. /// /// # Parameters @@ -98,6 +118,12 @@ impl Domain { base_path = base_path.join(p); } + let path = path + .split("/") + .filter(|x| !x.is_empty()) + .collect::>() + .join("/"); + let dir_content = read_dir(&base_path); let mut ret = Vec::new(); @@ -106,6 +132,11 @@ impl Domain { for entry in dir_content { let url_path = format!("{path}/{entry}"); + let url_path = url_path + .split("/") + .filter(|x| !x.is_empty()) + .collect::>() + .join("/"); if entry.starts_with("index_") && entry.ends_with(".html") { is_doc = true; continue; @@ -119,6 +150,7 @@ impl Domain { } /// Represents an entry within a domain's path, containing its name and URL path. +#[derive(Debug, Clone)] pub struct PathEntry(String, String); impl PathEntry { @@ -203,7 +235,7 @@ impl Document { .unwrap(); let content = String::from_utf8_lossy(&buf); - if std::env::var("ROUTE_INTERNAL").unwrap_or("false".to_string()) == "true" { + if get_config().ROUTE_INTERNAL { Some(internalize_urls(&content)) } else { Some(content.to_string()) @@ -291,6 +323,7 @@ impl WebsiteArchive { /// /// This function downloads the content of the URL, processes it, and saves it to the archive. 
pub async fn archive_url(&self, url: &str) { + // TODO : refactor let parsed_url = url::Url::parse(url).unwrap(); let domain = parsed_url.domain().unwrap().trim_start_matches("www."); diff --git a/src/args.rs b/src/args.rs new file mode 100644 index 0000000..c362440 --- /dev/null +++ b/src/args.rs @@ -0,0 +1,61 @@ +use clap::{arg, command}; + +pub fn get_args() -> clap::ArgMatches { + command!() + .about("Web Archive") + .arg( + arg!(-d --dir "Web archive directory") + .required(false) + .default_value("./websites"), + ) + .subcommand( + command!() + .name("serve") + .about("Start web archive server") + .arg( + arg!(-c --config "Web archive config file") + .required(false) + .default_value("./config.toml"), + ), + ) + .subcommand( + command!() + .name("archive") + .about("Work with web archives") + .subcommand( + command!() + .name("download") + .about("Download a new URL into the archive") + .arg( + arg!(-c --config "Web archive config file") + .required(false) + .default_value("./config.toml"), + ) + .arg(arg!([URL] "The URL to download").required(true)) + ) + .subcommand( + command!() + .name("list") + .about("List domains contained in the archive. If a domain is provided all paths of this domain will be listed.") + .arg(arg!([DOMAIN] "A domain to list").required(false)) + .arg(arg!(-j --json "Ouput JSON").required(false)), + ) + .subcommand( + command!() + .name("versions") + .about("List saved versions of a document") + .arg(arg!(-j --json "Ouput JSON").required(false)) + .arg(arg!([DOMAIN] "A domain").required(true)) + .arg(arg!([PATH] "A path").required(false)) + ) + .subcommand( + command!() + .name("get") + .about("Get a saved document") + .arg(arg!(--md "Ouput Markdown").required(false)) + .arg(arg!([DOMAIN] "A domain").required(true)) + .arg(arg!([PATH] "A path").required(false)) + .arg(arg!([VERSION] "A version").required(false)) + )) + .get_matches() +} diff --git a/src/blacklist.rs b/src/blacklist.rs index 84ea10f..8ff228f 100644 --- a/src/blacklist.rs +++ b/src/blacklist.rs @@ -1,17 +1,17 @@ +use crate::conf::get_config; + /// Checks if a domain is present in the blacklist of unwanted domains. /// -/// This function checks the `$BLACKLIST_DOMAINS` environment variable for a comma-separated list of regular expressions to match against. /// If a match is found, it immediately returns `true`. Otherwise, it returns `false`. 
pub fn check_blacklist(domain: &str) -> bool {
-    let blacklist_raw = std::env::var("BLACKLIST_DOMAINS").unwrap_or_default();
+    let conf = get_config();
+    let conf = conf.websites.as_ref();

-    if blacklist_raw.is_empty() {
-        return false;
-    }
+    let blacklisted_domains = conf
+        .map(|x| x.BLACKLIST_DOMAINS.as_ref())
+        .unwrap_or_default();

-    let blacklist: Vec<&str> = blacklist_raw.split(',').collect();
-
-    for domain_regex in blacklist {
+    for domain_regex in blacklisted_domains.unwrap_or(&Vec::new()) {
         let rgx = regex::Regex::new(domain_regex).unwrap();
         if rgx.is_match(domain) {
             return true;
diff --git a/src/conf.rs b/src/conf.rs
new file mode 100644
index 0000000..3b043d6
--- /dev/null
+++ b/src/conf.rs
@@ -0,0 +1,70 @@
+use std::sync::Arc;
+
+use serde::Deserialize;
+use tokio::sync::OnceCell;
+
+pub static CONFIG: OnceCell<Arc<Config>> = OnceCell::const_new();
+
+/// Get a reference to global config
+pub fn get_config() -> &'static Arc<Config> {
+    crate::conf::CONFIG.get().unwrap()
+}
+
+/// Load a global config
+pub fn load_config(path: &str) {
+    // TODO : Other load locations
+    if let Ok(file_content) = std::fs::read_to_string(path) {
+        let conf: Config =
+            toml::from_str(&file_content).expect("Could not deserialize config file");
+        crate::conf::CONFIG.set(std::sync::Arc::new(conf)).unwrap();
+    }
+}
+
+/// Load a default global config
+pub fn load_default_config() {
+    if crate::conf::CONFIG.get().is_none() {
+        crate::conf::CONFIG
+            .set(std::sync::Arc::new(Config::default()))
+            .unwrap();
+    }
+}
+
+#[allow(non_snake_case)]
+#[derive(Debug, Clone, Deserialize)]
+pub struct Config {
+    pub ROUTE_INTERNAL: bool,
+    pub DOWNLOAD_ON_DEMAND: bool,
+    pub ai: Option<AIConfig>,
+    pub websites: Option<WebsiteConfig>,
+}
+
+#[allow(non_snake_case)]
+#[derive(Debug, Clone, Deserialize)]
+pub struct AIConfig {
+    pub OLLAMA_URL: String,
+}
+
+#[allow(non_snake_case)]
+#[derive(Debug, Clone, Deserialize)]
+pub struct WebsiteConfig {
+    pub BLACKLIST_DOMAINS: Option<Vec<String>>,
+    pub domains: Option>,
+}
+
+#[derive(Debug, Clone, Deserialize)]
+pub struct DomainConfig {
+    // TODO : Domain specific config
+    pub blacklist_paths: Option>,
+    pub no_javascript: bool,
+}
+
+impl Default for Config {
+    fn default() -> Self {
+        Self {
+            ROUTE_INTERNAL: false,
+            DOWNLOAD_ON_DEMAND: false,
+            ai: None,
+            websites: None,
+        }
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..9fb72e9
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,29 @@
+use based::{
+    page::Shell,
+    request::{RequestContext, StringResponse},
+};
+use maud::{html, PreEscaped};
+
+pub mod ai;
+pub mod archive;
+pub mod blacklist;
+pub mod conf;
+pub mod favicon;
+
+pub async fn render_page(content: PreEscaped<String>, ctx: RequestContext) -> StringResponse {
+    based::page::render_page(
+        content,
+        "Website Archive",
+        ctx,
+        &Shell::new(
+            html! {
+                script src="https://cdn.tailwindcss.com" {};
+                meta name="viewport" content="width=device-width, initial-scale=1.0" {};
+                script src="/assets/htmx.min.js" {};
+            },
+            html!
{}, + Some("bg-zinc-950 text-white min-h-screen flex pt-8 justify-center".to_string()), + ), + ) + .await +} diff --git a/src/main.rs b/src/main.rs index 2ae11f7..7896697 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,49 +1,170 @@ -use ai::EmbedStore; -use archive::WebsiteArchive; use based::get_pg; use rocket::routes; +use webarc::ai::EmbedStore; +use webarc::archive::WebsiteArchive; +use webarc::conf::{get_config, load_config, load_default_config}; -mod ai; -mod archive; -mod blacklist; -mod favicon; +mod args; mod pages; -#[rocket::launch] -async fn launch() -> _ { +#[tokio::main] +async fn main() { env_logger::init(); - let arc = WebsiteArchive::new("./websites"); + let args = args::get_args(); - if std::env::var("DATABASE_URL").is_ok() { - let pg = get_pg!(); - sqlx::migrate!("./migrations").run(pg).await.unwrap(); + let archive_dir: &String = args.get_one("dir").unwrap(); + + match args.subcommand() { + Some(("serve", serve_args)) => { + let config: &String = serve_args.get_one("config").unwrap(); + load_config(config); + + let arc = WebsiteArchive::new(archive_dir); + + if std::env::var("DATABASE_URL").is_ok() { + let pg = get_pg!(); + sqlx::migrate!("./migrations").run(pg).await.unwrap(); + } + + let archive = arc.clone(); + if get_config().ai.is_some() { + tokio::spawn(async move { + EmbedStore::generate_embeddings_for(&archive).await; + }); + } + + let archive = arc.clone(); + tokio::spawn(async move { + webarc::favicon::download_favicons_for_sites(&archive.domains()).await; + }); + + rocket::build() + .mount( + "/", + routes![ + based::htmx::htmx_script_route, + pages::index, + pages::render_website, + pages::domain_info_route, + pages::favicon_route, + pages::vector_search, + pages::render_txt_website + ], + ) + .manage(arc) + .launch() + .await + .unwrap(); + } + Some(("archive", archive_args)) => { + let arc = WebsiteArchive::new(archive_dir); + + match archive_args.subcommand() { + Some(("list", list_args)) => { + let json = list_args.get_flag("json"); + + load_default_config(); + + let elements = if let Some(domain) = list_args.get_one::("DOMAIN") { + arc.get_domain(domain) + .all_paths() + .into_iter() + .map(|x| x.path().clone()) + .collect() + } else { + arc.domains() + }; + + if json { + println!( + "{}", + serde_json::to_string(&serde_json::json!(elements)).unwrap() + ); + } else { + if let Some(domain) = list_args.get_one::("DOMAIN") { + println!("Paths in {domain}:"); + } else { + println!("Domains in {}:", archive_dir); + } + + if elements.is_empty() { + println!("No domains"); + } + + for d in elements { + println!("- {d}"); + } + } + } + Some(("download", dl_args)) => { + let url: &String = dl_args.get_one("URL").unwrap(); + + let config: &String = dl_args.get_one("config").unwrap(); + load_config(config); + + arc.archive_url(url).await; + println!("Saved {url} to archive"); + } + Some(("versions", ver_args)) => { + load_default_config(); + + let domain: &String = ver_args.get_one("DOMAIN").unwrap(); + let path: String = if let Some(path) = ver_args.get_one::("PATH") { + path.clone() + } else { + "/".to_string() + }; + let versions = arc.get_domain(domain).path(&path).versions(); + + let json = ver_args.get_flag("json"); + + if json { + println!("{}", serde_json::to_string(&versions).unwrap()); + } else { + println!("Versions for {domain} / {path}:"); + for v in versions { + println!("- {v}"); + } + } + } + Some(("get", get_args)) => { + load_default_config(); + + let domain: &String = get_args.get_one("DOMAIN").unwrap(); + let path = if let Some(path) = 
get_args.get_one::("PATH") { + path.clone() + } else { + "/".to_string() + }; + let doc = arc.get_domain(domain).path(&path); + let ver = if let Some(ver) = get_args.get_one::("VERSION") { + ver.clone() + } else { + doc.versions().first().unwrap().clone() + }; + + let md = get_args.get_flag("md"); + + let content = doc.render_local(Some(ver)).await; + + if content.is_none() { + println!("No document found"); + std::process::exit(1); + } + + if md { + let markdown = html2md::parse_html(&content.unwrap()); + println!("{markdown}"); + } else { + println!("{}", content.unwrap()); + } + } + Some((&_, _)) => {} + None => {} + }; + } + Some((&_, _)) => {} + None => {} } - - let archive = arc.clone(); - if std::env::var("OLLAMA_URL").is_ok() { - tokio::spawn(async move { - EmbedStore::generate_embeddings_for(&archive).await; - }); - } - - let archive = arc.clone(); - tokio::spawn(async move { - favicon::download_favicons_for_sites(&archive.domains()).await; - }); - - rocket::build() - .mount( - "/", - routes![ - based::htmx::htmx_script_route, - pages::index, - pages::render_website, - pages::domain_info_route, - pages::favicon_route, - pages::vector_search, - pages::render_txt_website - ], - ) - .manage(arc) } diff --git a/src/pages/component.rs b/src/pages/component.rs index ceed7cb..93755cb 100644 --- a/src/pages/component.rs +++ b/src/pages/component.rs @@ -1,7 +1,3 @@ -use based::{ - page::Shell, - request::{RequestContext, StringResponse}, -}; use maud::{html, PreEscaped}; /// Generates an SVG arrow icon with the specified color. @@ -78,24 +74,6 @@ pub fn gen_path_header( } } -pub async fn render_page(content: PreEscaped, ctx: RequestContext) -> StringResponse { - based::page::render_page( - content, - "Website Archive", - ctx, - &Shell::new( - html! { - script src="https://cdn.tailwindcss.com" {}; - meta name="viewport" content="width=device-width, initial-scale=1.0" {}; - script src="/assets/htmx.min.js" {}; - }, - html! {}, - Some("bg-zinc-950 text-white min-h-screen flex pt-8 justify-center".to_string()), - ), - ) - .await -} - pub fn favicon(site: &str) -> PreEscaped { html! 
{ img class="h-8 w-8 m-2" src=(format!("/favicon/{site}")) {}; diff --git a/src/pages/mod.rs b/src/pages/mod.rs index e4bc4d1..9df6588 100644 --- a/src/pages/mod.rs +++ b/src/pages/mod.rs @@ -13,12 +13,14 @@ pub mod component; use component::*; use serde_json::json; -use crate::{ +use webarc::{ ai::{generate_embedding, EmbedStore, SearchResult}, archive::WebsiteArchive, + conf::get_config, + render_page, }; -const SEARCH_BAR_STYLE: &'static str = "w-full px-4 mb-4 py-2 text-white bg-black border-2 border-neon-blue placeholder-neon-blue focus:ring-2 focus:ring-neon-pink focus:outline-none font-mono text-lg"; +const SEARCH_BAR_STYLE: &str = "w-full px-4 mb-4 py-2 text-white bg-black border-2 border-neon-blue placeholder-neon-blue focus:ring-2 focus:ring-neon-pink focus:outline-none font-mono text-lg"; /// Get the favicon of a domain #[get("/favicon/")] @@ -29,6 +31,8 @@ pub async fn favicon_route(domain: &str) -> Option { .read_to_end(&mut buf) .ok()?; + // TODO : Default favicon + Some(DataResponse::new( buf, "image/x-icon".to_string(), @@ -171,12 +175,7 @@ pub async fn render_website( "text/html".to_string(), Some(60 * 60 * 24), )); - } else if std::env::var("DOWNLOAD_ON_DEMAND") - .unwrap_or("false".to_string()) - .as_str() - == "true" - && time.is_none() - { + } else if get_config().DOWNLOAD_ON_DEMAND && time.is_none() { arc.archive_url(&format!("https://{domain}/{}", path.to_str().unwrap())) .await; @@ -213,9 +212,7 @@ pub async fn vector_search( page: Option, ctx: RequestContext, ) -> Option { - if std::env::var("OLLAMA_URL").is_err() { - return None; - } + get_config().ai.as_ref()?; let page = page.unwrap_or(1);