Compare commits: ollama ... main (23 commits)

| SHA | Message | CI (ci/woodpecker/push/build) | Date |
| --- | --- | --- | --- |
| b530ae4dc3 | fix | failed | 2025-01-19 01:44:44 +01:00 |
| c828c00352 | fix | failed | 2025-01-19 01:14:29 +01:00 |
| c85db8ac2a | fix | failed | 2025-01-19 01:01:36 +01:00 |
| 20aeb7edac | fix urls | failed | 2025-01-14 19:10:49 +01:00 |
| 3696f61b02 | fix + refactor | failed | 2025-01-11 16:21:15 +01:00 |
| 56f13c6524 | fix | failed | 2025-01-03 16:54:36 +01:00 |
| 536f42a4e8 | add per domain config | failed | 2025-01-03 13:34:59 +01:00 |
| dc10052c16 | update | failed | 2025-01-03 00:20:22 +01:00 |
| a4a60c86df | update | failed | 2025-01-02 23:35:41 +01:00 |
| 6700d4d817 | fix | failed | 2025-01-02 22:56:51 +01:00 |
| 586d3f4c0c | fix | failed | 2025-01-02 19:35:37 +01:00 |
| a21fd44f64 | fix | failed | 2025-01-02 19:10:27 +01:00 |
| 8df8edeeca | update | failed | 2025-01-02 19:00:47 +01:00 |
| 0f6e5f5b10 | search | failed | 2024-12-31 17:42:56 +01:00 |
| 3ed87eaba2 | txt route | failed | 2024-12-31 17:41:52 +01:00 |
| 0e5ca89f1d | fix ordering | failed | 2024-12-31 03:18:14 +01:00 |
| 38287e77e7 | increase chunk size | failed | 2024-12-31 02:30:21 +01:00 |
| 6aea22576c | update chunked embed | failed | 2024-12-31 02:03:03 +01:00 |
| e50d31479c | fix | n/a | 2024-12-30 23:21:48 +01:00 |
| 5cbc7ef0d2 | fix | running | 2024-12-30 23:08:38 +01:00 |
| 37cd37018f | fix fav | failed | 2024-12-30 22:14:39 +01:00 |
| c3d22f8e89 | fix | failed | 2024-12-30 22:06:15 +01:00 |
| 69a9eb4d9d | refactor | failed | 2024-12-30 21:52:48 +01:00 |
20 changed files with 1381 additions and 474 deletions

Cargo.lock (generated, 125 lines changed)

@ -59,6 +59,55 @@ dependencies = [
"libc",
]
[[package]]
name = "anstream"
version = "0.6.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b"
dependencies = [
"anstyle",
"anstyle-parse",
"anstyle-query",
"anstyle-wincon",
"colorchoice",
"is_terminal_polyfill",
"utf8parse",
]
[[package]]
name = "anstyle"
version = "1.0.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9"
[[package]]
name = "anstyle-parse"
version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9"
dependencies = [
"utf8parse",
]
[[package]]
name = "anstyle-query"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c"
dependencies = [
"windows-sys 0.59.0",
]
[[package]]
name = "anstyle-wincon"
version = "3.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125"
dependencies = [
"anstyle",
"windows-sys 0.59.0",
]
[[package]]
name = "async-stream"
version = "0.3.6"
@ -313,6 +362,52 @@ dependencies = [
"inout",
]
[[package]]
name = "clap"
version = "4.5.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3135e7ec2ef7b10c6ed8950f0f792ed96ee093fa088608f1c76e569722700c84"
dependencies = [
"clap_builder",
"clap_derive",
]
[[package]]
name = "clap_builder"
version = "4.5.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "30582fc632330df2bd26877bde0c1f4470d57c582bbc070376afcd04d8cb4838"
dependencies = [
"anstream",
"anstyle",
"clap_lex",
"strsim",
]
[[package]]
name = "clap_derive"
version = "4.5.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab"
dependencies = [
"heck",
"proc-macro2",
"quote",
"syn 2.0.93",
]
[[package]]
name = "clap_lex"
version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6"
[[package]]
name = "colorchoice"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990"
[[package]]
name = "combine"
version = "4.6.7"
@ -1351,6 +1446,12 @@ dependencies = [
"windows-sys 0.52.0",
]
[[package]]
name = "is_terminal_polyfill"
version = "1.70.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
[[package]]
name = "itoa"
version = "1.0.14"
@ -2942,6 +3043,12 @@ dependencies = [
"unicode-properties",
]
[[package]]
name = "strsim"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
[[package]]
name = "subtle"
version = "2.6.1"
@ -3462,6 +3569,15 @@ dependencies = [
"percent-encoding",
]
[[package]]
name = "url-escape"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "44e0ce4d1246d075ca5abec4b41d33e87a6054d08e2366b63205665e950db218"
dependencies = [
"percent-encoding",
]
[[package]]
name = "utf-8"
version = "0.7.6"
@ -3480,6 +3596,12 @@ version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
[[package]]
name = "utf8parse"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
[[package]]
name = "uuid"
version = "1.11.0"
@ -3622,6 +3744,7 @@ version = "0.1.0"
dependencies = [
"based",
"chrono",
"clap",
"env_logger",
"futures",
"html2md",
@ -3636,7 +3759,9 @@ dependencies = [
"serde_json",
"sqlx",
"tokio",
"toml",
"url",
"url-escape",
"uuid",
]

Cargo.toml

@ -22,3 +22,6 @@ reqwest = "0.12.11"
ollama-rs = "0.2.2"
pgvector = { version = "0.4", features = ["sqlx"] }
html2md = "0.2.14"
clap = { version = "4.5.23", features = ["cargo", "derive"] }
toml = "0.8.19"
url-escape = "0.1.1"

Dockerfile

@ -1,15 +1,15 @@
FROM rust:buster as builder
COPY . /app
WORKDIR /app
FROM rust:buster AS builder
RUN rustup default nightly
RUN cargo build --release
RUN git clone "https://github.com/Y2Z/monolith" /monolith
WORKDIR /monolith
RUN cargo build --release
COPY . /app
WORKDIR /app
RUN cargo build --release
FROM debian:buster
RUN apt-get update && apt-get upgrade -y

README.md

@ -1,14 +1,47 @@
# WebArc
`webarc` is a local website archive based on [monolith](https://github.com/Y2Z/monolith).
## Configuration
You can configure the application using environment variables:
## Archive Format
A web archive is defined as a directory containing domains in this structure:
- `$ROUTE_INTERNAL` : Rewrite links to point back to the archive itself
- `$DOWNLOAD_ON_DEMAND` : Download missing routes with monolith on demand
- `$BLACKLIST_DOMAINS` : Blacklisted domains (Comma-separated regex, example: `google.com,.*.youtube.com`)
```
web_archive/
├─ domain.com/
│ ├─ sub/
│ │ ├─ path/
│ │ │ ├─ index_YYYY_MM_DD.html
├─ sub.domain.com/
```
Every document of this web archive can then be found at `archive/domain/paths.../index_YYYY_MM_DD.html`.
## Usage
webarc provides a CLI tool to work with the archive structure.
```sh
# List domains in archive
webarc [--dir ARCHIVE] archive list [-j, --json]
# List all paths on a domain
webarc [--dir ARCHIVE] archive list [-j, --json] [DOMAIN]
# List all versions of a document
webarc [--dir ARCHIVE] archive versions [-j, --json] [DOMAIN] [PATH]
# Get a document
# `--md` will return a markdown version
webarc [--dir ARCHIVE] archive get [--md] [DOMAIN] [PATH] [VERSION]
# Archive a website
webarc [--dir ARCHIVE] archive download [URL]
```
## Configuration
You can configure the application using a config file. Look at the [config.toml](config.toml) file for more information.
## Web Server
You can start a webserver serving an archive with `webarc serve`.
Archived pages can be viewed at `/s/<domain>/<path..>`.
For example, `/s/en.wikipedia.org/wiki/Website` will serve `en.wikipedia.org` at `/wiki/Website`.

config.toml (new file, 73 lines)

@ -0,0 +1,73 @@
# Rewrite links to point back to the archive itself
ROUTE_INTERNAL=true
# Download missing routes on demand
DOWNLOAD_ON_DEMAND=true
[websites]
# You can blacklist sites which won't work well
BLACKLIST_DOMAINS = [
"^gitlab", # All domains starting with gitlab
"youtube" # YouTube
]
# Domain configuration (Example)
[[websites.domains]]
# The domain the config applies to
domain = "example.com"
# Blacklisted Path (Regexes)
blacklist_paths = ["/.*"]
# Exclude <audio> tags
no_audio = false
# Exclude <video> tags
no_video = false
# Exclude <img> tags
no_image = false
# Exclude CSS
no_css = false
# Exclude Javascript
no_javascript = false
# Exclude fonts
no_fonts = false
# Exclude iframes
no_frames = false
# User Agent
user_agent = "Safari"
[ai]
# Ollama URL (Enables vector search)
OLLAMA_URL="127.0.0.1:11434"
# --- Website Config
[[websites.domains]]
domain = "developer.mozilla.org"
no_javascript = true
[[websites.domains]]
domain = "github.com"
no_javascript = true
[[websites.domains]]
domain = "en.wikipedia.org"
no_javascript = true
[[websites.domains]]
domain = "api.flutter.dev"
no_javascript = true
no_video = true
[[websites.domains]]
domain = "docs.flutter.dev"
no_javascript = true
no_video = true

docker-compose.yml

@ -6,7 +6,12 @@ services:
volumes:
- ./websites:/websites
- ./favicon:/favicon
env_file: env
- ./config.toml:/config.toml
environment:
- "RUST_LOG=info"
- "ROCKET_ADDRESS=0.0.0.0"
- "DATABASE_URL=postgres://user:pass@postgres/webarc"
command: "/webarc serve"
postgres:
# Any Postgres with support for pgvector

env (deleted, 18 lines)

@ -1,18 +0,0 @@
# Logging
RUST_LOG=info
ROCKET_ADDRESS=0.0.0.0
# Rewrite links to point back to the archive itself
ROUTE_INTERNAL=true
# Download missing routes on demand
DOWNLOAD_ON_DEMAND=true
# Blacklisted domains (Comma-separated regex)
BLACKLIST_DOMAINS="google.com,.*.youtube.com"
# Database
DATABASE_URL=postgres://user:pass@postgres/webarc
# Ollama URL (Enables vector search)
OLLAMA_URL=127.0.0.1:11434

migrations (SQL)

@ -5,6 +5,9 @@ CREATE TABLE doc_embedding (
domain VARCHAR(500) NOT NULL,
path VARCHAR(1000) NOT NULL,
ver VARCHAR(10) NOT NULL,
chunk INTEGER NOT NULL,
embed_mxbai_embed_large vector(1024) NOT NULL,
PRIMARY KEY (domain, path, ver)
)
PRIMARY KEY (domain, path, ver, chunk)
);
CREATE INDEX ON doc_embedding USING ivfflat (embed_mxbai_embed_large vector_cosine_ops) WITH (lists = 200);

src/ai.rs (181 lines changed)

@ -1,4 +1,4 @@
use std::collections::VecDeque;
use std::collections::{HashMap, VecDeque};
use based::{get_pg, request::api::ToAPI, result::LogNoneAndPass};
use ollama_rs::generation::embeddings::request::{EmbeddingsInput, GenerateEmbeddingsRequest};
@ -6,14 +6,21 @@ use serde::Serialize;
use serde_json::json;
use sqlx::FromRow;
use crate::archive::{Document, Domain, WebsiteArchive};
use crate::{
archive::{Document, Domain, WebsiteArchive},
conf::get_config,
};
// TODO : Cite found chunks in search res?
#[derive(Debug, Clone, FromRow, Serialize)]
pub struct DocEmbedding {
pub domain: String,
pub path: String,
pub ver: String,
pub chunk: i32,
#[allow(dead_code)]
#[serde(skip)]
embed_mxbai_embed_large: pgvector::Vector,
@ -21,6 +28,73 @@ pub struct DocEmbedding {
pub similarity: f64,
}
impl DocEmbedding {
pub async fn total_chunks(&self) -> i64 {
let res: (i64,) = sqlx::query_as(
"SELECT COUNT(chunk) FROM doc_embedding WHERE domain = $1 AND path = $2",
)
.bind(&self.domain)
.bind(&self.path)
.fetch_one(get_pg!())
.await
.unwrap();
res.0
}
}
#[derive(Debug, Clone, Serialize)]
pub struct SearchResult {
pub domain: String,
pub path: String,
pub total_chunks: i64,
pub chunks: Vec<DocEmbedding>,
}
impl SearchResult {
pub fn new(domain: String, path: String, total_chunks: i64) -> Self {
Self {
domain,
path,
total_chunks,
chunks: vec![],
}
}
pub fn similarity(&self) -> f64 {
let chunks = f64::from(self.chunks.len() as u32);
let total = f64::from(self.total_chunks as i32);
let match_percent = chunks / total;
total_score(&self.chunks) * match_percent
}
}
pub fn avg_sim(e: &[DocEmbedding]) -> f64 {
let mut score = 0.0;
for e in e {
score += e.similarity;
}
score / e.len() as f64
}
pub fn max_sim(e: &[DocEmbedding]) -> f64 {
let mut score = 0.0;
for e in e {
if e.similarity > score {
score = e.similarity;
}
}
score
}
pub fn total_score(e: &[DocEmbedding]) -> f64 {
(avg_sim(e) + max_sim(e)) / 2.0
}
impl ToAPI for DocEmbedding {
async fn api(&self) -> serde_json::Value {
json!({
@ -33,12 +107,30 @@ impl ToAPI for DocEmbedding {
}
pub trait Embedding {
fn embedding(&self, ver: Option<String>)
-> impl std::future::Future<Output = Option<Vec<f32>>>;
fn embedding(
&self,
ver: Option<String>,
) -> impl std::future::Future<Output = Option<Vec<Vec<f32>>>>;
}
pub fn chunked(s: &str) -> Vec<String> {
const CHUNK_SIZE: usize = 2500;
s.chars()
.collect::<Vec<char>>()
.chunks(CHUNK_SIZE)
.map(|chunk| chunk.iter().collect())
.collect()
}
pub fn remove_data_urls(input: &str) -> String {
let re = regex::Regex::new("data:(.*?)(;base64)?,(.*)").unwrap();
// Replace all occurrences of data URLs with an empty string
re.replace_all(input, "").to_string()
}
impl Embedding for Document {
async fn embedding(&self, ver: Option<String>) -> Option<Vec<f32>> {
async fn embedding(&self, ver: Option<String>) -> Option<Vec<Vec<f32>>> {
let latest = "latest".to_string();
log::info!(
"Generating Vector embeddings for {} / {} @ {}",
@ -47,14 +139,32 @@ impl Embedding for Document {
ver.as_ref().unwrap_or(&latest)
);
let content_html = self.render_local(ver).await?;
let content = html2md::parse_html(&content_html);
generate_embedding(content).await
let content_html = self.render_local(ver.clone()).await?;
let content = remove_data_urls(&html2md::parse_html(&content_html));
let mut embeddings = Vec::new();
let content = chunked(&content);
let len = content.len();
for (index, c) in content.into_iter().enumerate() {
log::info!(
"Generating Vector embeddings for {} / {} @ {} [ {} / {} ]",
self.domain,
self.path,
ver.as_ref().unwrap_or(&latest),
index + 1,
len
);
embeddings.push(generate_embedding(c).await?);
}
Some(embeddings)
}
}
pub async fn generate_embedding(mut input: String) -> Option<Vec<f32>> {
if let Ok(ollama_url) = std::env::var("OLLAMA_URL") {
// TODO : Ollama load balancing
if let Some(ollama_url) = get_config().ai.as_ref().map(|x| x.OLLAMA_URL.clone()) {
let (host, port) = ollama_url.split_once(':')?;
let ollama = ollama_rs::Ollama::new(format!("http://{host}"), port.parse().ok()?);
@ -129,14 +239,17 @@ impl EmbedStore {
.execute(get_pg!())
.await;
sqlx::query("INSERT INTO doc_embedding VALUES ($1, $2, $3, $4)")
.bind(&doc.domain)
.bind(&doc.path)
.bind(ver)
.bind(embed)
.execute(get_pg!())
.await
.unwrap();
for (index, embed) in embed.iter().enumerate() {
sqlx::query("INSERT INTO doc_embedding VALUES ($1, $2, $3, $4, $5)")
.bind(&doc.domain)
.bind(&doc.path)
.bind(ver)
.bind(index as i64)
.bind(embed)
.execute(get_pg!())
.await
.unwrap();
}
}
}
@ -148,16 +261,42 @@ impl EmbedStore {
}
}
pub async fn search_vector(v: &pgvector::Vector, limit: i64, offset: i64) -> Vec<DocEmbedding> {
sqlx::query_as(
"SELECT *, 1 / (1 + (embed_mxbai_embed_large <-> $1)) AS similarity FROM doc_embedding ORDER BY embed_mxbai_embed_large <-> $1 LIMIT $2 OFFSET $3",
pub async fn search_vector(v: &pgvector::Vector, limit: i64, offset: i64) -> Vec<SearchResult> {
// limit should cover SearchResults not the query -> rework
let results: Vec<DocEmbedding> = sqlx::query_as(
"SELECT *, 1 / (1 + (embed_mxbai_embed_large <-> $1)) AS similarity FROM doc_embedding ORDER BY embed_mxbai_embed_large <=> $1 LIMIT $2 OFFSET $3",
)
.bind(v)
.bind(limit)
.bind(offset)
.fetch_all(get_pg!())
.await
.unwrap()
.unwrap();
let mut search_res: HashMap<String, HashMap<String, SearchResult>> = HashMap::new();
for res in results {
let domain = search_res.entry(res.domain.clone()).or_default();
let doc = domain.entry(res.path.clone()).or_insert(SearchResult::new(
res.domain.clone(),
res.path.clone(),
res.total_chunks().await,
));
doc.chunks.push(res);
}
let mut flat = search_res
.into_values()
.flat_map(|x| x.into_values().collect::<Vec<SearchResult>>())
.collect::<Vec<SearchResult>>();
flat.sort_by(|a, b| {
b.similarity()
.partial_cmp(&a.similarity())
.unwrap_or(std::cmp::Ordering::Equal)
});
flat
}
pub async fn generate_embeddings_for(arc: &WebsiteArchive) {
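The chunked-embedding change above is the heart of this diff: a document is split into fixed-size chunks, each chunk is embedded separately, and search results are regrouped per document. As a reading aid, here is a minimal, self-contained Rust sketch (not the crate's code) of the scoring arithmetic: `total_score` is the mean of the average and maximum chunk similarity, and `SearchResult::similarity` further scales it by the fraction of the document's chunks that matched.

```rust
/// Split into 2500-character chunks, mirroring `chunked` in src/ai.rs.
fn chunked(s: &str) -> Vec<String> {
    const CHUNK_SIZE: usize = 2500;
    s.chars()
        .collect::<Vec<char>>()
        .chunks(CHUNK_SIZE)
        .map(|c| c.iter().collect())
        .collect()
}

/// (avg + max) / 2, mirroring `total_score` in src/ai.rs.
fn total_score(similarities: &[f64]) -> f64 {
    let avg = similarities.iter().sum::<f64>() / similarities.len() as f64;
    let max = similarities.iter().cloned().fold(f64::MIN, f64::max);
    (avg + max) / 2.0
}

fn main() {
    // A 6000-char document yields chunks of 2500 + 2500 + 1000 chars.
    assert_eq!(chunked(&"x".repeat(6000)).len(), 3);

    // Three matching chunks out of, say, four total in the document:
    let sims = [0.8, 0.5, 0.6];
    let match_percent = sims.len() as f64 / 4.0;
    // avg = 0.6333, max = 0.8 -> total_score = 0.7167; scaled = 0.5375
    println!("{:.4}", total_score(&sims) * match_percent);
}
```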

src/archive.rs (deleted, 335 lines)

@ -1,335 +0,0 @@
use std::path::PathBuf;
use based::request::RequestContext;
use maud::html;
use crate::{blacklist::check_blacklist, favicon::download_fav_for, pages::component::render_page};
/// Read directory entries into `Vec<String>`
pub fn read_dir(dir: &PathBuf) -> Vec<String> {
let mut list = Vec::new();
if let Ok(entries) = std::fs::read_dir(dir) {
for entry in entries.flatten() {
if let Some(file_name) = entry.file_name().to_str() {
list.push(file_name.to_string());
}
}
}
list
}
/// Rewrite all URLs in `input` to the format `/s/<domain>/<path..>`
fn internalize_urls(input: &str) -> String {
let url_pattern = r"https?://([a-zA-Z0-9.-]+)(/[\w./-]*)";
let re = regex::Regex::new(url_pattern).unwrap();
re.replace_all(input, |caps: &regex::Captures| {
format!(
"/s/{}/{}",
&caps[1].trim_start_matches("www."), // Domain
&caps[2] // Path
)
})
.to_string()
}
/// Represents a directory containing archived websites
#[derive(Debug, Clone)]
pub struct WebsiteArchive {
pub dir: PathBuf,
}
/// Represents a domain within the website archive
pub struct Domain {
/// Domain name
pub name: String,
dir: PathBuf,
}
impl Domain {
/// Creates a new `Domain` instance.
///
/// If the domain name is not blacklisted, a directory is created.
///
/// # Parameters
/// - `name`: The name of the domain.
/// - `dir`: The directory path for the domain.
///
/// # Returns
/// A new `Domain` instance.
pub fn new(name: &str, dir: PathBuf) -> Self {
if !check_blacklist(name) {
std::fs::create_dir_all(&dir).unwrap();
}
Self {
name: name.to_string(),
dir,
}
}
/// Resolves a specific path within the domain and returns a `Document` representing it.
///
/// # Parameters
/// - `path`: The path to resolve within the domain.
///
/// # Returns
/// A `Document` instance corresponding to the given path.
pub fn path(&self, path: &str) -> Document {
Document::new(&self.name, path, self.dir.parent().unwrap().to_path_buf())
}
/// Retrieves entries and metadata for a given path within the domain.
///
/// # Parameters
/// - `path`: The path to inspect.
///
/// # Returns
/// A tuple containing:
/// - A vector of `PathEntry` instances representing the contents of the path.
/// - A boolean indicating whether the path is itself a `Document`
pub fn paths(&self, path: &str) -> (Vec<PathEntry>, bool) {
let mut base_path = self.dir.clone();
for p in path.split('/') {
base_path = base_path.join(p);
}
let dir_content = read_dir(&base_path);
let mut ret = Vec::new();
let mut is_doc = false;
for entry in dir_content {
let url_path = format!("{path}/{entry}");
if entry.starts_with("index_") && entry.ends_with(".html") {
is_doc = true;
continue;
}
ret.push(PathEntry(self.name.clone(), url_path));
}
(ret, is_doc)
}
}
/// Represents an entry within a domain's path, containing its name and URL path.
pub struct PathEntry(String, String);
impl PathEntry {
pub fn url(&self) -> String {
format!("/d/{}/{}", self.0, self.1)
}
pub fn path(&self) -> &String {
&self.1
}
}
/// Represents a document within a domain
pub struct Document {
/// The domain associated with the document.
pub domain: String,
/// The path of the document within the domain.
pub path: String,
base_dir: PathBuf,
}
impl Document {
/// Creates a new `Document` instance.
///
/// # Parameters
/// - `domain`: The domain to which the document belongs.
/// - `path`: The path of the document within the domain.
/// - `base_dir`: The base directory of the archive storage.
///
/// # Returns
/// A new `Document` instance.
pub fn new(domain: &str, path: &str, base_dir: PathBuf) -> Self {
Self {
domain: domain.to_string(),
path: path
.split('/')
.filter(|x| !x.is_empty())
.collect::<Vec<&str>>()
.join("/"),
base_dir,
}
}
/// Renders the document, returning its content as a string.
///
/// If the environment variable `$ROUTE_INTERNAL` is set to `true`, all links will be rewritten to point to internal archived routes.
///
/// # Parameters
/// - `version`: An optional version of the document to render in the format `YYYY-MM-DD`.
///
/// # Returns
/// An `Option` containing the rendered content as a string, or `None` if nothing could be rendered.
pub async fn render_local(&self, version: Option<String>) -> Option<String> {
if check_blacklist(&self.domain) {
let content = html! {
h3 { "This site is blacklisted" };
};
return Some(render_page(content, RequestContext::default()).await.1 .1);
}
let mut file_path = self.doc_dir();
let latest_version = if let Some(version) = version {
format!("index_{version}.html")
} else {
let versions = self.versions();
versions.first().cloned()?
};
file_path = file_path.join(latest_version);
let content = std::fs::read_to_string(file_path).ok()?;
if std::env::var("ROUTE_INTERNAL").unwrap_or("false".to_string()) == "true" {
Some(internalize_urls(&content))
} else {
Some(content)
}
}
/// Determines the directory where the document is stored.
///
/// # Returns
/// A `PathBuf` representing the document directory.
pub fn doc_dir(&self) -> PathBuf {
let mut file_path = self.base_dir.join(&self.domain);
for p in self.path.split('/') {
file_path = file_path.join(p);
}
file_path
}
/// Retrieves available versions of the document.
///
/// # Returns
/// A vector of strings representing the available versions of the document, sorted in descending order.
pub fn versions(&self) -> Vec<String> {
let mut res: Vec<String> = read_dir(&self.doc_dir())
.into_iter()
.filter_map(|x| {
if x.starts_with("index_") && x.ends_with(".html") {
return Some(
x.trim_start_matches("index_")
.trim_end_matches(".html")
.to_string(),
);
}
None
})
.collect();
res.sort();
res.reverse();
res
}
}
impl WebsiteArchive {
/// Creates a new `WebsiteArchive` instance.
///
/// # Parameters
/// - `dir`: The directory path where the archive will be stored.
///
/// # Returns
/// A new `WebsiteArchive` instance.
pub fn new(dir: &str) -> Self {
Self {
dir: PathBuf::from(dir),
}
}
/// Retrieves the list of domain names stored in the archive.
///
/// # Returns
/// A vector of domain names as strings.
pub fn domains(&self) -> Vec<String> {
read_dir(&self.dir)
}
/// Retrieves a `Domain` instance for a specified domain name.
///
/// # Parameters
/// - `domain`: The name of the domain to retrieve.
///
/// # Returns
/// A `Domain` instance corresponding to the specified domain.
pub fn get_domain(&self, domain: &str) -> Domain {
Domain::new(domain, self.dir.join(domain))
}
/// Archives a URL by downloading and storing its content.
///
/// If the URL does not pass the blacklist check, it will not be archived.
///
/// # Parameters
/// - `url`: The URL to archive.
///
/// This function downloads the content of the URL, processes it, and saves it to the archive.
pub async fn archive_url(&self, url: &str) {
let parsed_url = url::Url::parse(url).unwrap();
let domain = parsed_url.domain().unwrap().trim_start_matches("www.");
// Deny blacklist
if check_blacklist(domain) {
return;
}
let path = parsed_url.path();
let mut folder_name = self.dir.join(domain);
download_fav_for(domain).await;
for paths in path.split('/') {
if !paths.is_empty() {
folder_name = folder_name.join(paths);
}
}
std::fs::create_dir_all(&folder_name).unwrap();
let timestamp = chrono::Utc::now().format("%Y-%m-%d").to_string();
let filename = folder_name.join(format!("index_{timestamp}.html"));
log::info!("Archiving {url} to {}", filename.to_str().unwrap());
run_command(&[
"monolith",
"-I",
"-o",
filename.to_str().unwrap(),
&format!("https://{}/{}", domain, path),
]);
}
}
// full text search
// add new sites?
// transparent auto page downloading
// redownload after threshold
fn run_command(cmd: &[&str]) {
let mut cmd_setup = std::process::Command::new(cmd[0]);
let cmd_setup = cmd_setup
.args(cmd.iter().skip(1).collect::<Vec<_>>())
.stdout(std::process::Stdio::inherit())
.stderr(std::process::Stdio::inherit());
let child = cmd_setup.spawn().unwrap();
let status = child.wait_with_output().unwrap();
assert!(status.status.success());
}

src/archive/document.rs (new file, 126 lines)

@ -0,0 +1,126 @@
use std::{io::Read, path::PathBuf};
use based::request::RequestContext;
use maud::html;
use crate::{blacklist::check_blacklist, conf::get_config, render_page};
use super::{internalize_urls, read_dir};
/// Represents a document within a domain
pub struct Document {
/// The domain associated with the document.
pub domain: String,
/// The path of the document within the domain.
pub path: String,
base_dir: PathBuf,
}
impl Document {
/// Creates a new `Document` instance.
///
/// # Parameters
/// - `domain`: The domain to which the document belongs.
/// - `path`: The path of the document within the domain.
/// - `base_dir`: The base directory of the archive storage.
///
/// # Returns
/// A new `Document` instance.
pub fn new(domain: &str, path: &str, base_dir: PathBuf) -> Self {
let split = path
.split('/')
.filter(|x| !x.is_empty())
.collect::<Vec<&str>>();
Self {
domain: domain.to_string(),
path: if split.is_empty() {
"/".to_string()
} else {
split.join("/")
},
base_dir,
}
}
/// Renders the document, returning its content as a string.
///
/// If the config option `ROUTE_INTERNAL` is set to `true`, all links will be rewritten to point to internal archived routes.
///
/// # Parameters
/// - `version`: An optional version of the document to render in the format `YYYY-MM-DD`.
///
/// # Returns
/// An `Option` containing the rendered content as a string, or `None` if nothing could be rendered.
pub async fn render_local(&self, version: Option<String>) -> Option<String> {
if check_blacklist(&self.domain) {
let content = html! {
h3 { "This site is blacklisted" };
};
return Some(render_page(content, RequestContext::default()).await.1 .1);
}
let mut file_path = self.doc_dir();
let latest_version = if let Some(version) = version {
format!("index_{version}.html")
} else {
let versions = self.versions();
let version = versions.first().cloned()?;
format!("index_{version}.html")
};
file_path = file_path.join(latest_version);
let mut buf = Vec::new();
std::fs::File::open(file_path)
.ok()?
.read_to_end(&mut buf)
.unwrap();
let content = String::from_utf8_lossy(&buf);
if get_config().ROUTE_INTERNAL {
Some(internalize_urls(&content, &self.domain))
} else {
Some(content.to_string())
}
}
/// Determines the directory where the document is stored.
///
/// # Returns
/// A `PathBuf` representing the document directory.
pub fn doc_dir(&self) -> PathBuf {
let mut file_path = self.base_dir.join(&self.domain);
for p in self.path.split('/').filter(|x| !x.is_empty()) {
file_path = file_path.join(p);
}
file_path
}
/// Retrieves available versions of the document.
///
/// # Returns
/// A vector of strings representing the available versions of the document, sorted in descending order.
pub fn versions(&self) -> Vec<String> {
let mut res: Vec<String> = read_dir(&self.doc_dir())
.into_iter()
.filter_map(|x| {
if x.starts_with("index_") && x.ends_with(".html") {
return Some(
x.trim_start_matches("index_")
.trim_end_matches(".html")
.to_string(),
);
}
None
})
.collect();
res.sort();
res.reverse();
res
}
}
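One detail worth noting in `versions()`: because version strings are `YYYY-MM-DD` dates with zero-padded fields, a plain lexicographic sort is also chronological, and reversing gives newest-first. A tiny sketch of that assumption:

```rust
fn main() {
    // Zero-padded YYYY-MM-DD strings sort lexicographically in
    // chronological order, so sort + reverse yields newest-first,
    // which is why versions().first() is the latest snapshot.
    let mut versions = vec!["2024-12-31", "2025-01-19", "2024-01-02"];
    versions.sort();
    versions.reverse();
    assert_eq!(versions, ["2025-01-19", "2024-12-31", "2024-01-02"]);
}
```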

src/archive/domain.rs (new file, 125 lines)

@ -0,0 +1,125 @@
use std::path::PathBuf;
use based::result::LogAndIgnore;
use crate::blacklist::check_blacklist;
use super::{read_dir, Document};
/// Represents a domain within the website archive
pub struct Domain {
/// Domain name
pub name: String,
dir: PathBuf,
}
impl Domain {
/// Creates a new `Domain` instance.
///
/// If the domain name is not blacklisted, a directory is created.
///
/// # Parameters
/// - `name`: The name of the domain.
/// - `dir`: The directory path for the domain.
///
/// # Returns
/// A new `Domain` instance.
pub fn new(name: &str, dir: PathBuf) -> Self {
if !check_blacklist(name) {
std::fs::create_dir_all(&dir)
.log_err_and_ignore(&format!("Could not create domain dir {name}"));
}
Self {
name: name.to_string(),
dir,
}
}
/// Resolves a specific path within the domain and returns a `Document` representing it.
///
/// # Parameters
/// - `path`: The path to resolve within the domain.
///
/// # Returns
/// A `Document` instance corresponding to the given path.
pub fn path(&self, path: &str) -> Document {
Document::new(&self.name, path, self.dir.parent().unwrap().to_path_buf())
}
/// Get all paths associated with the domain
pub fn all_paths(&self) -> Vec<PathEntry> {
let mut queue = self.paths("/").0;
let mut ret = Vec::new();
ret.push(PathEntry(self.name.clone(), "/".to_string()));
while let Some(el) = queue.pop() {
ret.push(el.clone());
let paths = self.paths(&el.1).0;
queue.extend(paths);
}
ret
}
/// Retrieves entries and metadata for a given path within the domain.
///
/// # Parameters
/// - `path`: The path to inspect.
///
/// # Returns
/// A tuple containing:
/// - A vector of `PathEntry` instances representing the contents of the path.
/// - A boolean indicating whether the path is itself a `Document`
pub fn paths(&self, path: &str) -> (Vec<PathEntry>, bool) {
let mut base_path = self.dir.clone();
for p in path.split('/') {
base_path = base_path.join(p);
}
let path = path
.split("/")
.filter(|x| !x.is_empty())
.collect::<Vec<&str>>()
.join("/");
let dir_content = read_dir(&base_path);
let mut ret = Vec::new();
let mut is_doc = false;
for entry in dir_content {
let url_path = format!("{path}/{entry}");
let url_path = url_path
.split("/")
.filter(|x| !x.is_empty())
.collect::<Vec<&str>>()
.join("/");
if entry.starts_with("index_") && entry.ends_with(".html") {
is_doc = true;
continue;
}
ret.push(PathEntry(self.name.clone(), url_path));
}
(ret, is_doc)
}
}
/// Represents an entry within a domain's path, containing its name and URL path.
#[derive(Debug, Clone)]
pub struct PathEntry(String, String);
impl PathEntry {
pub fn url(&self) -> String {
format!("/d/{}/{}", self.0, self.1)
}
pub fn path(&self) -> &String {
&self.1
}
}
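`all_paths` walks the archive tree iteratively: seed the queue with the children of `/`, then pop and expand until the queue is empty. Since `Vec::pop` takes from the back, this is a depth-first walk and ordering is incidental. A standalone sketch, with a hypothetical closure standing in for `Domain::paths`:

```rust
// Sketch of the all_paths traversal over a hypothetical domain.
fn all_paths(children: impl Fn(&str) -> Vec<String>) -> Vec<String> {
    let mut queue = children("/");
    let mut ret = vec!["/".to_string()];
    while let Some(el) = queue.pop() {
        ret.push(el.clone());
        queue.extend(children(&el));
    }
    ret
}

fn main() {
    // Hypothetical domain with two top-level paths, one nested child.
    let children = |p: &str| match p {
        "/" => vec!["wiki".to_string(), "assets".to_string()],
        "wiki" => vec!["wiki/Website".to_string()],
        _ => vec![],
    };
    println!("{:?}", all_paths(children));
    // ["/", "assets", "wiki", "wiki/Website"]
}
```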

src/archive/mod.rs (new file, 232 lines)

@ -0,0 +1,232 @@
use std::{collections::HashSet, path::PathBuf};
use crate::{
blacklist::{check_blacklist, check_blacklist_path},
conf::get_config,
favicon::download_fav_for,
};
mod document;
mod domain;
pub use document::Document;
pub use domain::*;
/// Read directory entries into `Vec<String>`
pub fn read_dir(dir: &PathBuf) -> Vec<String> {
let mut list = Vec::new();
if let Ok(entries) = std::fs::read_dir(dir) {
for entry in entries.flatten() {
if let Some(file_name) = entry.file_name().to_str() {
list.push(file_name.to_string());
}
}
}
list
}
/// Rewrite all URLs in `input` to the format `/s/<domain>/<path..>`
fn internalize_urls(input: &str, base: &str) -> String {
let url_pattern = r#"(\ |"|')(?:(<?)(https?:\/\/([a-zA-Z0-9.-]+))?(\/[\w./-]*))"#;
let re = regex::Regex::new(url_pattern).unwrap();
re.replace_all(input, |caps: &regex::Captures| {
if caps.get(2).map(|x| x.as_str()).unwrap_or_default() == "<" {
return caps.get(0).unwrap().as_str().to_string();
}
if caps.get(0).unwrap().as_str() == " //" {
return " //".to_string();
}
let wrap = caps.get(1).map(|x| x.as_str()).unwrap_or_default();
if let Some(domain) = caps.get(3) {
let domain = domain.as_str();
let (protocol, domain) = if domain.starts_with("https://") {
("https", domain.trim_start_matches("https://"))
} else {
("http", domain.trim_start_matches("http://"))
};
let domain = domain.trim_start_matches("www.");
let path = caps.get(5).map_or("", |m| m.as_str());
// Skip transformation if the domain is in the blacklist
if check_blacklist(domain) {
format!("{wrap}{protocol}://{domain}{path}")
} else {
format!("{wrap}/s/{domain}{path}")
}
} else if let Some(path) = caps.get(5) {
// Handle relative paths
format!("{wrap}/s/{base}{}", path.as_str())
} else {
// Default fallback
caps[0].to_string()
}
})
.to_string()
}
/// Extract all domains
pub fn extract_domains(input: &str) -> Vec<String> {
let url_pattern = r"https?://([a-zA-Z0-9.-]+)(/[\w./-]*)?";
let re = regex::Regex::new(url_pattern).unwrap();
let mut domains = HashSet::new();
for caps in re.captures_iter(input) {
let domain = caps[1].trim_start_matches("www.");
domains.insert(domain.to_string());
}
let mut domains: Vec<_> = domains.into_iter().collect();
domains.sort();
domains
}
/// Represents a directory containing archived websites
#[derive(Debug, Clone)]
pub struct WebsiteArchive {
pub dir: PathBuf,
}
impl WebsiteArchive {
/// Creates a new `WebsiteArchive` instance.
///
/// # Parameters
/// - `dir`: The directory path where the archive will be stored.
///
/// # Returns
/// A new `WebsiteArchive` instance.
pub fn new(dir: &str) -> Self {
Self {
dir: PathBuf::from(dir),
}
}
/// Retrieves the list of domain names stored in the archive.
///
/// # Returns
/// A vector of domain names as strings.
pub fn domains(&self) -> Vec<String> {
read_dir(&self.dir)
}
/// Retrieves a `Domain` instance for a specified domain name.
///
/// # Parameters
/// - `domain`: The name of the domain to retrieve.
///
/// # Returns
/// A `Domain` instance corresponding to the specified domain.
pub fn get_domain(&self, domain: &str) -> Domain {
Domain::new(domain, self.dir.join(domain))
}
/// Archives a URL by downloading and storing its content.
///
/// If the URL does not pass the blacklist check, it will not be archived.
///
/// # Parameters
/// - `url`: The URL to archive.
///
/// This function downloads the content of the URL, processes it, and saves it to the archive.
pub async fn archive_url(&self, url: &str) {
let parsed_url = url::Url::parse(url).unwrap();
let domain = parsed_url.domain().unwrap().trim_start_matches("www.");
// Deny blacklist
if check_blacklist(domain) {
return;
}
let path = parsed_url.path();
if check_blacklist_path(domain, path) {
return;
}
let mut folder_name = self.dir.join(domain);
download_fav_for(domain).await;
for paths in path.split('/') {
let paths = url_escape::decode(paths).to_string();
if !paths.is_empty() {
folder_name = folder_name.join(paths);
}
}
std::fs::create_dir_all(&folder_name).unwrap();
let timestamp = chrono::Utc::now().format("%Y-%m-%d").to_string();
let filename = folder_name.join(format!("index_{timestamp}.html"));
log::info!("Archiving {url} to {}", filename.to_str().unwrap());
let conf = get_config()
.get_domain_config(domain)
.cloned()
.unwrap_or_default();
let mut cmd = vec!["monolith", "--isolate", "-o", filename.to_str().unwrap()];
if conf.no_audio.unwrap_or_default() {
cmd.push("--no-audio");
}
if conf.no_css.unwrap_or_default() {
cmd.push("--no-css");
}
if conf.no_frames.unwrap_or_default() {
cmd.push("--no-frames");
}
if conf.no_fonts.unwrap_or_default() {
cmd.push("--no-fonts");
}
if conf.no_image.unwrap_or_default() {
cmd.push("--no-images");
}
if conf.no_javascript.unwrap_or_default() {
cmd.push("--no-js");
cmd.push("--unwrap-noscript");
}
if conf.no_video.unwrap_or_default() {
cmd.push("--no-video");
}
if let Some(ua) = &conf.user_agent {
cmd.push("--user-agent");
cmd.push(ua.as_str());
}
let mut url = url::Url::parse(&format!("https://{domain}")).unwrap();
url = url.join(path).unwrap();
let url = url.to_string();
cmd.push(&url);
run_command(&cmd);
}
}
fn run_command(cmd: &[&str]) {
let mut cmd_setup = std::process::Command::new(cmd[0]);
let cmd_setup = cmd_setup
.args(cmd.iter().skip(1).collect::<Vec<_>>())
.stdout(std::process::Stdio::inherit())
.stderr(std::process::Stdio::inherit());
let child = cmd_setup.spawn().unwrap();
let status = child.wait_with_output().unwrap();
assert!(status.status.success());
}
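The URL internalization above grew several cases (angle-bracket guards, protocol-relative `//`, blacklist passthrough, relative paths). Below is a reduced, self-contained sketch of just the absolute-URL rewrite, close to the original single-pattern version this replaces; `example.com` is a hypothetical input and the `regex` crate is assumed:

```rust
fn internalize(input: &str) -> String {
    // Rewrite https://www.example.com/a/b into /s/example.com/a/b,
    // dropping the protocol and a leading "www.".
    let re = regex::Regex::new(r"https?://([a-zA-Z0-9.-]+)(/[\w./-]*)").unwrap();
    re.replace_all(input, |caps: &regex::Captures| {
        format!("/s/{}{}", caps[1].trim_start_matches("www."), &caps[2])
    })
    .to_string()
}

fn main() {
    let html = r#"<a href="https://www.example.com/wiki/Website">x</a>"#;
    assert_eq!(
        internalize(html),
        r#"<a href="/s/example.com/wiki/Website">x</a>"#
    );
}
```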

src/args.rs (new file, 61 lines)

@ -0,0 +1,61 @@
use clap::{arg, command};
pub fn get_args() -> clap::ArgMatches {
command!()
.about("Web Archive")
.arg(
arg!(-d --dir <dir> "Web archive directory")
.required(false)
.default_value("./websites"),
)
.subcommand(
command!()
.name("serve")
.about("Start web archive server")
.arg(
arg!(-c --config <config> "Web archive config file")
.required(false)
.default_value("./config.toml"),
),
)
.subcommand(
command!()
.name("archive")
.about("Work with web archives")
.subcommand(
command!()
.name("download")
.about("Download a new URL into the archive")
.arg(
arg!(-c --config <config> "Web archive config file")
.required(false)
.default_value("./config.toml"),
)
.arg(arg!([URL] "The URL to download").required(true))
)
.subcommand(
command!()
.name("list")
.about("List domains contained in the archive. If a domain is provided all paths of this domain will be listed.")
.arg(arg!([DOMAIN] "A domain to list").required(false))
.arg(arg!(-j --json "Output JSON").required(false)),
)
.subcommand(
command!()
.name("versions")
.about("List saved versions of a document")
.arg(arg!(-j --json "Output JSON").required(false))
.arg(arg!([DOMAIN] "A domain").required(true))
.arg(arg!([PATH] "A path").required(false))
)
.subcommand(
command!()
.name("get")
.about("Get a saved document")
.arg(arg!(--md "Output Markdown").required(false))
.arg(arg!([DOMAIN] "A domain").required(true))
.arg(arg!([PATH] "A path").required(false))
.arg(arg!([VERSION] "A version").required(false))
))
.get_matches()
}

src/blacklist.rs

@ -1,19 +1,43 @@
use crate::conf::get_config;
/// Checks if a domain is present in the blacklist of unwanted domains.
///
/// This function checks the `BLACKLIST_DOMAINS` list from the config for regular expressions to match against.
/// If a match is found, it immediately returns `true`. Otherwise, it returns `false`.
pub fn check_blacklist(domain: &str) -> bool {
let blacklist_raw = std::env::var("BLACKLIST_DOMAINS").unwrap_or_default();
let conf = get_config();
let conf = conf.websites.as_ref();
if blacklist_raw.is_empty() {
return false;
// TODO : Block IPs
// TODO : Test SSRF
let blacklisted_domains = conf
.map(|x| x.BLACKLIST_DOMAINS.as_ref())
.unwrap_or_default();
check_regex(domain, blacklisted_domains.unwrap_or(&Vec::new()))
}
pub fn check_blacklist_path(domain: &str, path: &str) -> bool {
let conf = get_config();
let conf = conf.websites.as_ref();
if let Some(website) = conf {
let empty = Vec::new();
let domain_conf = website.domains.as_ref().unwrap_or(&empty);
if let Some(domain_conf) = domain_conf.iter().find(|x| x.domain == domain) {
let empty = Vec::new();
let blacklist = domain_conf.blacklist_paths.as_ref().unwrap_or(&empty);
return check_regex(path, blacklist);
}
}
let blacklist: Vec<&str> = blacklist_raw.split(',').collect();
false
}
for domain_regex in blacklist {
let rgx = regex::Regex::new(domain_regex).unwrap();
if rgx.is_match(domain) {
pub fn check_regex(input: &str, regexes: &Vec<String>) -> bool {
for regex in regexes {
let rgx = regex::Regex::new(regex).unwrap();
if rgx.is_match(input) {
return true;
}
}
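Blacklist entries are plain regexes matched against the domain (or, for `blacklist_paths`, against the path), so anchoring matters: `^gitlab` only blocks domains that start with `gitlab`, while `youtube` blocks anything containing it. A small sketch of that semantics (std plus the `regex` crate; the domains are hypothetical):

```rust
fn check_regex(input: &str, regexes: &[String]) -> bool {
    regexes
        .iter()
        .any(|r| regex::Regex::new(r).unwrap().is_match(input))
}

fn main() {
    let blacklist = vec!["^gitlab".to_string(), "youtube".to_string()];
    assert!(check_regex("gitlab.com", &blacklist)); // anchored prefix
    assert!(check_regex("www.youtube.com", &blacklist)); // substring match
    assert!(!check_regex("example.com", &blacklist));
}
```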

src/conf.rs (new file, 107 lines)

@ -0,0 +1,107 @@
use std::sync::Arc;
use serde::Deserialize;
use tokio::sync::OnceCell;
pub static CONFIG: OnceCell<Arc<Config>> = OnceCell::const_new();
/// Get a reference to global config
pub fn get_config() -> &'static Arc<Config> {
crate::conf::CONFIG.get().unwrap()
}
/// Load a global config
pub fn load_config(path: &str) {
// TODO : Other load locations
if let Ok(file_content) = std::fs::read_to_string(path) {
let conf: Config =
toml::from_str(&file_content).expect("Could not deserialize config file");
crate::conf::CONFIG.set(std::sync::Arc::new(conf)).unwrap();
}
}
/// Load a default global config
pub fn load_default_config() {
if crate::conf::CONFIG.get().is_none() {
crate::conf::CONFIG
.set(std::sync::Arc::new(Config::default()))
.unwrap();
}
}
#[allow(non_snake_case)]
#[derive(Debug, Clone, Deserialize)]
pub struct Config {
pub ROUTE_INTERNAL: bool,
pub DOWNLOAD_ON_DEMAND: bool,
pub ai: Option<AIConfig>,
pub websites: Option<WebsiteConfig>,
}
impl Config {
pub fn get_domain_config(&self, domain: &str) -> Option<&DomainConfig> {
if let Some(websites) = &self.websites {
if let Some(domains) = &websites.domains {
let domain = domains.iter().find(|x| x.domain == domain);
return domain;
}
}
None
}
}
#[allow(non_snake_case)]
#[derive(Debug, Clone, Deserialize)]
pub struct AIConfig {
pub OLLAMA_URL: String,
}
#[allow(non_snake_case)]
#[derive(Debug, Clone, Deserialize)]
pub struct WebsiteConfig {
pub BLACKLIST_DOMAINS: Option<Vec<String>>,
pub domains: Option<Vec<DomainConfig>>,
}
#[derive(Debug, Clone, Deserialize)]
pub struct DomainConfig {
pub domain: String,
pub blacklist_paths: Option<Vec<String>>,
pub no_audio: Option<bool>,
pub no_video: Option<bool>,
pub no_image: Option<bool>,
pub no_css: Option<bool>,
pub no_javascript: Option<bool>,
pub no_fonts: Option<bool>,
pub no_frames: Option<bool>,
pub user_agent: Option<String>,
}
impl Default for DomainConfig {
fn default() -> Self {
Self {
domain: String::new(),
blacklist_paths: None,
no_audio: Some(false),
no_video: Some(false),
no_image: Some(false),
no_css: Some(false),
no_javascript: Some(false),
no_fonts: Some(false),
no_frames: Some(false),
user_agent: None,
}
}
}
impl Default for Config {
fn default() -> Self {
Self {
ROUTE_INTERNAL: false,
DOWNLOAD_ON_DEMAND: false,
ai: None,
websites: None,
}
}
}
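To illustrate how the TOML in config.toml maps onto these structs, here is a trimmed sketch of the deserialization path and the `get_domain_config` lookup. It uses a reduced field set and assumes the `serde` (with derive) and `toml` crates from Cargo.toml:

```rust
use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct DomainConfig {
    domain: String,
    no_javascript: Option<bool>,
}

#[derive(Debug, Deserialize)]
struct WebsiteConfig {
    domains: Option<Vec<DomainConfig>>,
}

#[derive(Debug, Deserialize)]
struct Config {
    websites: Option<WebsiteConfig>,
}

fn main() {
    let src = r#"
        [[websites.domains]]
        domain = "github.com"
        no_javascript = true
    "#;
    let conf: Config = toml::from_str(src).unwrap();
    // Equivalent of Config::get_domain_config("github.com"):
    let dc = conf
        .websites
        .and_then(|w| w.domains)
        .and_then(|d| d.into_iter().find(|x| x.domain == "github.com"))
        .unwrap();
    assert_eq!(dc.no_javascript, Some(true));
}
```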

src/lib.rs (new file, 29 lines)

@ -0,0 +1,29 @@
use based::{
page::Shell,
request::{RequestContext, StringResponse},
};
use maud::{html, PreEscaped};
pub mod ai;
pub mod archive;
pub mod blacklist;
pub mod conf;
pub mod favicon;
pub async fn render_page(content: PreEscaped<String>, ctx: RequestContext) -> StringResponse {
based::page::render_page(
content,
"Website Archive",
ctx,
&Shell::new(
html! {
script src="https://cdn.tailwindcss.com" {};
meta name="viewport" content="width=device-width, initial-scale=1.0" {};
script src="/assets/htmx.min.js" {};
},
html! {},
Some("bg-zinc-950 text-white min-h-screen flex pt-8 justify-center".to_string()),
),
)
.await
}

src/main.rs

@ -1,46 +1,170 @@
use ai::EmbedStore;
use archive::WebsiteArchive;
use based::get_pg;
use rocket::routes;
use webarc::ai::EmbedStore;
use webarc::archive::WebsiteArchive;
use webarc::conf::{get_config, load_config, load_default_config};
mod ai;
mod archive;
mod blacklist;
mod favicon;
mod args;
mod pages;
#[rocket::launch]
async fn launch() -> _ {
#[tokio::main]
async fn main() {
env_logger::init();
let arc = WebsiteArchive::new("./websites");
let args = args::get_args();
if std::env::var("DATABASE_URL").is_ok() {
let pg = get_pg!();
sqlx::migrate!("./migrations").run(pg).await.unwrap();
let archive_dir: &String = args.get_one("dir").unwrap();
match args.subcommand() {
Some(("serve", serve_args)) => {
let config: &String = serve_args.get_one("config").unwrap();
load_config(config);
let arc = WebsiteArchive::new(archive_dir);
if std::env::var("DATABASE_URL").is_ok() {
let pg = get_pg!();
sqlx::migrate!("./migrations").run(pg).await.unwrap();
}
let archive = arc.clone();
if get_config().ai.is_some() {
tokio::spawn(async move {
EmbedStore::generate_embeddings_for(&archive).await;
});
}
let archive = arc.clone();
tokio::spawn(async move {
webarc::favicon::download_favicons_for_sites(&archive.domains()).await;
});
rocket::build()
.mount(
"/",
routes![
based::htmx::htmx_script_route,
pages::index,
pages::render_website,
pages::domain_info_route,
pages::favicon_route,
pages::vector_search,
pages::render_txt_website
],
)
.manage(arc)
.launch()
.await
.unwrap();
}
Some(("archive", archive_args)) => {
let arc = WebsiteArchive::new(archive_dir);
match archive_args.subcommand() {
Some(("list", list_args)) => {
let json = list_args.get_flag("json");
load_default_config();
let elements = if let Some(domain) = list_args.get_one::<String>("DOMAIN") {
arc.get_domain(domain)
.all_paths()
.into_iter()
.map(|x| x.path().clone())
.collect()
} else {
arc.domains()
};
if json {
println!(
"{}",
serde_json::to_string(&serde_json::json!(elements)).unwrap()
);
} else {
if let Some(domain) = list_args.get_one::<String>("DOMAIN") {
println!("Paths in {domain}:");
} else {
println!("Domains in {}:", archive_dir);
}
if elements.is_empty() {
println!("No domains");
}
for d in elements {
println!("- {d}");
}
}
}
Some(("download", dl_args)) => {
let url: &String = dl_args.get_one("URL").unwrap();
let config: &String = dl_args.get_one("config").unwrap();
load_config(config);
arc.archive_url(url).await;
println!("Saved {url} to archive");
}
Some(("versions", ver_args)) => {
load_default_config();
let domain: &String = ver_args.get_one("DOMAIN").unwrap();
let path: String = if let Some(path) = ver_args.get_one::<String>("PATH") {
path.clone()
} else {
"/".to_string()
};
let versions = arc.get_domain(domain).path(&path).versions();
let json = ver_args.get_flag("json");
if json {
println!("{}", serde_json::to_string(&versions).unwrap());
} else {
println!("Versions for {domain} / {path}:");
for v in versions {
println!("- {v}");
}
}
}
Some(("get", get_args)) => {
load_default_config();
let domain: &String = get_args.get_one("DOMAIN").unwrap();
let path = if let Some(path) = get_args.get_one::<String>("PATH") {
path.clone()
} else {
"/".to_string()
};
let doc = arc.get_domain(domain).path(&path);
let ver = if let Some(ver) = get_args.get_one::<String>("VERSION") {
ver.clone()
} else {
doc.versions().first().unwrap().clone()
};
let md = get_args.get_flag("md");
let content = doc.render_local(Some(ver)).await;
if content.is_none() {
println!("No document found");
std::process::exit(1);
}
if md {
let markdown = html2md::parse_html(&content.unwrap());
println!("{markdown}");
} else {
println!("{}", content.unwrap());
}
}
Some((&_, _)) => {}
None => {}
};
}
Some((&_, _)) => {}
None => {}
}
if std::env::var("OLLAMA_URL").is_ok() {
EmbedStore::generate_embeddings_for(&arc).await;
}
let archive = arc.clone();
tokio::spawn(async move {
favicon::download_favicons_for_sites(&archive.domains()).await;
});
rocket::build()
.mount(
"/",
routes![
based::htmx::htmx_script_route,
pages::index,
pages::render_website,
pages::domain_info_route,
pages::favicon_route,
pages::vector_search
],
)
.manage(arc)
}

src/pages/component.rs

@ -1,7 +1,3 @@
use based::{
page::Shell,
request::{RequestContext, StringResponse},
};
use maud::{html, PreEscaped};
/// Generates an SVG arrow icon with the specified color.
@ -78,20 +74,8 @@ pub fn gen_path_header(
}
}
pub async fn render_page(content: PreEscaped<String>, ctx: RequestContext) -> StringResponse {
based::page::render_page(
content,
"Website Archive",
ctx,
&Shell::new(
html! {
script src="https://cdn.tailwindcss.com" {};
meta name="viewport" content="width=device-width, initial-scale=1.0" {};
script src="/assets/htmx.min.js" {};
},
html! {},
Some("bg-zinc-950 text-white min-h-screen flex pt-8 justify-center".to_string()),
),
)
.await
pub fn favicon(site: &str) -> PreEscaped<String> {
html! {
img class="h-8 w-8 m-2" src=(format!("/favicon/{site}")) {};
}
}

src/pages/mod.rs

@ -7,18 +7,20 @@ use based::{
},
};
use maud::{html, PreEscaped};
use rocket::{get, State};
use rocket::{get, request::FromSegments, State};
pub mod component;
use component::*;
use serde_json::json;
use crate::{
ai::{generate_embedding, DocEmbedding, EmbedStore},
archive::WebsiteArchive,
use webarc::{
ai::{generate_embedding, remove_data_urls, EmbedStore, SearchResult},
archive::{extract_domains, WebsiteArchive},
conf::get_config,
render_page,
};
const SEARCH_BAR_STYLE: &'static str = "w-full px-4 mb-4 py-2 text-white bg-black border-2 border-neon-blue placeholder-neon-blue focus:ring-2 focus:ring-neon-pink focus:outline-none font-mono text-lg";
const SEARCH_BAR_STYLE: &str = "w-full px-4 mb-4 py-2 text-white bg-black border-2 border-neon-blue placeholder-neon-blue focus:ring-2 focus:ring-neon-pink focus:outline-none font-mono text-lg";
/// Get the favicon of a domain
#[get("/favicon/<domain>")]
@ -29,6 +31,8 @@ pub async fn favicon_route(domain: &str) -> Option<DataResponse> {
.read_to_end(&mut buf)
.ok()?;
// TODO : Default favicon
Some(DataResponse::new(
buf,
"image/x-icon".to_string(),
@ -57,7 +61,7 @@ pub async fn index(ctx: RequestContext, arc: &State<WebsiteArchive>) -> StringRe
@for site in websites {
a href=(format!("/d/{site}")) class="bg-neutral-900 shadow-md rounded-lg hover:bg-neutral-800 bg-gray-1 hover:cursor-pointer transition-all duration-300 flex flex-col items-center justify-center aspect-square max-w-60" {
div class="bg-blue-500 text-white rounded-full p-4" {
img class="h-8 w-8" src=(format!("/favicon/{site}")) {};
(favicon(&site))
};
p class="mt-4 text-base font-medium" { (site) };
};
@ -84,7 +88,7 @@ pub async fn domain_info_route(
let (path_entries, is_doc) = domain.paths(paths.to_str().unwrap());
let path_seperations: Vec<&str> = paths.to_str().unwrap().split('/').collect();
// TODO : Show domains being linked on the page
let domains = extract_domains(&document.render_local(None).await.unwrap_or_default());
let content = html! {
h2 class="text-xl font-bold mb-4 flex items-center" {
@ -130,20 +134,89 @@ pub async fn domain_info_route(
};
};
};
@if !domains.is_empty() {
div class="max-w-md mx-auto p-4 bg-neutral-900 rounded-lg shadow-md" {
h3 class="font-bold mb-2" { "Domains linked on this page:" };
ul class="space-y-2 p-4" {
@for domain in domains {
a href=(format!("/d/{domain}")) class="flex items-center gap-2 p-3 border bg-neutral-800 rounded hover:shadow-lg transition" {
(favicon(&domain));
span class="font-medium" { (domain) };
};
};
};
};
};
};
render_page(content, ctx).await
}
#[get("/txt/<domain>/<path..>?<time>&<no_data_urls>")]
pub async fn render_txt_website(
domain: &str,
path: PathBuf,
time: Option<&str>,
no_data_urls: Option<&str>,
arc: &State<WebsiteArchive>,
) -> Option<String> {
let document = arc.get_domain(domain).path(path.to_str().unwrap());
let mut content = document
.render_local(time.map(|time| time.to_string()))
.await?;
if no_data_urls.is_some() {
content = remove_data_urls(&content);
}
Some(html2md::parse_html(&content))
}
pub struct PathSegment {
segments: Vec<String>,
}
impl PathSegment {
pub fn to_str(&self) -> String {
self.segments.join("/")
}
}
impl<'r> FromSegments<'r> for PathSegment {
type Error = ();
fn from_segments(
segments: rocket::http::uri::Segments<'r, rocket::http::uri::fmt::Path>,
) -> Result<Self, Self::Error> {
let paths: Vec<_> = segments
.filter_map(|x| {
if x == "." {
return None;
}
if x == ".." {
return None;
}
Some(x.to_string())
})
.collect();
Ok(PathSegment { segments: paths })
}
}
/// Return archived version of `domain` / `path` at `time`
#[get("/s/<domain>/<path..>?<time>")]
pub async fn render_website(
domain: &str,
path: PathBuf,
path: PathSegment,
time: Option<&str>,
arc: &State<WebsiteArchive>,
) -> Option<DataResponse> {
let document = arc.get_domain(domain).path(path.to_str().unwrap());
let document = arc.get_domain(domain).path(&path.to_str());
let content = document
.render_local(time.map(|time| time.to_string()))
@ -155,13 +228,8 @@ pub async fn render_website(
"text/html".to_string(),
Some(60 * 60 * 24),
));
} else if std::env::var("DOWNLOAD_ON_DEMAND")
.unwrap_or("false".to_string())
.as_str()
== "true"
&& time.is_none()
{
arc.archive_url(&format!("https://{domain}/{}", path.to_str().unwrap()))
} else if get_config().DOWNLOAD_ON_DEMAND && time.is_none() {
arc.archive_url(&format!("https://{domain}/{}", path.to_str()))
.await;
let content = document.render_local(None).await?;
@ -176,17 +244,17 @@ pub async fn render_website(
None
}
pub fn gen_search_element(x: &DocEmbedding) -> PreEscaped<String> {
pub fn gen_search_element(x: &SearchResult) -> PreEscaped<String> {
html! {
div class="text-xl font-bold mt-4 p-4 flex items-center w-full max-w-4xl max-h-40 mx-auto bg-neutral-800 shadow-md rounded-lg overflow-hidden border border-neutral-900 hover:cursor-pointer"
hx-get=(format!("/d/{}/{}", x.domain, x.path))
hx-target="#main_content" hx-push-url="true" hx-swap="innerHTML"
{
img class="p-2" src=(format!("/favicon/{}", &x.domain));
(favicon(&x.domain))
a { (x.domain) };
(slash_seperator());
(gen_path_header(x.path.split('/').collect(), &x.domain, false));
p class="font-bold p-2 text-stone-400" { (format!("{:.2} %", x.similarity * 100.0)) };
p class="font-bold p-2 text-stone-400" { (format!("{:.2} % [{} matches]", x.similarity() * 100.0, x.chunks.len())) };
};
}
}
@ -197,9 +265,7 @@ pub async fn vector_search(
page: Option<i64>,
ctx: RequestContext,
) -> Option<StringResponse> {
if std::env::var("OLLAMA_URL").is_err() {
return None;
}
get_config().ai.as_ref()?;
let page = page.unwrap_or(1);
@ -220,13 +286,14 @@ pub async fn vector_search(
EmbedStore::search_vector(&input, limit as i64, offset as i64).await
})
},
5,
1500,
)
.pager(page as u64, vector)
.await;
// API Route
if query.ends_with(".json") {
// TODO : Better search API
return Some(respond_json(&json!(&results.page(page as u64))));
}