parent 0f6e5f5b10
commit 8df8edeeca
15 changed files with 591 additions and 124 deletions
115 Cargo.lock (generated)
@@ -59,6 +59,55 @@ dependencies = [
 "libc",
]

[[package]]
name = "anstream"
version = "0.6.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b"
dependencies = [
 "anstyle",
 "anstyle-parse",
 "anstyle-query",
 "anstyle-wincon",
 "colorchoice",
 "is_terminal_polyfill",
 "utf8parse",
]

[[package]]
name = "anstyle"
version = "1.0.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9"

[[package]]
name = "anstyle-parse"
version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9"
dependencies = [
 "utf8parse",
]

[[package]]
name = "anstyle-query"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c"
dependencies = [
 "windows-sys 0.59.0",
]

[[package]]
name = "anstyle-wincon"
version = "3.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125"
dependencies = [
 "anstyle",
 "windows-sys 0.59.0",
]

[[package]]
name = "async-stream"
version = "0.3.6"

@@ -313,6 +362,52 @@ dependencies = [
 "inout",
]

[[package]]
name = "clap"
version = "4.5.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3135e7ec2ef7b10c6ed8950f0f792ed96ee093fa088608f1c76e569722700c84"
dependencies = [
 "clap_builder",
 "clap_derive",
]

[[package]]
name = "clap_builder"
version = "4.5.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "30582fc632330df2bd26877bde0c1f4470d57c582bbc070376afcd04d8cb4838"
dependencies = [
 "anstream",
 "anstyle",
 "clap_lex",
 "strsim",
]

[[package]]
name = "clap_derive"
version = "4.5.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab"
dependencies = [
 "heck",
 "proc-macro2",
 "quote",
 "syn 2.0.93",
]

[[package]]
name = "clap_lex"
version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6"

[[package]]
name = "colorchoice"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990"

[[package]]
name = "combine"
version = "4.6.7"

@@ -1351,6 +1446,12 @@ dependencies = [
 "windows-sys 0.52.0",
]

[[package]]
name = "is_terminal_polyfill"
version = "1.70.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"

[[package]]
name = "itoa"
version = "1.0.14"

@@ -2942,6 +3043,12 @@ dependencies = [
 "unicode-properties",
]

[[package]]
name = "strsim"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"

[[package]]
name = "subtle"
version = "2.6.1"

@@ -3480,6 +3587,12 @@ version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"

[[package]]
name = "utf8parse"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"

[[package]]
name = "uuid"
version = "1.11.0"

@@ -3622,6 +3735,7 @@ version = "0.1.0"
dependencies = [
 "based",
 "chrono",
 "clap",
 "env_logger",
 "futures",
 "html2md",

@@ -3636,6 +3750,7 @@ dependencies = [
 "serde_json",
 "sqlx",
 "tokio",
 "toml",
 "url",
 "uuid",
]
Cargo.toml

@@ -22,3 +22,5 @@ reqwest = "0.12.11"
ollama-rs = "0.2.2"
pgvector = { version = "0.4", features = ["sqlx"] }
html2md = "0.2.14"
clap = { version = "4.5.23", features = ["cargo", "derive"] }
toml = "0.8.19"
43 README.md
@@ -1,14 +1,47 @@
# WebArc
`webarc` is a local website archive based on [monolith](https://github.com/Y2Z/monolith).

## Configuration
You can configure the application using environment variables:
## Archive Format
A web archive is defined as a directory containing domains in this structure:

- `$ROUTE_INTERNAL` : Rewrite links to point back to the archive itself
- `$DOWNLOAD_ON_DEMAND` : Download missing routes with monolith on demand
- `$BLACKLIST_DOMAINS` : Blacklisted domains (Comma-separated regex, example: `google.com,.*.youtube.com`)
```
web_archive/
├─ domain.com/
│ ├─ sub/
│ │ ├─ path/
│ │ │ ├─ index_YYYY_MM_DD.html
├─ sub.domain.com/
```

Every document of this web archive can then be found at `archive/domain/paths.../index_YYYY_MM_DD.html`.

## Usage
webarc provides a CLI tool to work with the archive structure.

```sh
# List domains in archive
webarc [--dir ARCHIVE] archive list [-j, --json]

# List all paths on a domain
webarc [--dir ARCHIVE] archive list [-j, --json] [DOMAIN]

# List all versions of a document
webarc [--dir ARCHIVE] archive versions [-j, --json] [DOMAIN] [PATH]

# Get a document
# `--md` will return a markdown version
webarc [--dir ARCHIVE] archive get [--md] [DOMAIN] [PATH] [VERSION]

# Archive a website
webarc [--dir ARCHIVE] archive download [URL]
```

## Configuration
You can configure the application using a config file. Look at the [config.toml](config.toml) file for more information.

## Web Server
You can start a webserver serving an archive with `webarc serve`.

Archived pages can be viewed at `/s/<domain>/<path..>`.
For example, `/s/en.wikipedia.org/wiki/Website` will serve `en.wikipedia.org` at `/wiki/Website`.
@@ -1,6 +1,3 @@
# Logging
RUST_LOG=info
ROCKET_ADDRESS=0.0.0.0

# Rewrite links to point back to the archive itself
ROUTE_INTERNAL=true

@@ -8,12 +5,12 @@ ROUTE_INTERNAL=true
# Download missing routes on demand
DOWNLOAD_ON_DEMAND=true

# Blacklisted domains (Comma-separated regex)
[websites]
# You can blacklist sites which won't work well
BLACKLIST_DOMAINS="^gitlab"

# Database
DATABASE_URL=postgres://user:pass@postgres/webarc
BLACKLIST_DOMAINS = [
    "^gitlab" # All domains starting with gitlab
]

[ai]
# Ollama URL (Enables vector search)
OLLAMA_URL=127.0.0.1:11434
OLLAMA_URL="127.0.0.1:11434"
docker-compose.yml

@@ -6,7 +6,12 @@ services:
volumes:
- ./websites:/websites
- ./favicon:/favicon
env_file: env
- ./config.toml:/config.toml
environment:
- "RUST_LOG=info"
- "ROCKET_ADDRESS=0.0.0.0"
- "DATABASE_URL=postgres://user:pass@postgres/webarc"
command: "webarc serve"

postgres:
# Any Postgres with support for pgvector
@@ -8,4 +8,6 @@ CREATE TABLE doc_embedding (
    chunk INTEGER NOT NULL,
    embed_mxbai_embed_large vector(1024) NOT NULL,
    PRIMARY KEY (domain, path, ver, chunk)
)
);

CREATE INDEX ON doc_embedding USING ivfflat (embed_mxbai_embed_large vector_cosine_ops) WITH (lists = 200);
64 src/ai.rs
@@ -6,10 +6,12 @@ use serde::Serialize;
use serde_json::json;
use sqlx::FromRow;

use crate::archive::{Document, Domain, WebsiteArchive};
use crate::{
    archive::{Document, Domain, WebsiteArchive},
    conf::get_config,
};

// TODO : Chunked embeddings + better search + ranking
// TODO : Real citese embeddings + search
// TODO : Cite found chunks in search res?

#[derive(Debug, Clone, FromRow, Serialize)]
pub struct DocEmbedding {

@@ -18,6 +20,7 @@ pub struct DocEmbedding {
    pub ver: String,
    pub chunk: i32,

    #[allow(dead_code)]
    #[serde(skip)]
    embed_mxbai_embed_large: pgvector::Vector,

@@ -25,24 +28,42 @@ pub struct DocEmbedding {
    pub similarity: f64,
}

impl DocEmbedding {
    pub async fn total_chunks(&self) -> i64 {
        let res: (i64,) = sqlx::query_as(
            "SELECT MAX(chunk) FROM doc_embedding WHERE domain = $1 AND path = $2 AND ver = $3",
        )
        .bind(&self.domain)
        .bind(&self.path)
        .bind(&self.ver)
        .fetch_one(get_pg!())
        .await
        .unwrap();

        res.0
    }
}

#[derive(Debug, Clone, Serialize)]
pub struct SearchResult {
    pub domain: String,
    pub path: String,
    pub total_chunks: i64,
    pub chunks: Vec<DocEmbedding>,
}

impl SearchResult {
    pub fn new(domain: String, path: String) -> Self {
    pub fn new(domain: String, path: String, total_chunks: i64) -> Self {
        Self {
            domain,
            path,
            total_chunks,
            chunks: vec![],
        }
    }

    pub fn similarity(&self) -> f64 {
        total_score(&self.chunks)
        total_score(&self.chunks) * (self.chunks.len() as f64 / self.total_chunks as f64)
    }
}

@@ -99,6 +120,13 @@ pub fn chunked(s: &str) -> Vec<String> {
        .collect()
}

fn remove_data_urls(input: &str) -> String {
    let re = regex::Regex::new("data:(.*?)(;base64)?,(.*)").unwrap();

    // Replace all occurrences of data URLs with an empty string
    re.replace_all(input, "").to_string()
}

impl Embedding for Document {
    async fn embedding(&self, ver: Option<String>) -> Option<Vec<Vec<f32>>> {
        let latest = "latest".to_string();

@@ -110,7 +138,7 @@ impl Embedding for Document {
        );

        let content_html = self.render_local(ver.clone()).await?;
        let content = html2md::parse_html(&content_html);
        let content = remove_data_urls(&html2md::parse_html(&content_html));

        let mut embeddings = Vec::new();
        let content = chunked(&content);

@@ -133,7 +161,8 @@ impl Embedding for Document {
}

pub async fn generate_embedding(mut input: String) -> Option<Vec<f32>> {
    if let Ok(ollama_url) = std::env::var("OLLAMA_URL") {
    // TODO : Ollama load balancing
    if let Some(ollama_url) = get_config().ai.as_ref().map(|x| x.OLLAMA_URL.clone()) {
        let (host, port) = ollama_url.split_once(':')?;
        let ollama = ollama_rs::Ollama::new(format!("http://{host}"), port.parse().ok()?);

@@ -231,13 +260,10 @@ impl EmbedStore {
    }

    pub async fn search_vector(v: &pgvector::Vector, limit: i64, offset: i64) -> Vec<SearchResult> {
        // TODO : fix search
        // + new ranked algorithm
        // + better repr
        // limit should cover SearchResults not the query -> rework

        let results: Vec<DocEmbedding> = sqlx::query_as(
            "SELECT *, 1 / (1 + (embed_mxbai_embed_large <-> $1)) AS similarity FROM doc_embedding ORDER BY embed_mxbai_embed_large <-> $1 LIMIT $2 OFFSET $3",
            "SELECT *, 1 / (1 + (embed_mxbai_embed_large <-> $1)) AS similarity FROM doc_embedding ORDER BY embed_mxbai_embed_large <=> $1 LIMIT $2 OFFSET $3",
        )
        .bind(v)
        .bind(limit)

@@ -249,26 +275,24 @@ impl EmbedStore {
        let mut search_res: HashMap<String, HashMap<String, SearchResult>> = HashMap::new();

        for res in results {
            let domain = search_res
                .entry(res.domain.clone())
                .or_insert(HashMap::new());
            let doc = domain
                .entry(res.path.clone())
                .or_insert(SearchResult::new(res.domain.clone(), res.path.clone()));
            let domain = search_res.entry(res.domain.clone()).or_default();
            let doc = domain.entry(res.path.clone()).or_insert(SearchResult::new(
                res.domain.clone(),
                res.path.clone(),
                res.total_chunks().await,
            ));
            doc.chunks.push(res);
        }

        let mut flat = search_res
            .into_values()
            .map(|x| x.into_values().collect::<Vec<SearchResult>>())
            .flatten()
            .flat_map(|x| x.into_values().collect::<Vec<SearchResult>>())
            .collect::<Vec<SearchResult>>();

        flat.sort_by(|a, b| {
            b.similarity()
                .partial_cmp(&a.similarity())
                .unwrap_or(std::cmp::Ordering::Equal)
                .then(b.chunks.len().cmp(&a.chunks.len()))
        });
        flat
    }
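For reference, a minimal standalone sketch of the coverage-weighted ranking that the new `SearchResult::similarity` and the `search_vector` sort implement; it assumes `total_score` averages the per-chunk similarities, which this diff does not show:

```rust
// Hypothetical sketch, not part of the commit: rank a document by its average
// chunk similarity, scaled by how many of its chunks matched the query.
fn ranked_score(chunk_scores: &[f64], total_chunks: usize) -> f64 {
    if chunk_scores.is_empty() || total_chunks == 0 {
        return 0.0;
    }
    let avg = chunk_scores.iter().sum::<f64>() / chunk_scores.len() as f64;
    avg * (chunk_scores.len() as f64 / total_chunks as f64)
}

fn main() {
    // Two matching chunks out of 4 beat a single stronger chunk out of 10.
    println!("{:.3}", ranked_score(&[0.8, 0.7], 4)); // 0.375
    println!("{:.3}", ranked_score(&[0.9], 10)); // 0.090
}
```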
src/archive.rs

@@ -3,7 +3,7 @@ use std::{io::Read, path::PathBuf};
use based::{request::RequestContext, result::LogAndIgnore};
use maud::html;

use crate::{blacklist::check_blacklist, favicon::download_fav_for, pages::component::render_page};
use crate::{blacklist::check_blacklist, conf::get_config, favicon::download_fav_for, render_page};

/// Read directory entries into `Vec<String>`
pub fn read_dir(dir: &PathBuf) -> Vec<String> {

@@ -22,16 +22,19 @@ pub fn read_dir(dir: &PathBuf) -> Vec<String> {

/// Rewrite all URLs in `input` to the format `/s/<domain>/<path..>`
fn internalize_urls(input: &str) -> String {
    // TODO : Ignore blacklisted urls
    let url_pattern = r"https?://([a-zA-Z0-9.-]+)(/[\w./-]*)";
    let re = regex::Regex::new(url_pattern).unwrap();

    re.replace_all(input, |caps: &regex::Captures| {
        format!(
            "/s/{}/{}",
            &caps[1].trim_start_matches("www."), // Domain
            &caps[2] // Path
        )
        let domain = caps[1].trim_start_matches("www.");
        let path = &caps[2];

        // Don't transform if in blacklist
        if check_blacklist(domain) {
            return format!("https://{domain}/{path}");
        }

        format!("/s/{domain}/{path}")
    })
    .to_string()
}

@@ -82,6 +85,23 @@ impl Domain {
        Document::new(&self.name, path, self.dir.parent().unwrap().to_path_buf())
    }

    /// Get all paths associated with the domain
    pub fn all_paths(&self) -> Vec<PathEntry> {
        let mut queue = self.paths("/").0;

        let mut ret = Vec::new();

        ret.push(PathEntry(self.name.clone(), "/".to_string()));

        while let Some(el) = queue.pop() {
            ret.push(el.clone());
            let paths = self.paths(&el.1).0;
            queue.extend(paths);
        }

        ret
    }

    /// Retrieves entries and metadata for a given path within the domain.
    ///
    /// # Parameters

@@ -98,6 +118,12 @@ impl Domain {
            base_path = base_path.join(p);
        }

        let path = path
            .split("/")
            .filter(|x| !x.is_empty())
            .collect::<Vec<&str>>()
            .join("/");

        let dir_content = read_dir(&base_path);

        let mut ret = Vec::new();

@@ -106,6 +132,11 @@ impl Domain {

        for entry in dir_content {
            let url_path = format!("{path}/{entry}");
            let url_path = url_path
                .split("/")
                .filter(|x| !x.is_empty())
                .collect::<Vec<&str>>()
                .join("/");
            if entry.starts_with("index_") && entry.ends_with(".html") {
                is_doc = true;
                continue;

@@ -119,6 +150,7 @@ impl Domain {
}

/// Represents an entry within a domain's path, containing its name and URL path.
#[derive(Debug, Clone)]
pub struct PathEntry(String, String);

impl PathEntry {

@@ -203,7 +235,7 @@ impl Document {
            .unwrap();
        let content = String::from_utf8_lossy(&buf);

        if std::env::var("ROUTE_INTERNAL").unwrap_or("false".to_string()) == "true" {
        if get_config().ROUTE_INTERNAL {
            Some(internalize_urls(&content))
        } else {
            Some(content.to_string())

@@ -291,6 +323,7 @@ impl WebsiteArchive {
    ///
    /// This function downloads the content of the URL, processes it, and saves it to the archive.
    pub async fn archive_url(&self, url: &str) {
        // TODO : refactor
        let parsed_url = url::Url::parse(url).unwrap();

        let domain = parsed_url.domain().unwrap().trim_start_matches("www.");
61 src/args.rs (new file)
@@ -0,0 +1,61 @@
use clap::{arg, command};

pub fn get_args() -> clap::ArgMatches {
    command!()
        .about("Web Archive")
        .arg(
            arg!(-d --dir <dir> "Web archive directory")
                .required(false)
                .default_value("./websites"),
        )
        .subcommand(
            command!()
                .name("serve")
                .about("Start web archive server")
                .arg(
                    arg!(-c --config <config> "Web archive config file")
                        .required(false)
                        .default_value("./config.toml"),
                ),
        )
        .subcommand(
            command!()
                .name("archive")
                .about("Work with web archives")
                .subcommand(
                    command!()
                        .name("download")
                        .about("Download a new URL into the archive")
                        .arg(
                            arg!(-c --config <config> "Web archive config file")
                                .required(false)
                                .default_value("./config.toml"),
                        )
                        .arg(arg!([URL] "The URL to download").required(true))
                )
                .subcommand(
                    command!()
                        .name("list")
                        .about("List domains contained in the archive. If a domain is provided all paths of this domain will be listed.")
                        .arg(arg!([DOMAIN] "A domain to list").required(false))
                        .arg(arg!(-j --json "Output JSON").required(false)),
                )
                .subcommand(
                    command!()
                        .name("versions")
                        .about("List saved versions of a document")
                        .arg(arg!(-j --json "Output JSON").required(false))
                        .arg(arg!([DOMAIN] "A domain").required(true))
                        .arg(arg!([PATH] "A path").required(false))
                )
                .subcommand(
                    command!()
                        .name("get")
                        .about("Get a saved document")
                        .arg(arg!(--md "Output Markdown").required(false))
                        .arg(arg!([DOMAIN] "A domain").required(true))
                        .arg(arg!([PATH] "A path").required(false))
                        .arg(arg!([VERSION] "A version").required(false))
                ))
        .get_matches()
}
src/blacklist.rs

@@ -1,17 +1,17 @@
use crate::conf::get_config;

/// Checks if a domain is present in the blacklist of unwanted domains.
///
/// This function checks the `$BLACKLIST_DOMAINS` environment variable for a comma-separated list of regular expressions to match against.
/// If a match is found, it immediately returns `true`. Otherwise, it returns `false`.
pub fn check_blacklist(domain: &str) -> bool {
    let blacklist_raw = std::env::var("BLACKLIST_DOMAINS").unwrap_or_default();
    let conf = get_config();
    let conf = conf.websites.as_ref();

    if blacklist_raw.is_empty() {
        return false;
    }
    let blacklisted_domains = conf
        .map(|x| x.BLACKLIST_DOMAINS.as_ref())
        .unwrap_or_default();

    let blacklist: Vec<&str> = blacklist_raw.split(',').collect();

    for domain_regex in blacklist {
    for domain_regex in blacklisted_domains.unwrap_or(&Vec::new()) {
        let rgx = regex::Regex::new(domain_regex).unwrap();
        if rgx.is_match(domain) {
            return true;
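A small self-contained sketch of the config-driven check above; `is_blacklisted` and the example patterns are illustrative rather than repository code, and it assumes the `regex` crate already used by `check_blacklist`:

```rust
// Hypothetical helper mirroring check_blacklist: any matching regex blacklists the domain.
fn is_blacklisted(domain: &str, blacklist_domains: &[String]) -> bool {
    blacklist_domains.iter().any(|pattern| {
        regex::Regex::new(pattern)
            .map(|rgx| rgx.is_match(domain))
            .unwrap_or(false)
    })
}

fn main() {
    let blacklist = vec!["^gitlab".to_string(), r".*\.youtube\.com".to_string()];
    assert!(is_blacklisted("gitlab.com", &blacklist));
    assert!(!is_blacklisted("en.wikipedia.org", &blacklist));
    println!("blacklist checks passed");
}
```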
70 src/conf.rs (new file)
@@ -0,0 +1,70 @@
use std::sync::Arc;

use serde::Deserialize;
use tokio::sync::OnceCell;

pub static CONFIG: OnceCell<Arc<Config>> = OnceCell::const_new();

/// Get a reference to global config
pub fn get_config() -> &'static Arc<Config> {
    crate::conf::CONFIG.get().unwrap()
}

/// Load a global config
pub fn load_config(path: &str) {
    // TODO : Other load locations
    if let Ok(file_content) = std::fs::read_to_string(path) {
        let conf: Config =
            toml::from_str(&file_content).expect("Could not deserialize config file");
        crate::conf::CONFIG.set(std::sync::Arc::new(conf)).unwrap();
    }
}

/// Load a default global config
pub fn load_default_config() {
    if crate::conf::CONFIG.get().is_none() {
        crate::conf::CONFIG
            .set(std::sync::Arc::new(Config::default()))
            .unwrap();
    }
}

#[allow(non_snake_case)]
#[derive(Debug, Clone, Deserialize)]
pub struct Config {
    pub ROUTE_INTERNAL: bool,
    pub DOWNLOAD_ON_DEMAND: bool,
    pub ai: Option<AIConfig>,
    pub websites: Option<WebsiteConfig>,
}

#[allow(non_snake_case)]
#[derive(Debug, Clone, Deserialize)]
pub struct AIConfig {
    pub OLLAMA_URL: String,
}

#[allow(non_snake_case)]
#[derive(Debug, Clone, Deserialize)]
pub struct WebsiteConfig {
    pub BLACKLIST_DOMAINS: Option<Vec<String>>,
    pub domains: Option<Vec<DomainConfig>>,
}

#[derive(Debug, Clone, Deserialize)]
pub struct DomainConfig {
    // TODO : Domain specific config
    pub blacklist_paths: Option<Vec<String>>,
    pub no_javascript: bool,
}

impl Default for Config {
    fn default() -> Self {
        Self {
            ROUTE_INTERNAL: false,
            DOWNLOAD_ON_DEMAND: false,
            ai: None,
            websites: None,
        }
    }
}
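To show the TOML shape that `Config` deserializes, here is a minimal self-contained sketch; the struct definitions are trimmed copies of the ones above, the values are placeholders, and it assumes the `serde` and `toml` crates already listed in Cargo.toml:

```rust
use serde::Deserialize;

// Trimmed, hypothetical copies of the structs above so this sketch compiles on its own.
#[allow(non_snake_case, dead_code)]
#[derive(Debug, Deserialize)]
struct Config {
    ROUTE_INTERNAL: bool,
    DOWNLOAD_ON_DEMAND: bool,
    ai: Option<AIConfig>,
    websites: Option<WebsiteConfig>,
}

#[allow(non_snake_case, dead_code)]
#[derive(Debug, Deserialize)]
struct AIConfig {
    OLLAMA_URL: String,
}

#[allow(non_snake_case, dead_code)]
#[derive(Debug, Deserialize)]
struct WebsiteConfig {
    BLACKLIST_DOMAINS: Option<Vec<String>>,
}

fn main() {
    // Placeholder config.toml contents matching the fields above.
    let example = r#"
        ROUTE_INTERNAL = true
        DOWNLOAD_ON_DEMAND = true

        [ai]
        OLLAMA_URL = "127.0.0.1:11434"

        [websites]
        BLACKLIST_DOMAINS = ["^gitlab"]
    "#;

    let conf: Config = toml::from_str(example).expect("invalid config");
    println!("{conf:?}");
}
```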
29 src/lib.rs (new file)
@@ -0,0 +1,29 @@
use based::{
    page::Shell,
    request::{RequestContext, StringResponse},
};
use maud::{html, PreEscaped};

pub mod ai;
pub mod archive;
pub mod blacklist;
pub mod conf;
pub mod favicon;

pub async fn render_page(content: PreEscaped<String>, ctx: RequestContext) -> StringResponse {
    based::page::render_page(
        content,
        "Website Archive",
        ctx,
        &Shell::new(
            html! {
                script src="https://cdn.tailwindcss.com" {};
                meta name="viewport" content="width=device-width, initial-scale=1.0" {};
                script src="/assets/htmx.min.js" {};
            },
            html! {},
            Some("bg-zinc-950 text-white min-h-screen flex pt-8 justify-center".to_string()),
        ),
    )
    .await
}
143 src/main.rs
@@ -1,19 +1,26 @@
use ai::EmbedStore;
use archive::WebsiteArchive;
use based::get_pg;
use rocket::routes;
use webarc::ai::EmbedStore;
use webarc::archive::WebsiteArchive;
use webarc::conf::{get_config, load_config, load_default_config};

mod ai;
mod archive;
mod blacklist;
mod favicon;
mod args;
mod pages;

#[rocket::launch]
async fn launch() -> _ {
#[tokio::main]
async fn main() {
    env_logger::init();

    let arc = WebsiteArchive::new("./websites");
    let args = args::get_args();

    let archive_dir: &String = args.get_one("dir").unwrap();

    match args.subcommand() {
        Some(("serve", serve_args)) => {
            let config: &String = serve_args.get_one("config").unwrap();
            load_config(config);

            let arc = WebsiteArchive::new(archive_dir);

            if std::env::var("DATABASE_URL").is_ok() {
                let pg = get_pg!();

@@ -21,7 +28,7 @@ async fn launch() -> _ {
            }

            let archive = arc.clone();
            if std::env::var("OLLAMA_URL").is_ok() {
            if get_config().ai.is_some() {
                tokio::spawn(async move {
                    EmbedStore::generate_embeddings_for(&archive).await;
                });

@@ -29,7 +36,7 @@ async fn launch() -> _ {

            let archive = arc.clone();
            tokio::spawn(async move {
                favicon::download_favicons_for_sites(&archive.domains()).await;
                webarc::favicon::download_favicons_for_sites(&archive.domains()).await;
            });

            rocket::build()

@@ -46,4 +53,118 @@ async fn launch() -> _ {
                ],
            )
            .manage(arc)
            .launch()
            .await
            .unwrap();
        }
        Some(("archive", archive_args)) => {
            let arc = WebsiteArchive::new(archive_dir);

            match archive_args.subcommand() {
                Some(("list", list_args)) => {
                    let json = list_args.get_flag("json");

                    load_default_config();

                    let elements = if let Some(domain) = list_args.get_one::<String>("DOMAIN") {
                        arc.get_domain(domain)
                            .all_paths()
                            .into_iter()
                            .map(|x| x.path().clone())
                            .collect()
                    } else {
                        arc.domains()
                    };

                    if json {
                        println!(
                            "{}",
                            serde_json::to_string(&serde_json::json!(elements)).unwrap()
                        );
                    } else {
                        if let Some(domain) = list_args.get_one::<String>("DOMAIN") {
                            println!("Paths in {domain}:");
                        } else {
                            println!("Domains in {}:", archive_dir);
                        }

                        if elements.is_empty() {
                            println!("No domains");
                        }

                        for d in elements {
                            println!("- {d}");
                        }
                    }
                }
                Some(("download", dl_args)) => {
                    let url: &String = dl_args.get_one("URL").unwrap();

                    let config: &String = dl_args.get_one("config").unwrap();
                    load_config(config);

                    arc.archive_url(url).await;
                    println!("Saved {url} to archive");
                }
                Some(("versions", ver_args)) => {
                    load_default_config();

                    let domain: &String = ver_args.get_one("DOMAIN").unwrap();
                    let path: String = if let Some(path) = ver_args.get_one::<String>("PATH") {
                        path.clone()
                    } else {
                        "/".to_string()
                    };
                    let versions = arc.get_domain(domain).path(&path).versions();

                    let json = ver_args.get_flag("json");

                    if json {
                        println!("{}", serde_json::to_string(&versions).unwrap());
                    } else {
                        println!("Versions for {domain} / {path}:");
                        for v in versions {
                            println!("- {v}");
                        }
                    }
                }
                Some(("get", get_args)) => {
                    load_default_config();

                    let domain: &String = get_args.get_one("DOMAIN").unwrap();
                    let path = if let Some(path) = get_args.get_one::<String>("PATH") {
                        path.clone()
                    } else {
                        "/".to_string()
                    };
                    let doc = arc.get_domain(domain).path(&path);
                    let ver = if let Some(ver) = get_args.get_one::<String>("VERSION") {
                        ver.clone()
                    } else {
                        doc.versions().first().unwrap().clone()
                    };

                    let md = get_args.get_flag("md");

                    let content = doc.render_local(Some(ver)).await;

                    if content.is_none() {
                        println!("No document found");
                        std::process::exit(1);
                    }

                    if md {
                        let markdown = html2md::parse_html(&content.unwrap());
                        println!("{markdown}");
                    } else {
                        println!("{}", content.unwrap());
                    }
                }
                Some((&_, _)) => {}
                None => {}
            };
        }
        Some((&_, _)) => {}
        None => {}
    }
}
src/pages/component.rs

@@ -1,7 +1,3 @@
use based::{
    page::Shell,
    request::{RequestContext, StringResponse},
};
use maud::{html, PreEscaped};

/// Generates an SVG arrow icon with the specified color.

@@ -78,24 +74,6 @@ pub fn gen_path_header(
    }
}

pub async fn render_page(content: PreEscaped<String>, ctx: RequestContext) -> StringResponse {
    based::page::render_page(
        content,
        "Website Archive",
        ctx,
        &Shell::new(
            html! {
                script src="https://cdn.tailwindcss.com" {};
                meta name="viewport" content="width=device-width, initial-scale=1.0" {};
                script src="/assets/htmx.min.js" {};
            },
            html! {},
            Some("bg-zinc-950 text-white min-h-screen flex pt-8 justify-center".to_string()),
        ),
    )
    .await
}

pub fn favicon(site: &str) -> PreEscaped<String> {
    html! {
        img class="h-8 w-8 m-2" src=(format!("/favicon/{site}")) {};
src/pages/mod.rs

@@ -13,12 +13,14 @@ pub mod component;
use component::*;
use serde_json::json;

use crate::{
use webarc::{
    ai::{generate_embedding, EmbedStore, SearchResult},
    archive::WebsiteArchive,
    conf::get_config,
    render_page,
};

const SEARCH_BAR_STYLE: &'static str = "w-full px-4 mb-4 py-2 text-white bg-black border-2 border-neon-blue placeholder-neon-blue focus:ring-2 focus:ring-neon-pink focus:outline-none font-mono text-lg";
const SEARCH_BAR_STYLE: &str = "w-full px-4 mb-4 py-2 text-white bg-black border-2 border-neon-blue placeholder-neon-blue focus:ring-2 focus:ring-neon-pink focus:outline-none font-mono text-lg";

/// Get the favicon of a domain
#[get("/favicon/<domain>")]

@@ -29,6 +31,8 @@ pub async fn favicon_route(domain: &str) -> Option<DataResponse> {
        .read_to_end(&mut buf)
        .ok()?;

    // TODO : Default favicon

    Some(DataResponse::new(
        buf,
        "image/x-icon".to_string(),

@@ -171,12 +175,7 @@ pub async fn render_website(
            "text/html".to_string(),
            Some(60 * 60 * 24),
        ));
    } else if std::env::var("DOWNLOAD_ON_DEMAND")
        .unwrap_or("false".to_string())
        .as_str()
        == "true"
        && time.is_none()
    {
    } else if get_config().DOWNLOAD_ON_DEMAND && time.is_none() {
        arc.archive_url(&format!("https://{domain}/{}", path.to_str().unwrap()))
            .await;

@@ -213,9 +212,7 @@ pub async fn vector_search(
    page: Option<i64>,
    ctx: RequestContext,
) -> Option<StringResponse> {
    if std::env::var("OLLAMA_URL").is_err() {
        return None;
    }
    get_config().ai.as_ref()?;

    let page = page.unwrap_or(1);