Compare commits
No commits in common. "main" and "ollama" have entirely different histories.
20 changed files with 476 additions and 1383 deletions
125
Cargo.lock
generated
125
Cargo.lock
generated
|
@ -59,55 +59,6 @@ dependencies = [
|
||||||
"libc",
|
"libc",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "anstream"
|
|
||||||
version = "0.6.18"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b"
|
|
||||||
dependencies = [
|
|
||||||
"anstyle",
|
|
||||||
"anstyle-parse",
|
|
||||||
"anstyle-query",
|
|
||||||
"anstyle-wincon",
|
|
||||||
"colorchoice",
|
|
||||||
"is_terminal_polyfill",
|
|
||||||
"utf8parse",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "anstyle"
|
|
||||||
version = "1.0.10"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9"
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "anstyle-parse"
|
|
||||||
version = "0.2.6"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9"
|
|
||||||
dependencies = [
|
|
||||||
"utf8parse",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "anstyle-query"
|
|
||||||
version = "1.1.2"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c"
|
|
||||||
dependencies = [
|
|
||||||
"windows-sys 0.59.0",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "anstyle-wincon"
|
|
||||||
version = "3.0.6"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125"
|
|
||||||
dependencies = [
|
|
||||||
"anstyle",
|
|
||||||
"windows-sys 0.59.0",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "async-stream"
|
name = "async-stream"
|
||||||
version = "0.3.6"
|
version = "0.3.6"
|
||||||
|
@ -362,52 +313,6 @@ dependencies = [
|
||||||
"inout",
|
"inout",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "clap"
|
|
||||||
version = "4.5.23"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "3135e7ec2ef7b10c6ed8950f0f792ed96ee093fa088608f1c76e569722700c84"
|
|
||||||
dependencies = [
|
|
||||||
"clap_builder",
|
|
||||||
"clap_derive",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "clap_builder"
|
|
||||||
version = "4.5.23"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "30582fc632330df2bd26877bde0c1f4470d57c582bbc070376afcd04d8cb4838"
|
|
||||||
dependencies = [
|
|
||||||
"anstream",
|
|
||||||
"anstyle",
|
|
||||||
"clap_lex",
|
|
||||||
"strsim",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "clap_derive"
|
|
||||||
version = "4.5.18"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab"
|
|
||||||
dependencies = [
|
|
||||||
"heck",
|
|
||||||
"proc-macro2",
|
|
||||||
"quote",
|
|
||||||
"syn 2.0.93",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "clap_lex"
|
|
||||||
version = "0.7.4"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6"
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "colorchoice"
|
|
||||||
version = "1.0.3"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "combine"
|
name = "combine"
|
||||||
version = "4.6.7"
|
version = "4.6.7"
|
||||||
|
@ -1446,12 +1351,6 @@ dependencies = [
|
||||||
"windows-sys 0.52.0",
|
"windows-sys 0.52.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "is_terminal_polyfill"
|
|
||||||
version = "1.70.1"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "itoa"
|
name = "itoa"
|
||||||
version = "1.0.14"
|
version = "1.0.14"
|
||||||
|
@ -3043,12 +2942,6 @@ dependencies = [
|
||||||
"unicode-properties",
|
"unicode-properties",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "strsim"
|
|
||||||
version = "0.11.1"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "subtle"
|
name = "subtle"
|
||||||
version = "2.6.1"
|
version = "2.6.1"
|
||||||
|
@ -3569,15 +3462,6 @@ dependencies = [
|
||||||
"percent-encoding",
|
"percent-encoding",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "url-escape"
|
|
||||||
version = "0.1.1"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "44e0ce4d1246d075ca5abec4b41d33e87a6054d08e2366b63205665e950db218"
|
|
||||||
dependencies = [
|
|
||||||
"percent-encoding",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "utf-8"
|
name = "utf-8"
|
||||||
version = "0.7.6"
|
version = "0.7.6"
|
||||||
|
@ -3596,12 +3480,6 @@ version = "1.0.4"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
|
checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "utf8parse"
|
|
||||||
version = "0.2.2"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "uuid"
|
name = "uuid"
|
||||||
version = "1.11.0"
|
version = "1.11.0"
|
||||||
|
@ -3744,7 +3622,6 @@ version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"based",
|
"based",
|
||||||
"chrono",
|
"chrono",
|
||||||
"clap",
|
|
||||||
"env_logger",
|
"env_logger",
|
||||||
"futures",
|
"futures",
|
||||||
"html2md",
|
"html2md",
|
||||||
|
@ -3759,9 +3636,7 @@ dependencies = [
|
||||||
"serde_json",
|
"serde_json",
|
||||||
"sqlx",
|
"sqlx",
|
||||||
"tokio",
|
"tokio",
|
||||||
"toml",
|
|
||||||
"url",
|
"url",
|
||||||
"url-escape",
|
|
||||||
"uuid",
|
"uuid",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
|
@ -22,6 +22,3 @@ reqwest = "0.12.11"
|
||||||
ollama-rs = "0.2.2"
|
ollama-rs = "0.2.2"
|
||||||
pgvector = { version = "0.4", features = ["sqlx"] }
|
pgvector = { version = "0.4", features = ["sqlx"] }
|
||||||
html2md = "0.2.14"
|
html2md = "0.2.14"
|
||||||
clap = { version = "4.5.23", features = ["cargo", "derive"] }
|
|
||||||
toml = "0.8.19"
|
|
||||||
url-escape = "0.1.1"
|
|
||||||
|
|
14
Dockerfile
14
Dockerfile
|
@ -1,13 +1,13 @@
|
||||||
FROM rust:buster AS builder
|
FROM rust:buster as builder
|
||||||
|
|
||||||
RUN rustup default nightly
|
|
||||||
|
|
||||||
RUN git clone "https://github.com/Y2Z/monolith" /monolith
|
|
||||||
WORKDIR /monolith
|
|
||||||
RUN cargo build --release
|
|
||||||
|
|
||||||
COPY . /app
|
COPY . /app
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
|
RUN rustup default nightly
|
||||||
|
RUN cargo build --release
|
||||||
|
|
||||||
|
RUN git clone "https://github.com/Y2Z/monolith" /monolith
|
||||||
|
WORKDIR /monolith
|
||||||
RUN cargo build --release
|
RUN cargo build --release
|
||||||
|
|
||||||
FROM debian:buster
|
FROM debian:buster
|
||||||
|
|
43
README.md
43
README.md
|
@ -1,47 +1,14 @@
|
||||||
# WebArc
|
# WebArc
|
||||||
`webarc` is a local website archive based on [monolith](https://github.com/Y2Z/monolith).
|
`webarc` is a local website archive based on [monolith](https://github.com/Y2Z/monolith).
|
||||||
|
|
||||||
## Archive Format
|
## Configuration
|
||||||
A web archive is defined as a directory containing domains in this structure:
|
You can configure the application using environment variables:
|
||||||
|
|
||||||
```
|
- `$ROUTE_INTERNAL` : Rewrite links to point back to the archive itself
|
||||||
web_archive/
|
- `$DOWNLOAD_ON_DEMAND` : Download missing routes with monolith on demand
|
||||||
├─ domain.com/
|
- `$BLACKLIST_DOMAINS` : Blacklisted domains (Comma-seperated regex, example: `google.com,.*.youtube.com`)
|
||||||
│ ├─ sub/
|
|
||||||
│ │ ├─ path/
|
|
||||||
│ │ │ ├─ index_YYYY_MM_DD.html
|
|
||||||
├─ sub.domain.com/
|
|
||||||
```
|
|
||||||
|
|
||||||
Every document of this web archive can then be found at `archive/domain/paths.../index_YYYY_MM_DD.html`.
|
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
webarc provides a CLI tool to work with the archive structure.
|
|
||||||
|
|
||||||
```sh
|
|
||||||
# List domains in archive
|
|
||||||
webarc [--dir ARCHIVE] archive list [-j, --json]
|
|
||||||
|
|
||||||
# List all paths on a domain
|
|
||||||
webarc [--dir ARCHIVE] archive list [-j, --json] [DOMAIN]
|
|
||||||
|
|
||||||
# List all versions of a document
|
|
||||||
webarc [--dir ARCHIVE] archive versions [-j, --json] [DOMAIN] [PATH]
|
|
||||||
|
|
||||||
# Get a document
|
|
||||||
# `--md` will return a markdown version
|
|
||||||
webarc [--dir ARCHIVE] archive get [--md] [DOMAIN] [PATH] [VERSION]
|
|
||||||
|
|
||||||
# Archive a website
|
|
||||||
webarc [--dir ARCHIVE] archive download [URL]
|
|
||||||
```
|
|
||||||
|
|
||||||
## Configuration
|
|
||||||
You can configure the application using a config file. Look at the [config.toml](config.toml) file for more information.
|
|
||||||
|
|
||||||
## Web Server
|
|
||||||
You can start a webserver serving an archive with `webarc serve`.
|
|
||||||
|
|
||||||
Archived pages can be viewed at `/s/<domain>/<path..>`.
|
Archived pages can be viewed at `/s/<domain>/<path..>`.
|
||||||
For example, `/s/en.wikipedia.org/wiki/Website` will serve `en.wikipedia.org` at `/wiki/Website`.
|
For example, `/s/en.wikipedia.org/wiki/Website` will serve `en.wikipedia.org` at `/wiki/Website`.
|
||||||
|
|
||||||
|
|
73
config.toml
73
config.toml
|
@ -1,73 +0,0 @@
|
||||||
|
|
||||||
# Rewrite links to point back to the archive itself
|
|
||||||
ROUTE_INTERNAL=true
|
|
||||||
|
|
||||||
# Download missing routes on demand
|
|
||||||
DOWNLOAD_ON_DEMAND=true
|
|
||||||
|
|
||||||
[websites]
|
|
||||||
# You can blacklist sites which wont work well
|
|
||||||
BLACKLIST_DOMAINS = [
|
|
||||||
"^gitlab", # All domains starting with gitlab
|
|
||||||
"youtube" # YouTube
|
|
||||||
]
|
|
||||||
|
|
||||||
# Domain configuration (Example)
|
|
||||||
[[websites.domains]]
|
|
||||||
# The domain the config applies to
|
|
||||||
domain = "example.com"
|
|
||||||
|
|
||||||
# Blacklisted Path (Regexes)
|
|
||||||
blacklist_paths = ["/.*"]
|
|
||||||
|
|
||||||
# Exclude <audio> tags
|
|
||||||
no_audio = false
|
|
||||||
|
|
||||||
# Exclude <video> tags
|
|
||||||
no_video = false
|
|
||||||
|
|
||||||
# Exclude <img> tags
|
|
||||||
no_image = false
|
|
||||||
|
|
||||||
# Exclude CSS
|
|
||||||
no_css = false
|
|
||||||
|
|
||||||
# Exclude Javascript
|
|
||||||
no_javascript = false
|
|
||||||
|
|
||||||
# Exclude fonts
|
|
||||||
no_fonts = false
|
|
||||||
|
|
||||||
# Exclude iframes
|
|
||||||
no_frames = false
|
|
||||||
|
|
||||||
# User Agent
|
|
||||||
user_agent = "Safari"
|
|
||||||
|
|
||||||
[ai]
|
|
||||||
# Ollama URL (Enables vector search)
|
|
||||||
OLLAMA_URL="127.0.0.1:11434"
|
|
||||||
|
|
||||||
# --- Website Config
|
|
||||||
|
|
||||||
[[websites.domains]]
|
|
||||||
domain = "developer.mozilla.org"
|
|
||||||
no_javascript = true
|
|
||||||
|
|
||||||
[[websites.domains]]
|
|
||||||
domain = "github.com"
|
|
||||||
no_javascript = true
|
|
||||||
|
|
||||||
[[websites.domains]]
|
|
||||||
domain = "en.wikipedia.org"
|
|
||||||
no_javascript = true
|
|
||||||
|
|
||||||
[[websites.domains]]
|
|
||||||
domain = "api.flutter.dev"
|
|
||||||
no_javascript = true
|
|
||||||
no_video = true
|
|
||||||
|
|
||||||
[[websites.domains]]
|
|
||||||
domain = "docs.flutter.dev"
|
|
||||||
no_javascript = true
|
|
||||||
no_video = true
|
|
|
@ -6,12 +6,7 @@ services:
|
||||||
volumes:
|
volumes:
|
||||||
- ./websites:/websites
|
- ./websites:/websites
|
||||||
- ./favicon:/favicon
|
- ./favicon:/favicon
|
||||||
- ./config.toml:/config.toml
|
env_file: env
|
||||||
environment:
|
|
||||||
- "RUST_LOG=info"
|
|
||||||
- "ROCKET_ADDRESS=0.0.0.0"
|
|
||||||
- "DATABASE_URL=postgres://user:pass@postgres/webarc"
|
|
||||||
command: "/webarc serve"
|
|
||||||
|
|
||||||
postgres:
|
postgres:
|
||||||
# Any Postgres with support for pgvector
|
# Any Postgres with support for pgvector
|
||||||
|
|
18
env
Normal file
18
env
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
# Logging
|
||||||
|
RUST_LOG=info
|
||||||
|
ROCKET_ADDRESS=0.0.0.0
|
||||||
|
|
||||||
|
# Rewrite links to point back to the archive itself
|
||||||
|
ROUTE_INTERNAL=true
|
||||||
|
|
||||||
|
# Download missing routes on demand
|
||||||
|
DOWNLOAD_ON_DEMAND=true
|
||||||
|
|
||||||
|
# Blacklisted domains (Comma-seperated regex)
|
||||||
|
BLACKLIST_DOMAINS="google.com,.*.youtube.com"
|
||||||
|
|
||||||
|
# Database
|
||||||
|
DATABASE_URL=postgres://user:pass@postgres/webarc
|
||||||
|
|
||||||
|
# Ollama URL (Enables vector search)
|
||||||
|
OLLAMA_URL=127.0.0.1:11434
|
|
@ -5,9 +5,6 @@ CREATE TABLE doc_embedding (
|
||||||
domain VARCHAR(500) NOT NULL,
|
domain VARCHAR(500) NOT NULL,
|
||||||
path VARCHAR(1000) NOT NULL,
|
path VARCHAR(1000) NOT NULL,
|
||||||
ver VARCHAR(10) NOT NULL,
|
ver VARCHAR(10) NOT NULL,
|
||||||
chunk INTEGER NOT NULL,
|
|
||||||
embed_mxbai_embed_large vector(1024) NOT NULL,
|
embed_mxbai_embed_large vector(1024) NOT NULL,
|
||||||
PRIMARY KEY (domain, path, ver, chunk)
|
PRIMARY KEY (domain, path, ver)
|
||||||
);
|
)
|
||||||
|
|
||||||
CREATE INDEX ON doc_embedding USING ivfflat (embed_mxbai_embed_large vector_cosine_ops) WITH (lists = 200);
|
|
||||||
|
|
167
src/ai.rs
167
src/ai.rs
|
@ -1,4 +1,4 @@
|
||||||
use std::collections::{HashMap, VecDeque};
|
use std::collections::VecDeque;
|
||||||
|
|
||||||
use based::{get_pg, request::api::ToAPI, result::LogNoneAndPass};
|
use based::{get_pg, request::api::ToAPI, result::LogNoneAndPass};
|
||||||
use ollama_rs::generation::embeddings::request::{EmbeddingsInput, GenerateEmbeddingsRequest};
|
use ollama_rs::generation::embeddings::request::{EmbeddingsInput, GenerateEmbeddingsRequest};
|
||||||
|
@ -6,21 +6,14 @@ use serde::Serialize;
|
||||||
use serde_json::json;
|
use serde_json::json;
|
||||||
use sqlx::FromRow;
|
use sqlx::FromRow;
|
||||||
|
|
||||||
use crate::{
|
use crate::archive::{Document, Domain, WebsiteArchive};
|
||||||
archive::{Document, Domain, WebsiteArchive},
|
|
||||||
conf::get_config,
|
|
||||||
};
|
|
||||||
|
|
||||||
// TODO : Cite found chunks in search res?
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, FromRow, Serialize)]
|
#[derive(Debug, Clone, FromRow, Serialize)]
|
||||||
pub struct DocEmbedding {
|
pub struct DocEmbedding {
|
||||||
pub domain: String,
|
pub domain: String,
|
||||||
pub path: String,
|
pub path: String,
|
||||||
pub ver: String,
|
pub ver: String,
|
||||||
pub chunk: i32,
|
|
||||||
|
|
||||||
#[allow(dead_code)]
|
|
||||||
#[serde(skip)]
|
#[serde(skip)]
|
||||||
embed_mxbai_embed_large: pgvector::Vector,
|
embed_mxbai_embed_large: pgvector::Vector,
|
||||||
|
|
||||||
|
@ -28,73 +21,6 @@ pub struct DocEmbedding {
|
||||||
pub similarity: f64,
|
pub similarity: f64,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl DocEmbedding {
|
|
||||||
pub async fn total_chunks(&self) -> i64 {
|
|
||||||
let res: (i64,) = sqlx::query_as(
|
|
||||||
"SELECT COUNT(chunk) FROM doc_embedding WHERE domain = $1 AND path = $2",
|
|
||||||
)
|
|
||||||
.bind(&self.domain)
|
|
||||||
.bind(&self.path)
|
|
||||||
.fetch_one(get_pg!())
|
|
||||||
.await
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
res.0
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize)]
|
|
||||||
pub struct SearchResult {
|
|
||||||
pub domain: String,
|
|
||||||
pub path: String,
|
|
||||||
pub total_chunks: i64,
|
|
||||||
pub chunks: Vec<DocEmbedding>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl SearchResult {
|
|
||||||
pub fn new(domain: String, path: String, total_chunks: i64) -> Self {
|
|
||||||
Self {
|
|
||||||
domain,
|
|
||||||
path,
|
|
||||||
total_chunks,
|
|
||||||
chunks: vec![],
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn similarity(&self) -> f64 {
|
|
||||||
let chunks = f64::from(self.chunks.len() as u32);
|
|
||||||
let total = f64::from(self.total_chunks as i32);
|
|
||||||
let match_percent = chunks / total;
|
|
||||||
total_score(&self.chunks) * match_percent
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn avg_sim(e: &[DocEmbedding]) -> f64 {
|
|
||||||
let mut score = 0.0;
|
|
||||||
|
|
||||||
for e in e {
|
|
||||||
score += e.similarity;
|
|
||||||
}
|
|
||||||
|
|
||||||
score / e.len() as f64
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn max_sim(e: &[DocEmbedding]) -> f64 {
|
|
||||||
let mut score = 0.0;
|
|
||||||
|
|
||||||
for e in e {
|
|
||||||
if e.similarity > score {
|
|
||||||
score = e.similarity;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
score
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn total_score(e: &[DocEmbedding]) -> f64 {
|
|
||||||
(avg_sim(e) + max_sim(e)) / 2.0
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ToAPI for DocEmbedding {
|
impl ToAPI for DocEmbedding {
|
||||||
async fn api(&self) -> serde_json::Value {
|
async fn api(&self) -> serde_json::Value {
|
||||||
json!({
|
json!({
|
||||||
|
@ -107,30 +33,12 @@ impl ToAPI for DocEmbedding {
|
||||||
}
|
}
|
||||||
|
|
||||||
pub trait Embedding {
|
pub trait Embedding {
|
||||||
fn embedding(
|
fn embedding(&self, ver: Option<String>)
|
||||||
&self,
|
-> impl std::future::Future<Output = Option<Vec<f32>>>;
|
||||||
ver: Option<String>,
|
|
||||||
) -> impl std::future::Future<Output = Option<Vec<Vec<f32>>>>;
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn chunked(s: &str) -> Vec<String> {
|
|
||||||
const CHUNK_SIZE: usize = 2500;
|
|
||||||
s.chars()
|
|
||||||
.collect::<Vec<char>>()
|
|
||||||
.chunks(CHUNK_SIZE)
|
|
||||||
.map(|chunk| chunk.iter().collect())
|
|
||||||
.collect()
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn remove_data_urls(input: &str) -> String {
|
|
||||||
let re = regex::Regex::new("data:(.*?)(;base64)?,(.*)").unwrap();
|
|
||||||
|
|
||||||
// Replace all occurrences of data URLs with an empty string
|
|
||||||
re.replace_all(input, "").to_string()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Embedding for Document {
|
impl Embedding for Document {
|
||||||
async fn embedding(&self, ver: Option<String>) -> Option<Vec<Vec<f32>>> {
|
async fn embedding(&self, ver: Option<String>) -> Option<Vec<f32>> {
|
||||||
let latest = "latest".to_string();
|
let latest = "latest".to_string();
|
||||||
log::info!(
|
log::info!(
|
||||||
"Generating Vector embeddings for {} / {} @ {}",
|
"Generating Vector embeddings for {} / {} @ {}",
|
||||||
|
@ -139,32 +47,14 @@ impl Embedding for Document {
|
||||||
ver.as_ref().unwrap_or(&latest)
|
ver.as_ref().unwrap_or(&latest)
|
||||||
);
|
);
|
||||||
|
|
||||||
let content_html = self.render_local(ver.clone()).await?;
|
let content_html = self.render_local(ver).await?;
|
||||||
let content = remove_data_urls(&html2md::parse_html(&content_html));
|
let content = html2md::parse_html(&content_html);
|
||||||
|
generate_embedding(content).await
|
||||||
let mut embeddings = Vec::new();
|
|
||||||
let content = chunked(&content);
|
|
||||||
let len = content.len();
|
|
||||||
|
|
||||||
for (index, c) in content.into_iter().enumerate() {
|
|
||||||
log::info!(
|
|
||||||
"Generating Vector embeddings for {} / {} @ {} [ {} / {} ]",
|
|
||||||
self.domain,
|
|
||||||
self.path,
|
|
||||||
ver.as_ref().unwrap_or(&latest),
|
|
||||||
index + 1,
|
|
||||||
len
|
|
||||||
);
|
|
||||||
embeddings.push(generate_embedding(c).await?);
|
|
||||||
}
|
|
||||||
|
|
||||||
Some(embeddings)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn generate_embedding(mut input: String) -> Option<Vec<f32>> {
|
pub async fn generate_embedding(mut input: String) -> Option<Vec<f32>> {
|
||||||
// TODO : Ollama load balancing
|
if let Ok(ollama_url) = std::env::var("OLLAMA_URL") {
|
||||||
if let Some(ollama_url) = get_config().ai.as_ref().map(|x| x.OLLAMA_URL.clone()) {
|
|
||||||
let (host, port) = ollama_url.split_once(':')?;
|
let (host, port) = ollama_url.split_once(':')?;
|
||||||
let ollama = ollama_rs::Ollama::new(format!("http://{host}"), port.parse().ok()?);
|
let ollama = ollama_rs::Ollama::new(format!("http://{host}"), port.parse().ok()?);
|
||||||
|
|
||||||
|
@ -239,19 +129,16 @@ impl EmbedStore {
|
||||||
.execute(get_pg!())
|
.execute(get_pg!())
|
||||||
.await;
|
.await;
|
||||||
|
|
||||||
for (index, embed) in embed.iter().enumerate() {
|
sqlx::query("INSERT INTO doc_embedding VALUES ($1, $2, $3, $4)")
|
||||||
sqlx::query("INSERT INTO doc_embedding VALUES ($1, $2, $3, $4, $5)")
|
|
||||||
.bind(&doc.domain)
|
.bind(&doc.domain)
|
||||||
.bind(&doc.path)
|
.bind(&doc.path)
|
||||||
.bind(ver)
|
.bind(ver)
|
||||||
.bind(index as i64)
|
|
||||||
.bind(embed)
|
.bind(embed)
|
||||||
.execute(get_pg!())
|
.execute(get_pg!())
|
||||||
.await
|
.await
|
||||||
.unwrap();
|
.unwrap();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
pub async fn ensure_embedding(doc: &Document) {
|
pub async fn ensure_embedding(doc: &Document) {
|
||||||
for ver in doc.versions() {
|
for ver in doc.versions() {
|
||||||
|
@ -261,42 +148,16 @@ impl EmbedStore {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn search_vector(v: &pgvector::Vector, limit: i64, offset: i64) -> Vec<SearchResult> {
|
pub async fn search_vector(v: &pgvector::Vector, limit: i64, offset: i64) -> Vec<DocEmbedding> {
|
||||||
// limit should cover SearchResults not the query -> rework
|
sqlx::query_as(
|
||||||
|
"SELECT *, 1 / (1 + (embed_mxbai_embed_large <-> $1)) AS similarity FROM doc_embedding ORDER BY embed_mxbai_embed_large <-> $1 LIMIT $2 OFFSET $3",
|
||||||
let results: Vec<DocEmbedding> = sqlx::query_as(
|
|
||||||
"SELECT *, 1 / (1 + (embed_mxbai_embed_large <-> $1)) AS similarity FROM doc_embedding ORDER BY embed_mxbai_embed_large <=> $1 LIMIT $2 OFFSET $3",
|
|
||||||
)
|
)
|
||||||
.bind(v)
|
.bind(v)
|
||||||
.bind(limit)
|
.bind(limit)
|
||||||
.bind(offset)
|
.bind(offset)
|
||||||
.fetch_all(get_pg!())
|
.fetch_all(get_pg!())
|
||||||
.await
|
.await
|
||||||
.unwrap();
|
.unwrap()
|
||||||
|
|
||||||
let mut search_res: HashMap<String, HashMap<String, SearchResult>> = HashMap::new();
|
|
||||||
|
|
||||||
for res in results {
|
|
||||||
let domain = search_res.entry(res.domain.clone()).or_default();
|
|
||||||
let doc = domain.entry(res.path.clone()).or_insert(SearchResult::new(
|
|
||||||
res.domain.clone(),
|
|
||||||
res.path.clone(),
|
|
||||||
res.total_chunks().await,
|
|
||||||
));
|
|
||||||
doc.chunks.push(res);
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut flat = search_res
|
|
||||||
.into_values()
|
|
||||||
.flat_map(|x| x.into_values().collect::<Vec<SearchResult>>())
|
|
||||||
.collect::<Vec<SearchResult>>();
|
|
||||||
|
|
||||||
flat.sort_by(|a, b| {
|
|
||||||
b.similarity()
|
|
||||||
.partial_cmp(&a.similarity())
|
|
||||||
.unwrap_or(std::cmp::Ordering::Equal)
|
|
||||||
});
|
|
||||||
flat
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn generate_embeddings_for(arc: &WebsiteArchive) {
|
pub async fn generate_embeddings_for(arc: &WebsiteArchive) {
|
||||||
|
|
335
src/archive.rs
Normal file
335
src/archive.rs
Normal file
|
@ -0,0 +1,335 @@
|
||||||
|
use std::path::PathBuf;
|
||||||
|
|
||||||
|
use based::request::RequestContext;
|
||||||
|
use maud::html;
|
||||||
|
|
||||||
|
use crate::{blacklist::check_blacklist, favicon::download_fav_for, pages::component::render_page};
|
||||||
|
|
||||||
|
/// Read directory entries into `Vec<String>`
|
||||||
|
pub fn read_dir(dir: &PathBuf) -> Vec<String> {
|
||||||
|
let mut list = Vec::new();
|
||||||
|
|
||||||
|
if let Ok(entries) = std::fs::read_dir(dir) {
|
||||||
|
for entry in entries.flatten() {
|
||||||
|
if let Some(file_name) = entry.file_name().to_str() {
|
||||||
|
list.push(file_name.to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
list
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Rewrite all URLs in `input` to the format `/s/<domain>/<path..>`
|
||||||
|
fn internalize_urls(input: &str) -> String {
|
||||||
|
let url_pattern = r"https?://([a-zA-Z0-9.-]+)(/[\w./-]*)";
|
||||||
|
let re = regex::Regex::new(url_pattern).unwrap();
|
||||||
|
|
||||||
|
re.replace_all(input, |caps: ®ex::Captures| {
|
||||||
|
format!(
|
||||||
|
"/s/{}/{}",
|
||||||
|
&caps[1].trim_start_matches("www."), // Domain
|
||||||
|
&caps[2] // Path
|
||||||
|
)
|
||||||
|
})
|
||||||
|
.to_string()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Represents a directory containg archived websites
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct WebsiteArchive {
|
||||||
|
pub dir: PathBuf,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Represents a domain within the website archive
|
||||||
|
pub struct Domain {
|
||||||
|
/// Domain name
|
||||||
|
pub name: String,
|
||||||
|
dir: PathBuf,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Domain {
|
||||||
|
/// Creates a new `Domain` instance.
|
||||||
|
///
|
||||||
|
/// If the domain name is not blacklisted, a directory is created.
|
||||||
|
///
|
||||||
|
/// # Parameters
|
||||||
|
/// - `name`: The name of the domain.
|
||||||
|
/// - `dir`: The directory path for the domain.
|
||||||
|
///
|
||||||
|
/// # Returns
|
||||||
|
/// A new `Domain` instance.
|
||||||
|
pub fn new(name: &str, dir: PathBuf) -> Self {
|
||||||
|
if !check_blacklist(name) {
|
||||||
|
std::fs::create_dir_all(&dir).unwrap();
|
||||||
|
}
|
||||||
|
Self {
|
||||||
|
name: name.to_string(),
|
||||||
|
dir,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Resolves a specific path within the domain and returns a `Document` representing it.
|
||||||
|
///
|
||||||
|
/// # Parameters
|
||||||
|
/// - `path`: The path to resolve within the domain.
|
||||||
|
///
|
||||||
|
/// # Returns
|
||||||
|
/// A `Document` instance corresponding to the given path.
|
||||||
|
pub fn path(&self, path: &str) -> Document {
|
||||||
|
Document::new(&self.name, path, self.dir.parent().unwrap().to_path_buf())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Retrieves entries and metadata for a given path within the domain.
|
||||||
|
///
|
||||||
|
/// # Parameters
|
||||||
|
/// - `path`: The path to inspect.
|
||||||
|
///
|
||||||
|
/// # Returns
|
||||||
|
/// A tuple containing:
|
||||||
|
/// - A vector of `PathEntry` instances representing the contents of the path.
|
||||||
|
/// - A boolean indicating whether the path is itself a `Document`
|
||||||
|
pub fn paths(&self, path: &str) -> (Vec<PathEntry>, bool) {
|
||||||
|
let mut base_path = self.dir.clone();
|
||||||
|
|
||||||
|
for p in path.split('/') {
|
||||||
|
base_path = base_path.join(p);
|
||||||
|
}
|
||||||
|
|
||||||
|
let dir_content = read_dir(&base_path);
|
||||||
|
|
||||||
|
let mut ret = Vec::new();
|
||||||
|
|
||||||
|
let mut is_doc = false;
|
||||||
|
|
||||||
|
for entry in dir_content {
|
||||||
|
let url_path = format!("{path}/{entry}");
|
||||||
|
if entry.starts_with("index_") && entry.ends_with(".html") {
|
||||||
|
is_doc = true;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
ret.push(PathEntry(self.name.clone(), url_path));
|
||||||
|
}
|
||||||
|
|
||||||
|
(ret, is_doc)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Represents an entry within a domain's path, containing its name and URL path.
|
||||||
|
pub struct PathEntry(String, String);
|
||||||
|
|
||||||
|
impl PathEntry {
|
||||||
|
pub fn url(&self) -> String {
|
||||||
|
format!("/d/{}/{}", self.0, self.1)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn path(&self) -> &String {
|
||||||
|
&self.1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Represents a document within a domain
|
||||||
|
pub struct Document {
|
||||||
|
/// The domain associated with the document.
|
||||||
|
pub domain: String,
|
||||||
|
/// The path of the document within the domain.
|
||||||
|
pub path: String,
|
||||||
|
base_dir: PathBuf,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Document {
|
||||||
|
/// Creates a new `Document` instance.
|
||||||
|
///
|
||||||
|
/// # Parameters
|
||||||
|
/// - `domain`: The domain to which the document belongs.
|
||||||
|
/// - `path`: The path of the document within the domain.
|
||||||
|
/// - `base_dir`: The base directory of the archive storage.
|
||||||
|
///
|
||||||
|
/// # Returns
|
||||||
|
/// A new `Document` instance.
|
||||||
|
pub fn new(domain: &str, path: &str, base_dir: PathBuf) -> Self {
|
||||||
|
Self {
|
||||||
|
domain: domain.to_string(),
|
||||||
|
path: path
|
||||||
|
.split('/')
|
||||||
|
.filter(|x| !x.is_empty())
|
||||||
|
.collect::<Vec<&str>>()
|
||||||
|
.join("/"),
|
||||||
|
base_dir,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Renders the document, returning its content as a string.
|
||||||
|
///
|
||||||
|
/// If the environment variable `$ROUTE_INTERNAL` is set to `true`, all links will be rewritten to point to internal archived routes.
|
||||||
|
///
|
||||||
|
/// # Parameters
|
||||||
|
/// - `version`: An optional version of the document to render in the format `YYYY-MM-DD`.
|
||||||
|
///
|
||||||
|
/// # Returns
|
||||||
|
/// An `Option` containing the rendered content as a string, or `None` if nothing could be rendered.
|
||||||
|
pub async fn render_local(&self, version: Option<String>) -> Option<String> {
|
||||||
|
if check_blacklist(&self.domain) {
|
||||||
|
let content = html! {
|
||||||
|
h3 { "This site is blacklisted" };
|
||||||
|
};
|
||||||
|
return Some(render_page(content, RequestContext::default()).await.1 .1);
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut file_path = self.doc_dir();
|
||||||
|
|
||||||
|
let latest_version = if let Some(version) = version {
|
||||||
|
format!("index_{version}.html")
|
||||||
|
} else {
|
||||||
|
let versions = self.versions();
|
||||||
|
versions.first().cloned()?
|
||||||
|
};
|
||||||
|
|
||||||
|
file_path = file_path.join(latest_version);
|
||||||
|
|
||||||
|
let content = std::fs::read_to_string(file_path).ok()?;
|
||||||
|
|
||||||
|
if std::env::var("ROUTE_INTERNAL").unwrap_or("false".to_string()) == "true" {
|
||||||
|
Some(internalize_urls(&content))
|
||||||
|
} else {
|
||||||
|
Some(content)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Determines the directory where the document is stored.
|
||||||
|
///
|
||||||
|
/// # Returns
|
||||||
|
/// A `PathBuf` representing the document directory.
|
||||||
|
pub fn doc_dir(&self) -> PathBuf {
|
||||||
|
let mut file_path = self.base_dir.join(&self.domain);
|
||||||
|
|
||||||
|
for p in self.path.split('/') {
|
||||||
|
file_path = file_path.join(p);
|
||||||
|
}
|
||||||
|
|
||||||
|
file_path
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Retrieves available versions of the document.
|
||||||
|
///
|
||||||
|
/// # Returns
|
||||||
|
/// A vector of strings representing the available versions of the document, sorted in descending order.
|
||||||
|
pub fn versions(&self) -> Vec<String> {
|
||||||
|
let mut res: Vec<String> = read_dir(&self.doc_dir())
|
||||||
|
.into_iter()
|
||||||
|
.filter_map(|x| {
|
||||||
|
if x.starts_with("index_") && x.ends_with(".html") {
|
||||||
|
return Some(
|
||||||
|
x.trim_start_matches("index_")
|
||||||
|
.trim_end_matches(".html")
|
||||||
|
.to_string(),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
None
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
res.sort();
|
||||||
|
res.reverse();
|
||||||
|
res
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl WebsiteArchive {
|
||||||
|
/// Creates a new `WebsiteArchive` instance.
|
||||||
|
///
|
||||||
|
/// # Parameters
|
||||||
|
/// - `dir`: The directory path where the archive will be stored.
|
||||||
|
///
|
||||||
|
/// # Returns
|
||||||
|
/// A new `WebsiteArchive` instance.
|
||||||
|
pub fn new(dir: &str) -> Self {
|
||||||
|
Self {
|
||||||
|
dir: PathBuf::from(dir),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Retrieves the list of domain names stored in the archive.
|
||||||
|
///
|
||||||
|
/// # Returns
|
||||||
|
/// A vector of domain names as strings.
|
||||||
|
pub fn domains(&self) -> Vec<String> {
|
||||||
|
read_dir(&self.dir)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Retrieves a `Domain` instance for a specified domain name.
|
||||||
|
///
|
||||||
|
/// # Parameters
|
||||||
|
/// - `domain`: The name of the domain to retrieve.
|
||||||
|
///
|
||||||
|
/// # Returns
|
||||||
|
/// A `Domain` instance corresponding to the specified domain.
|
||||||
|
pub fn get_domain(&self, domain: &str) -> Domain {
|
||||||
|
Domain::new(domain, self.dir.join(domain))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Archives a URL by downloading and storing its content.
|
||||||
|
///
|
||||||
|
/// If the URL does not pass the blacklist check, it will not be archived.
|
||||||
|
///
|
||||||
|
/// # Parameters
|
||||||
|
/// - `url`: The URL to archive.
|
||||||
|
///
|
||||||
|
/// This function downloads the content of the URL, processes it, and saves it to the archive.
|
||||||
|
pub async fn archive_url(&self, url: &str) {
|
||||||
|
let parsed_url = url::Url::parse(url).unwrap();
|
||||||
|
|
||||||
|
let domain = parsed_url.domain().unwrap().trim_start_matches("www.");
|
||||||
|
|
||||||
|
// Deny blacklist
|
||||||
|
if check_blacklist(domain) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
let path = parsed_url.path();
|
||||||
|
|
||||||
|
let mut folder_name = self.dir.join(domain);
|
||||||
|
|
||||||
|
download_fav_for(domain).await;
|
||||||
|
|
||||||
|
for paths in path.split('/') {
|
||||||
|
if !paths.is_empty() {
|
||||||
|
folder_name = folder_name.join(paths);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::fs::create_dir_all(&folder_name).unwrap();
|
||||||
|
|
||||||
|
let timestamp = chrono::Utc::now().format("%Y-%m-%d").to_string();
|
||||||
|
let filename = folder_name.join(format!("index_{timestamp}.html"));
|
||||||
|
|
||||||
|
log::info!("Archiving {url} to {}", filename.to_str().unwrap());
|
||||||
|
|
||||||
|
run_command(&[
|
||||||
|
"monolith",
|
||||||
|
"-I",
|
||||||
|
"-o",
|
||||||
|
filename.to_str().unwrap(),
|
||||||
|
&format!("https://{}/{}", domain, path),
|
||||||
|
]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// full text search
|
||||||
|
// add new sites?
|
||||||
|
// transparent auto page downloading
|
||||||
|
// redownload after threshold
|
||||||
|
|
||||||
|
fn run_command(cmd: &[&str]) {
|
||||||
|
let mut cmd_setup = std::process::Command::new(cmd[0]);
|
||||||
|
let cmd_setup = cmd_setup
|
||||||
|
.args(cmd.iter().skip(1).collect::<Vec<_>>())
|
||||||
|
.stdout(std::process::Stdio::inherit())
|
||||||
|
.stderr(std::process::Stdio::inherit());
|
||||||
|
|
||||||
|
let child = cmd_setup.spawn().unwrap();
|
||||||
|
|
||||||
|
let status = child.wait_with_output().unwrap();
|
||||||
|
assert!(status.status.success());
|
||||||
|
}
|
|
@ -1,126 +0,0 @@
|
||||||
use std::{io::Read, path::PathBuf};
|
|
||||||
|
|
||||||
use based::request::RequestContext;
|
|
||||||
use maud::html;
|
|
||||||
|
|
||||||
use crate::{blacklist::check_blacklist, conf::get_config, render_page};
|
|
||||||
|
|
||||||
use super::{internalize_urls, read_dir};
|
|
||||||
|
|
||||||
/// Represents a document within a domain
|
|
||||||
pub struct Document {
|
|
||||||
/// The domain associated with the document.
|
|
||||||
pub domain: String,
|
|
||||||
/// The path of the document within the domain.
|
|
||||||
pub path: String,
|
|
||||||
base_dir: PathBuf,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Document {
|
|
||||||
/// Creates a new `Document` instance.
|
|
||||||
///
|
|
||||||
/// # Parameters
|
|
||||||
/// - `domain`: The domain to which the document belongs.
|
|
||||||
/// - `path`: The path of the document within the domain.
|
|
||||||
/// - `base_dir`: The base directory of the archive storage.
|
|
||||||
///
|
|
||||||
/// # Returns
|
|
||||||
/// A new `Document` instance.
|
|
||||||
pub fn new(domain: &str, path: &str, base_dir: PathBuf) -> Self {
|
|
||||||
let split = path
|
|
||||||
.split('/')
|
|
||||||
.filter(|x| !x.is_empty())
|
|
||||||
.collect::<Vec<&str>>();
|
|
||||||
|
|
||||||
Self {
|
|
||||||
domain: domain.to_string(),
|
|
||||||
path: if split.is_empty() {
|
|
||||||
"/".to_string()
|
|
||||||
} else {
|
|
||||||
split.join("/")
|
|
||||||
},
|
|
||||||
base_dir,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Renders the document, returning its content as a string.
|
|
||||||
///
|
|
||||||
/// If the environment variable `$ROUTE_INTERNAL` is set to `true`, all links will be rewritten to point to internal archived routes.
|
|
||||||
///
|
|
||||||
/// # Parameters
|
|
||||||
/// - `version`: An optional version of the document to render in the format `YYYY-MM-DD`.
|
|
||||||
///
|
|
||||||
/// # Returns
|
|
||||||
/// An `Option` containing the rendered content as a string, or `None` if nothing could be rendered.
|
|
||||||
pub async fn render_local(&self, version: Option<String>) -> Option<String> {
|
|
||||||
if check_blacklist(&self.domain) {
|
|
||||||
let content = html! {
|
|
||||||
h3 { "This site is blacklisted" };
|
|
||||||
};
|
|
||||||
return Some(render_page(content, RequestContext::default()).await.1 .1);
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut file_path = self.doc_dir();
|
|
||||||
|
|
||||||
let latest_version = if let Some(version) = version {
|
|
||||||
format!("index_{version}.html")
|
|
||||||
} else {
|
|
||||||
let versions = self.versions();
|
|
||||||
let version = versions.first().cloned()?;
|
|
||||||
format!("index_{version}.html")
|
|
||||||
};
|
|
||||||
|
|
||||||
file_path = file_path.join(latest_version);
|
|
||||||
|
|
||||||
let mut buf = Vec::new();
|
|
||||||
std::fs::File::open(file_path)
|
|
||||||
.ok()?
|
|
||||||
.read_to_end(&mut buf)
|
|
||||||
.unwrap();
|
|
||||||
let content = String::from_utf8_lossy(&buf);
|
|
||||||
|
|
||||||
if get_config().ROUTE_INTERNAL {
|
|
||||||
Some(internalize_urls(&content, &self.domain))
|
|
||||||
} else {
|
|
||||||
Some(content.to_string())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Determines the directory where the document is stored.
|
|
||||||
///
|
|
||||||
/// # Returns
|
|
||||||
/// A `PathBuf` representing the document directory.
|
|
||||||
pub fn doc_dir(&self) -> PathBuf {
|
|
||||||
let mut file_path = self.base_dir.join(&self.domain);
|
|
||||||
|
|
||||||
for p in self.path.split('/').filter(|x| !x.is_empty()) {
|
|
||||||
file_path = file_path.join(p);
|
|
||||||
}
|
|
||||||
|
|
||||||
file_path
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Retrieves available versions of the document.
|
|
||||||
///
|
|
||||||
/// # Returns
|
|
||||||
/// A vector of strings representing the available versions of the document, sorted in descending order.
|
|
||||||
pub fn versions(&self) -> Vec<String> {
|
|
||||||
let mut res: Vec<String> = read_dir(&self.doc_dir())
|
|
||||||
.into_iter()
|
|
||||||
.filter_map(|x| {
|
|
||||||
if x.starts_with("index_") && x.ends_with(".html") {
|
|
||||||
return Some(
|
|
||||||
x.trim_start_matches("index_")
|
|
||||||
.trim_end_matches(".html")
|
|
||||||
.to_string(),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
None
|
|
||||||
})
|
|
||||||
.collect();
|
|
||||||
res.sort();
|
|
||||||
res.reverse();
|
|
||||||
res
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,125 +0,0 @@
|
||||||
use std::path::PathBuf;
|
|
||||||
|
|
||||||
use based::result::LogAndIgnore;
|
|
||||||
|
|
||||||
use crate::blacklist::check_blacklist;
|
|
||||||
|
|
||||||
use super::{read_dir, Document};
|
|
||||||
|
|
||||||
/// Represents a domain within the website archive
|
|
||||||
pub struct Domain {
|
|
||||||
/// Domain name
|
|
||||||
pub name: String,
|
|
||||||
dir: PathBuf,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Domain {
|
|
||||||
/// Creates a new `Domain` instance.
|
|
||||||
///
|
|
||||||
/// If the domain name is not blacklisted, a directory is created.
|
|
||||||
///
|
|
||||||
/// # Parameters
|
|
||||||
/// - `name`: The name of the domain.
|
|
||||||
/// - `dir`: The directory path for the domain.
|
|
||||||
///
|
|
||||||
/// # Returns
|
|
||||||
/// A new `Domain` instance.
|
|
||||||
pub fn new(name: &str, dir: PathBuf) -> Self {
|
|
||||||
if !check_blacklist(name) {
|
|
||||||
std::fs::create_dir_all(&dir)
|
|
||||||
.log_err_and_ignore(&format!("Could not create domain dir {name}"));
|
|
||||||
}
|
|
||||||
Self {
|
|
||||||
name: name.to_string(),
|
|
||||||
dir,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Resolves a specific path within the domain and returns a `Document` representing it.
|
|
||||||
///
|
|
||||||
/// # Parameters
|
|
||||||
/// - `path`: The path to resolve within the domain.
|
|
||||||
///
|
|
||||||
/// # Returns
|
|
||||||
/// A `Document` instance corresponding to the given path.
|
|
||||||
pub fn path(&self, path: &str) -> Document {
|
|
||||||
Document::new(&self.name, path, self.dir.parent().unwrap().to_path_buf())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Get all paths associated with the domain
|
|
||||||
pub fn all_paths(&self) -> Vec<PathEntry> {
|
|
||||||
let mut queue = self.paths("/").0;
|
|
||||||
|
|
||||||
let mut ret = Vec::new();
|
|
||||||
|
|
||||||
ret.push(PathEntry(self.name.clone(), "/".to_string()));
|
|
||||||
|
|
||||||
while let Some(el) = queue.pop() {
|
|
||||||
ret.push(el.clone());
|
|
||||||
let paths = self.paths(&el.1).0;
|
|
||||||
queue.extend(paths);
|
|
||||||
}
|
|
||||||
|
|
||||||
ret
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Retrieves entries and metadata for a given path within the domain.
|
|
||||||
///
|
|
||||||
/// # Parameters
|
|
||||||
/// - `path`: The path to inspect.
|
|
||||||
///
|
|
||||||
/// # Returns
|
|
||||||
/// A tuple containing:
|
|
||||||
/// - A vector of `PathEntry` instances representing the contents of the path.
|
|
||||||
/// - A boolean indicating whether the path is itself a `Document`
|
|
||||||
pub fn paths(&self, path: &str) -> (Vec<PathEntry>, bool) {
|
|
||||||
let mut base_path = self.dir.clone();
|
|
||||||
|
|
||||||
for p in path.split('/') {
|
|
||||||
base_path = base_path.join(p);
|
|
||||||
}
|
|
||||||
|
|
||||||
let path = path
|
|
||||||
.split("/")
|
|
||||||
.filter(|x| !x.is_empty())
|
|
||||||
.collect::<Vec<&str>>()
|
|
||||||
.join("/");
|
|
||||||
|
|
||||||
let dir_content = read_dir(&base_path);
|
|
||||||
|
|
||||||
let mut ret = Vec::new();
|
|
||||||
|
|
||||||
let mut is_doc = false;
|
|
||||||
|
|
||||||
for entry in dir_content {
|
|
||||||
let url_path = format!("{path}/{entry}");
|
|
||||||
let url_path = url_path
|
|
||||||
.split("/")
|
|
||||||
.filter(|x| !x.is_empty())
|
|
||||||
.collect::<Vec<&str>>()
|
|
||||||
.join("/");
|
|
||||||
if entry.starts_with("index_") && entry.ends_with(".html") {
|
|
||||||
is_doc = true;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
ret.push(PathEntry(self.name.clone(), url_path));
|
|
||||||
}
|
|
||||||
|
|
||||||
(ret, is_doc)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Represents an entry within a domain's path, containing its name and URL path.
|
|
||||||
#[derive(Debug, Clone)]
|
|
||||||
pub struct PathEntry(String, String);
|
|
||||||
|
|
||||||
impl PathEntry {
|
|
||||||
pub fn url(&self) -> String {
|
|
||||||
format!("/d/{}/{}", self.0, self.1)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn path(&self) -> &String {
|
|
||||||
&self.1
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,232 +0,0 @@
|
||||||
use std::{collections::HashSet, path::PathBuf};
|
|
||||||
|
|
||||||
use crate::{
|
|
||||||
blacklist::{check_blacklist, check_blacklist_path},
|
|
||||||
conf::get_config,
|
|
||||||
favicon::download_fav_for,
|
|
||||||
};
|
|
||||||
|
|
||||||
mod document;
|
|
||||||
mod domain;
|
|
||||||
pub use document::Document;
|
|
||||||
pub use domain::*;
|
|
||||||
|
|
||||||
/// Read directory entries into `Vec<String>`
|
|
||||||
pub fn read_dir(dir: &PathBuf) -> Vec<String> {
|
|
||||||
let mut list = Vec::new();
|
|
||||||
|
|
||||||
if let Ok(entries) = std::fs::read_dir(dir) {
|
|
||||||
for entry in entries.flatten() {
|
|
||||||
if let Some(file_name) = entry.file_name().to_str() {
|
|
||||||
list.push(file_name.to_string());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
list
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Rewrite all URLs in `input` to the format `/s/<domain>/<path..>`
|
|
||||||
fn internalize_urls(input: &str, base: &str) -> String {
|
|
||||||
let url_pattern = r#"(\ |"|')(?:(<?)(https?:\/\/([a-zA-Z0-9.-]+))?(\/[\w./-]*))"#;
|
|
||||||
let re = regex::Regex::new(url_pattern).unwrap();
|
|
||||||
|
|
||||||
re.replace_all(input, |caps: ®ex::Captures| {
|
|
||||||
if caps.get(2).map(|x| x.as_str()).unwrap_or_default() == "<" {
|
|
||||||
return caps.get(0).unwrap().as_str().to_string();
|
|
||||||
}
|
|
||||||
|
|
||||||
if caps.get(0).unwrap().as_str() == " //" {
|
|
||||||
return " //".to_string();
|
|
||||||
}
|
|
||||||
|
|
||||||
let wrap = caps.get(1).map(|x| x.as_str()).unwrap_or_default();
|
|
||||||
|
|
||||||
if let Some(domain) = caps.get(3) {
|
|
||||||
let domain = domain.as_str();
|
|
||||||
let (protocol, domain) = if domain.starts_with("https://") {
|
|
||||||
("https", domain.trim_start_matches("https://"))
|
|
||||||
} else {
|
|
||||||
("http", domain.trim_start_matches("http://"))
|
|
||||||
};
|
|
||||||
|
|
||||||
let domain = domain.trim_start_matches("www.");
|
|
||||||
let path = caps.get(5).map_or("", |m| m.as_str());
|
|
||||||
|
|
||||||
// Skip transformation if the domain is in the blacklist
|
|
||||||
if check_blacklist(domain) {
|
|
||||||
format!("{wrap}{protocol}://{domain}{path}")
|
|
||||||
} else {
|
|
||||||
format!("{wrap}/s/{domain}{path}")
|
|
||||||
}
|
|
||||||
} else if let Some(path) = caps.get(5) {
|
|
||||||
// Handle relative paths
|
|
||||||
format!("{wrap}/s/{base}{}", path.as_str())
|
|
||||||
} else {
|
|
||||||
// Default fallback
|
|
||||||
caps[0].to_string()
|
|
||||||
}
|
|
||||||
})
|
|
||||||
.to_string()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Extract all domains
|
|
||||||
pub fn extract_domains(input: &str) -> Vec<String> {
|
|
||||||
let url_pattern = r"https?://([a-zA-Z0-9.-]+)(/[\w./-]*)?";
|
|
||||||
let re = regex::Regex::new(url_pattern).unwrap();
|
|
||||||
|
|
||||||
let mut domains = HashSet::new();
|
|
||||||
for caps in re.captures_iter(input) {
|
|
||||||
let domain = caps[1].trim_start_matches("www.");
|
|
||||||
domains.insert(domain.to_string());
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut domains: Vec<_> = domains.into_iter().collect();
|
|
||||||
domains.sort();
|
|
||||||
|
|
||||||
domains
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Represents a directory containg archived websites
|
|
||||||
#[derive(Debug, Clone)]
|
|
||||||
pub struct WebsiteArchive {
|
|
||||||
pub dir: PathBuf,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl WebsiteArchive {
|
|
||||||
/// Creates a new `WebsiteArchive` instance.
|
|
||||||
///
|
|
||||||
/// # Parameters
|
|
||||||
/// - `dir`: The directory path where the archive will be stored.
|
|
||||||
///
|
|
||||||
/// # Returns
|
|
||||||
/// A new `WebsiteArchive` instance.
|
|
||||||
pub fn new(dir: &str) -> Self {
|
|
||||||
Self {
|
|
||||||
dir: PathBuf::from(dir),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Retrieves the list of domain names stored in the archive.
|
|
||||||
///
|
|
||||||
/// # Returns
|
|
||||||
/// A vector of domain names as strings.
|
|
||||||
pub fn domains(&self) -> Vec<String> {
|
|
||||||
read_dir(&self.dir)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Retrieves a `Domain` instance for a specified domain name.
|
|
||||||
///
|
|
||||||
/// # Parameters
|
|
||||||
/// - `domain`: The name of the domain to retrieve.
|
|
||||||
///
|
|
||||||
/// # Returns
|
|
||||||
/// A `Domain` instance corresponding to the specified domain.
|
|
||||||
pub fn get_domain(&self, domain: &str) -> Domain {
|
|
||||||
Domain::new(domain, self.dir.join(domain))
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Archives a URL by downloading and storing its content.
|
|
||||||
///
|
|
||||||
/// If the URL does not pass the blacklist check, it will not be archived.
|
|
||||||
///
|
|
||||||
/// # Parameters
|
|
||||||
/// - `url`: The URL to archive.
|
|
||||||
///
|
|
||||||
/// This function downloads the content of the URL, processes it, and saves it to the archive.
|
|
||||||
pub async fn archive_url(&self, url: &str) {
|
|
||||||
let parsed_url = url::Url::parse(url).unwrap();
|
|
||||||
|
|
||||||
let domain = parsed_url.domain().unwrap().trim_start_matches("www.");
|
|
||||||
|
|
||||||
// Deny blacklist
|
|
||||||
if check_blacklist(domain) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
let path = parsed_url.path();
|
|
||||||
|
|
||||||
if check_blacklist_path(domain, path) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut folder_name = self.dir.join(domain);
|
|
||||||
|
|
||||||
download_fav_for(domain).await;
|
|
||||||
|
|
||||||
for paths in path.split('/') {
|
|
||||||
let paths = url_escape::decode(paths).to_string();
|
|
||||||
if !paths.is_empty() {
|
|
||||||
folder_name = folder_name.join(paths);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
std::fs::create_dir_all(&folder_name).unwrap();
|
|
||||||
|
|
||||||
let timestamp = chrono::Utc::now().format("%Y-%m-%d").to_string();
|
|
||||||
let filename = folder_name.join(format!("index_{timestamp}.html"));
|
|
||||||
|
|
||||||
log::info!("Archiving {url} to {}", filename.to_str().unwrap());
|
|
||||||
|
|
||||||
let conf = get_config()
|
|
||||||
.get_domain_config(domain)
|
|
||||||
.cloned()
|
|
||||||
.unwrap_or_default();
|
|
||||||
|
|
||||||
let mut cmd = vec!["monolith", "--isolate", "-o", filename.to_str().unwrap()];
|
|
||||||
|
|
||||||
if conf.no_audio.unwrap_or_default() {
|
|
||||||
cmd.push("--no-audio");
|
|
||||||
}
|
|
||||||
|
|
||||||
if conf.no_css.unwrap_or_default() {
|
|
||||||
cmd.push("--no-css");
|
|
||||||
}
|
|
||||||
|
|
||||||
if conf.no_frames.unwrap_or_default() {
|
|
||||||
cmd.push("--no-frames");
|
|
||||||
}
|
|
||||||
|
|
||||||
if conf.no_fonts.unwrap_or_default() {
|
|
||||||
cmd.push("--no-frames");
|
|
||||||
}
|
|
||||||
|
|
||||||
if conf.no_image.unwrap_or_default() {
|
|
||||||
cmd.push("--no-images");
|
|
||||||
}
|
|
||||||
|
|
||||||
if conf.no_javascript.unwrap_or_default() {
|
|
||||||
cmd.push("--no-js");
|
|
||||||
cmd.push("--unwrap-noscript");
|
|
||||||
}
|
|
||||||
|
|
||||||
if conf.no_video.unwrap_or_default() {
|
|
||||||
cmd.push("--no-video");
|
|
||||||
}
|
|
||||||
|
|
||||||
if let Some(ua) = &conf.user_agent {
|
|
||||||
cmd.push("--user-agent");
|
|
||||||
cmd.push(ua.as_str());
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut url = url::Url::parse(&format!("https://{domain}")).unwrap();
|
|
||||||
url = url.join(path).unwrap();
|
|
||||||
let url = url.to_string();
|
|
||||||
cmd.push(&url);
|
|
||||||
|
|
||||||
run_command(&cmd);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn run_command(cmd: &[&str]) {
|
|
||||||
let mut cmd_setup = std::process::Command::new(cmd[0]);
|
|
||||||
let cmd_setup = cmd_setup
|
|
||||||
.args(cmd.iter().skip(1).collect::<Vec<_>>())
|
|
||||||
.stdout(std::process::Stdio::inherit())
|
|
||||||
.stderr(std::process::Stdio::inherit());
|
|
||||||
|
|
||||||
let child = cmd_setup.spawn().unwrap();
|
|
||||||
|
|
||||||
let status = child.wait_with_output().unwrap();
|
|
||||||
assert!(status.status.success());
|
|
||||||
}
|
|
61
src/args.rs
61
src/args.rs
|
@ -1,61 +0,0 @@
|
||||||
use clap::{arg, command};
|
|
||||||
|
|
||||||
pub fn get_args() -> clap::ArgMatches {
|
|
||||||
command!()
|
|
||||||
.about("Web Archive")
|
|
||||||
.arg(
|
|
||||||
arg!(-d --dir <dir> "Web archive directory")
|
|
||||||
.required(false)
|
|
||||||
.default_value("./websites"),
|
|
||||||
)
|
|
||||||
.subcommand(
|
|
||||||
command!()
|
|
||||||
.name("serve")
|
|
||||||
.about("Start web archive server")
|
|
||||||
.arg(
|
|
||||||
arg!(-c --config <config> "Web archive config file")
|
|
||||||
.required(false)
|
|
||||||
.default_value("./config.toml"),
|
|
||||||
),
|
|
||||||
)
|
|
||||||
.subcommand(
|
|
||||||
command!()
|
|
||||||
.name("archive")
|
|
||||||
.about("Work with web archives")
|
|
||||||
.subcommand(
|
|
||||||
command!()
|
|
||||||
.name("download")
|
|
||||||
.about("Download a new URL into the archive")
|
|
||||||
.arg(
|
|
||||||
arg!(-c --config <config> "Web archive config file")
|
|
||||||
.required(false)
|
|
||||||
.default_value("./config.toml"),
|
|
||||||
)
|
|
||||||
.arg(arg!([URL] "The URL to download").required(true))
|
|
||||||
)
|
|
||||||
.subcommand(
|
|
||||||
command!()
|
|
||||||
.name("list")
|
|
||||||
.about("List domains contained in the archive. If a domain is provided all paths of this domain will be listed.")
|
|
||||||
.arg(arg!([DOMAIN] "A domain to list").required(false))
|
|
||||||
.arg(arg!(-j --json "Ouput JSON").required(false)),
|
|
||||||
)
|
|
||||||
.subcommand(
|
|
||||||
command!()
|
|
||||||
.name("versions")
|
|
||||||
.about("List saved versions of a document")
|
|
||||||
.arg(arg!(-j --json "Ouput JSON").required(false))
|
|
||||||
.arg(arg!([DOMAIN] "A domain").required(true))
|
|
||||||
.arg(arg!([PATH] "A path").required(false))
|
|
||||||
)
|
|
||||||
.subcommand(
|
|
||||||
command!()
|
|
||||||
.name("get")
|
|
||||||
.about("Get a saved document")
|
|
||||||
.arg(arg!(--md "Ouput Markdown").required(false))
|
|
||||||
.arg(arg!([DOMAIN] "A domain").required(true))
|
|
||||||
.arg(arg!([PATH] "A path").required(false))
|
|
||||||
.arg(arg!([VERSION] "A version").required(false))
|
|
||||||
))
|
|
||||||
.get_matches()
|
|
||||||
}
|
|
|
@ -1,43 +1,19 @@
|
||||||
use crate::conf::get_config;
|
|
||||||
|
|
||||||
/// Checks if a domain is present in the blacklist of unwanted domains.
|
/// Checks if a domain is present in the blacklist of unwanted domains.
|
||||||
///
|
///
|
||||||
|
/// This function checks the `$BLACKLIST_DOMAINS` environment variable for a comma-separated list of regular expressions to match against.
|
||||||
/// If a match is found, it immediately returns `true`. Otherwise, it returns `false`.
|
/// If a match is found, it immediately returns `true`. Otherwise, it returns `false`.
|
||||||
pub fn check_blacklist(domain: &str) -> bool {
|
pub fn check_blacklist(domain: &str) -> bool {
|
||||||
let conf = get_config();
|
let blacklist_raw = std::env::var("BLACKLIST_DOMAINS").unwrap_or_default();
|
||||||
let conf = conf.websites.as_ref();
|
|
||||||
|
|
||||||
// TODO : Block IPs
|
if blacklist_raw.is_empty() {
|
||||||
// TODO : Test SSRF
|
return false;
|
||||||
|
|
||||||
let blacklisted_domains = conf
|
|
||||||
.map(|x| x.BLACKLIST_DOMAINS.as_ref())
|
|
||||||
.unwrap_or_default();
|
|
||||||
|
|
||||||
check_regex(domain, blacklisted_domains.unwrap_or(&Vec::new()))
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn check_blacklist_path(domain: &str, path: &str) -> bool {
|
|
||||||
let conf = get_config();
|
|
||||||
let conf = conf.websites.as_ref();
|
|
||||||
|
|
||||||
if let Some(website) = conf {
|
|
||||||
let empty = Vec::new();
|
|
||||||
let domain_conf = website.domains.as_ref().unwrap_or(&empty);
|
|
||||||
if let Some(domain_conf) = domain_conf.iter().find(|x| x.domain == domain) {
|
|
||||||
let empty = Vec::new();
|
|
||||||
let blacklist = domain_conf.blacklist_paths.as_ref().unwrap_or(&empty);
|
|
||||||
return check_regex(path, blacklist);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
false
|
let blacklist: Vec<&str> = blacklist_raw.split(',').collect();
|
||||||
}
|
|
||||||
|
|
||||||
pub fn check_regex(input: &str, regexes: &Vec<String>) -> bool {
|
for domain_regex in blacklist {
|
||||||
for regex in regexes {
|
let rgx = regex::Regex::new(domain_regex).unwrap();
|
||||||
let rgx = regex::Regex::new(regex).unwrap();
|
if rgx.is_match(domain) {
|
||||||
if rgx.is_match(input) {
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
107
src/conf.rs
@ -1,107 +0,0 @@
-use std::sync::Arc;
-
-use serde::Deserialize;
-use tokio::sync::OnceCell;
-
-pub static CONFIG: OnceCell<Arc<Config>> = OnceCell::const_new();
-
-/// Get a reference to global config
-pub fn get_config() -> &'static Arc<Config> {
-    crate::conf::CONFIG.get().unwrap()
-}
-
-/// Load a global config
-pub fn load_config(path: &str) {
-    // TODO : Other load locations
-    if let Ok(file_content) = std::fs::read_to_string(path) {
-        let conf: Config =
-            toml::from_str(&file_content).expect("Could not deserialize config file");
-        crate::conf::CONFIG.set(std::sync::Arc::new(conf)).unwrap();
-    }
-}
-
-/// Load a default global config
-pub fn load_default_config() {
-    if crate::conf::CONFIG.get().is_none() {
-        crate::conf::CONFIG
-            .set(std::sync::Arc::new(Config::default()))
-            .unwrap();
-    }
-}
-
-#[allow(non_snake_case)]
-#[derive(Debug, Clone, Deserialize)]
-pub struct Config {
-    pub ROUTE_INTERNAL: bool,
-    pub DOWNLOAD_ON_DEMAND: bool,
-    pub ai: Option<AIConfig>,
-    pub websites: Option<WebsiteConfig>,
-}
-
-impl Config {
-    pub fn get_domain_config(&self, domain: &str) -> Option<&DomainConfig> {
-        if let Some(websites) = &self.websites {
-            if let Some(domains) = &websites.domains {
-                let domain = domains.iter().find(|x| x.domain == domain);
-                return domain;
-            }
-        }
-
-        None
-    }
-}
-
-#[allow(non_snake_case)]
-#[derive(Debug, Clone, Deserialize)]
-pub struct AIConfig {
-    pub OLLAMA_URL: String,
-}
-
-#[allow(non_snake_case)]
-#[derive(Debug, Clone, Deserialize)]
-pub struct WebsiteConfig {
-    pub BLACKLIST_DOMAINS: Option<Vec<String>>,
-    pub domains: Option<Vec<DomainConfig>>,
-}
-
-#[derive(Debug, Clone, Deserialize)]
-pub struct DomainConfig {
-    pub domain: String,
-    pub blacklist_paths: Option<Vec<String>>,
-    pub no_audio: Option<bool>,
-    pub no_video: Option<bool>,
-    pub no_image: Option<bool>,
-    pub no_css: Option<bool>,
-    pub no_javascript: Option<bool>,
-    pub no_fonts: Option<bool>,
-    pub no_frames: Option<bool>,
-    pub user_agent: Option<String>,
-}
-
-impl Default for DomainConfig {
-    fn default() -> Self {
-        Self {
-            domain: String::new(),
-            blacklist_paths: None,
-            no_audio: Some(false),
-            no_video: Some(false),
-            no_image: Some(false),
-            no_css: Some(false),
-            no_javascript: Some(false),
-            no_fonts: Some(false),
-            no_frames: Some(false),
-            user_agent: None,
-        }
-    }
-}
-
-impl Default for Config {
-    fn default() -> Self {
-        Self {
-            ROUTE_INTERNAL: false,
-            DOWNLOAD_ON_DEMAND: false,
-            ai: None,
-            websites: None,
-        }
-    }
-}
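Since this file disappears on the ollama branch, the shape of the TOML it accepted is only implied by the structs above. A sketch of a config.toml that would deserialize into `Config` — every value here is invented for illustration:

    // Sketch only: field names follow the Deserialize derives above, values are made up.
    let toml_src = r#"
        ROUTE_INTERNAL = false
        DOWNLOAD_ON_DEMAND = true

        [ai]
        OLLAMA_URL = "http://127.0.0.1:11434"

        [websites]
        BLACKLIST_DOMAINS = ["ads\\..*"]

        [[websites.domains]]
        domain = "example.com"
        blacklist_paths = ["^/admin"]
        no_javascript = true
    "#;
    let conf: Config = toml::from_str(toml_src).expect("Could not deserialize config file");
    assert_eq!(conf.get_domain_config("example.com").unwrap().no_javascript, Some(true));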
29
src/lib.rs
@ -1,29 +0,0 @@
-use based::{
-    page::Shell,
-    request::{RequestContext, StringResponse},
-};
-use maud::{html, PreEscaped};
-
-pub mod ai;
-pub mod archive;
-pub mod blacklist;
-pub mod conf;
-pub mod favicon;
-
-pub async fn render_page(content: PreEscaped<String>, ctx: RequestContext) -> StringResponse {
-    based::page::render_page(
-        content,
-        "Website Archive",
-        ctx,
-        &Shell::new(
-            html! {
-                script src="https://cdn.tailwindcss.com" {};
-                meta name="viewport" content="width=device-width, initial-scale=1.0" {};
-                script src="/assets/htmx.min.js" {};
-            },
-            html! {},
-            Some("bg-zinc-950 text-white min-h-screen flex pt-8 justify-center".to_string()),
-        ),
-    )
-    .await
-}
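On the main branch this shared wrapper is what every route in src/pages/mod.rs calls (see `render_page(content, ctx).await` further down). A minimal sketch of a handler built on it — the route path and page body are invented; only the signature pattern comes from the diff:

    // Hypothetical route, for illustration: wraps a maud fragment in the Shell above.
    #[rocket::get("/about")]
    pub async fn about_route(ctx: RequestContext) -> StringResponse {
        render_page(html! { h1 class="text-xl" { "About this archive" } }, ctx).await
    }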
152
src/main.rs
@ -1,42 +1,33 @@
+use ai::EmbedStore;
+use archive::WebsiteArchive;
 use based::get_pg;
 use rocket::routes;
-use webarc::ai::EmbedStore;
-use webarc::archive::WebsiteArchive;
-use webarc::conf::{get_config, load_config, load_default_config};
 
-mod args;
+mod ai;
+mod archive;
+mod blacklist;
+mod favicon;
 mod pages;
 
-#[tokio::main]
-async fn main() {
+#[rocket::launch]
+async fn launch() -> _ {
     env_logger::init();
 
-    let args = args::get_args();
-
-    let archive_dir: &String = args.get_one("dir").unwrap();
-
-    match args.subcommand() {
-        Some(("serve", serve_args)) => {
-            let config: &String = serve_args.get_one("config").unwrap();
-            load_config(config);
-
-            let arc = WebsiteArchive::new(archive_dir);
+    let arc = WebsiteArchive::new("./websites");
 
     if std::env::var("DATABASE_URL").is_ok() {
         let pg = get_pg!();
         sqlx::migrate!("./migrations").run(pg).await.unwrap();
     }
 
-            let archive = arc.clone();
-            if get_config().ai.is_some() {
-                tokio::spawn(async move {
-                    EmbedStore::generate_embeddings_for(&archive).await;
-                });
+    if std::env::var("OLLAMA_URL").is_ok() {
+        EmbedStore::generate_embeddings_for(&arc).await;
     }
 
     let archive = arc.clone();
 
     tokio::spawn(async move {
-        webarc::favicon::download_favicons_for_sites(&archive.domains()).await;
+        favicon::download_favicons_for_sites(&archive.domains()).await;
     });
 
     rocket::build()

@ -48,123 +39,8 @@ async fn main() {
                pages::render_website,
                pages::domain_info_route,
                pages::favicon_route,
-                pages::vector_search,
-                pages::render_txt_website
+                pages::vector_search
            ],
        )
        .manage(arc)
-        .launch()
-        .await
-        .unwrap();
-        }
-        Some(("archive", archive_args)) => {
-            let arc = WebsiteArchive::new(archive_dir);
-
-            match archive_args.subcommand() {
-                Some(("list", list_args)) => {
-                    let json = list_args.get_flag("json");
-
-                    load_default_config();
-
-                    let elements = if let Some(domain) = list_args.get_one::<String>("DOMAIN") {
-                        arc.get_domain(domain)
-                            .all_paths()
-                            .into_iter()
-                            .map(|x| x.path().clone())
-                            .collect()
-                    } else {
-                        arc.domains()
-                    };
-
-                    if json {
-                        println!(
-                            "{}",
-                            serde_json::to_string(&serde_json::json!(elements)).unwrap()
-                        );
-                    } else {
-                        if let Some(domain) = list_args.get_one::<String>("DOMAIN") {
-                            println!("Paths in {domain}:");
-                        } else {
-                            println!("Domains in {}:", archive_dir);
-                        }
-
-                        if elements.is_empty() {
-                            println!("No domains");
-                        }
-
-                        for d in elements {
-                            println!("- {d}");
-                        }
-                    }
-                }
-                Some(("download", dl_args)) => {
-                    let url: &String = dl_args.get_one("URL").unwrap();
-
-                    let config: &String = dl_args.get_one("config").unwrap();
-                    load_config(config);
-
-                    arc.archive_url(url).await;
-                    println!("Saved {url} to archive");
-                }
-                Some(("versions", ver_args)) => {
-                    load_default_config();
-
-                    let domain: &String = ver_args.get_one("DOMAIN").unwrap();
-                    let path: String = if let Some(path) = ver_args.get_one::<String>("PATH") {
-                        path.clone()
-                    } else {
-                        "/".to_string()
-                    };
-                    let versions = arc.get_domain(domain).path(&path).versions();
-
-                    let json = ver_args.get_flag("json");
-
-                    if json {
-                        println!("{}", serde_json::to_string(&versions).unwrap());
-                    } else {
-                        println!("Versions for {domain} / {path}:");
-                        for v in versions {
-                            println!("- {v}");
-                        }
-                    }
-                }
-                Some(("get", get_args)) => {
-                    load_default_config();
-
-                    let domain: &String = get_args.get_one("DOMAIN").unwrap();
-                    let path = if let Some(path) = get_args.get_one::<String>("PATH") {
-                        path.clone()
-                    } else {
-                        "/".to_string()
-                    };
-                    let doc = arc.get_domain(domain).path(&path);
-                    let ver = if let Some(ver) = get_args.get_one::<String>("VERSION") {
-                        ver.clone()
-                    } else {
-                        doc.versions().first().unwrap().clone()
-                    };
-
-                    let md = get_args.get_flag("md");
-
-                    let content = doc.render_local(Some(ver)).await;
-
-                    if content.is_none() {
-                        println!("No document found");
-                        std::process::exit(1);
-                    }
-
-                    if md {
-                        let markdown = html2md::parse_html(&content.unwrap());
-                        println!("{markdown}");
-                    } else {
-                        println!("{}", content.unwrap());
-                    }
-                }
-                Some((&_, _)) => {}
-                None => {}
-            };
-        }
-        Some((&_, _)) => {}
-        None => {}
-    }
 }
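The net effect of this file's diff: main drives everything through clap subcommands plus config.toml, while ollama reads its few switches straight from the environment. The variables the ollama-branch code consults, gathered in one place (presence checks as in the diff; the comments and bindings here are only illustrative):

    // Environment surface of the ollama branch, as read above and in
    // src/blacklist.rs / src/pages/mod.rs:
    let run_migrations   = std::env::var("DATABASE_URL").is_ok();     // run sqlx migrations
    let build_embeddings = std::env::var("OLLAMA_URL").is_ok();       // generate EmbedStore embeddings
    let blacklist        = std::env::var("BLACKLIST_DOMAINS").ok();   // comma-separated regexes
    let on_demand        = std::env::var("DOWNLOAD_ON_DEMAND").ok();  // "true" enables archive-on-miss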
src/pages/component.rs
@ -1,3 +1,7 @@
+use based::{
+    page::Shell,
+    request::{RequestContext, StringResponse},
+};
 use maud::{html, PreEscaped};
 
 /// Generates an SVG arrow icon with the specified color.

@ -74,8 +78,20 @@ pub fn gen_path_header(
     }
 }
 
-pub fn favicon(site: &str) -> PreEscaped<String> {
-    html! {
-        img class="h-8 w-8 m-2" src=(format!("/favicon/{site}")) {};
-    }
-}
+pub async fn render_page(content: PreEscaped<String>, ctx: RequestContext) -> StringResponse {
+    based::page::render_page(
+        content,
+        "Website Archive",
+        ctx,
+        &Shell::new(
+            html! {
+                script src="https://cdn.tailwindcss.com" {};
+                meta name="viewport" content="width=device-width, initial-scale=1.0" {};
+                script src="/assets/htmx.min.js" {};
+            },
+            html! {},
+            Some("bg-zinc-950 text-white min-h-screen flex pt-8 justify-center".to_string()),
+        ),
+    )
+    .await
+}
113
src/pages/mod.rs
@ -7,20 +7,18 @@ use based::{
     },
 };
 use maud::{html, PreEscaped};
-use rocket::{get, request::FromSegments, State};
+use rocket::{get, State};
 
 pub mod component;
 use component::*;
 use serde_json::json;
 
-use webarc::{
-    ai::{generate_embedding, remove_data_urls, EmbedStore, SearchResult},
-    archive::{extract_domains, WebsiteArchive},
-    conf::get_config,
-    render_page,
+use crate::{
+    ai::{generate_embedding, DocEmbedding, EmbedStore},
+    archive::WebsiteArchive,
 };
 
-const SEARCH_BAR_STYLE: &str = "w-full px-4 mb-4 py-2 text-white bg-black border-2 border-neon-blue placeholder-neon-blue focus:ring-2 focus:ring-neon-pink focus:outline-none font-mono text-lg";
+const SEARCH_BAR_STYLE: &'static str = "w-full px-4 mb-4 py-2 text-white bg-black border-2 border-neon-blue placeholder-neon-blue focus:ring-2 focus:ring-neon-pink focus:outline-none font-mono text-lg";
 
 /// Get the favicon of a domain
 #[get("/favicon/<domain>")]

@ -31,8 +29,6 @@ pub async fn favicon_route(domain: &str) -> Option<DataResponse> {
         .read_to_end(&mut buf)
         .ok()?;
 
-    // TODO : Default favicon
-
     Some(DataResponse::new(
         buf,
         "image/x-icon".to_string(),

@ -61,7 +57,7 @@ pub async fn index(ctx: RequestContext, arc: &State<WebsiteArchive>) -> StringRe
             @for site in websites {
                 a href=(format!("/d/{site}")) class="bg-neutral-900 shadow-md rounded-lg hover:bg-neutral-800 bg-gray-1 hover:cursor-pointer transition-all duration-300 flex flex-col items-center justify-center aspect-square max-w-60" {
                     div class="bg-blue-500 text-white rounded-full p-4" {
-                        (favicon(&site))
+                        img class="h-8 w-8" src=(format!("/favicon/{site}")) {};
                     };
                     p class="mt-4 text-base font-medium" { (site) };
                 };

@ -88,7 +84,7 @@ pub async fn domain_info_route(
     let (path_entries, is_doc) = domain.paths(paths.to_str().unwrap());
     let path_seperations: Vec<&str> = paths.to_str().unwrap().split('/').collect();
 
-    let domains = extract_domains(&document.render_local(None).await.unwrap_or_default());
+    // TODO : Show domains being linked on the page
 
     let content = html! {
         h2 class="text-xl font-bold mb-4 flex items-center" {

@ -134,89 +130,20 @@ pub async fn domain_info_route(
             };
         };
     };
 
-        @if !domains.is_empty() {
-            div class="max-w-md mx-auto p-4 bg-neutral-900 rounded-lg shadow-md" {
-                h3 class="font-bold mb-2" { "Domains linked on this page:" };
-                ul class="space-y-2 p-4" {
-                    @for domain in domains {
-                        a href=(format!("/d/{domain}")) class="flex items-center gap-2 p-3 border bg-neutral-800 rounded hover:shadow-lg transition" {
-                            (favicon(&domain));
-                            span class="font-medium" { (domain) };
-                        };
-                    };
-                };
-            };
-        };
     };
 
     render_page(content, ctx).await
 }
 
-#[get("/txt/<domain>/<path..>?<time>&<no_data_urls>")]
-pub async fn render_txt_website(
-    domain: &str,
-    path: PathBuf,
-    time: Option<&str>,
-    no_data_urls: Option<&str>,
-    arc: &State<WebsiteArchive>,
-) -> Option<String> {
-    let document = arc.get_domain(domain).path(path.to_str().unwrap());
-
-    let mut content = document
-        .render_local(time.map(|time| time.to_string()))
-        .await?;
-
-    if no_data_urls.is_some() {
-        content = remove_data_urls(&content);
-    }
-
-    Some(html2md::parse_html(&content))
-}
-
-pub struct PathSegment {
-    segments: Vec<String>,
-}
-
-impl PathSegment {
-    pub fn to_str(&self) -> String {
-        self.segments.join("/")
-    }
-}
-
-impl<'r> FromSegments<'r> for PathSegment {
-    type Error = ();
-
-    fn from_segments(
-        segments: rocket::http::uri::Segments<'r, rocket::http::uri::fmt::Path>,
-    ) -> Result<Self, Self::Error> {
-        let paths: Vec<_> = segments
-            .filter_map(|x| {
-                if x == "." {
-                    return None;
-                }
-
-                if x == ".." {
-                    return None;
-                }
-
-                Some(x.to_string())
-            })
-            .collect();
-
-        Ok(PathSegment { segments: paths })
-    }
-}
-
 /// Return archived version of `domain` / `path` at `time`
 #[get("/s/<domain>/<path..>?<time>")]
 pub async fn render_website(
     domain: &str,
-    path: PathSegment,
+    path: PathBuf,
     time: Option<&str>,
     arc: &State<WebsiteArchive>,
 ) -> Option<DataResponse> {
-    let document = arc.get_domain(domain).path(&path.to_str());
+    let document = arc.get_domain(domain).path(path.to_str().unwrap());
 
     let content = document
         .render_local(time.map(|time| time.to_string()))

@ -228,8 +155,13 @@ pub async fn render_website(
             "text/html".to_string(),
             Some(60 * 60 * 24),
         ));
-    } else if get_config().DOWNLOAD_ON_DEMAND && time.is_none() {
-        arc.archive_url(&format!("https://{domain}/{}", path.to_str()))
+    } else if std::env::var("DOWNLOAD_ON_DEMAND")
+        .unwrap_or("false".to_string())
+        .as_str()
+        == "true"
+        && time.is_none()
+    {
+        arc.archive_url(&format!("https://{domain}/{}", path.to_str().unwrap()))
             .await;
 
         let content = document.render_local(None).await?;

@ -244,17 +176,17 @@ pub async fn render_website(
     None
 }
 
-pub fn gen_search_element(x: &SearchResult) -> PreEscaped<String> {
+pub fn gen_search_element(x: &DocEmbedding) -> PreEscaped<String> {
     html! {
         div class="text-xl font-bold mt-4 p-4 flex items-center w-full max-w-4xl max-h-40 mx-auto bg-neutral-800 shadow-md rounded-lg overflow-hidden border border-neutral-900 hover:cursor-pointer"
             hx-get=(format!("/d/{}/{}", x.domain, x.path))
             hx-target="#main_content" hx-push-url="true" hx-swap="innerHTML"
         {
-            (favicon(&x.domain))
+            img class="p-2" src=(format!("/favicon/{}", &x.domain));
             a { (x.domain) };
             (slash_seperator());
             (gen_path_header(x.path.split('/').collect(), &x.domain, false));
-            p class="font-bold p-2 text-stone-400" { (format!("{:.2} % [{} matches]", x.similarity() * 100.0, x.chunks.len())) };
+            p class="font-bold p-2 text-stone-400" { (format!("{:.2} %", x.similarity * 100.0)) };
         };
     }
 }

@ -265,7 +197,9 @@ pub async fn vector_search(
     page: Option<i64>,
     ctx: RequestContext,
 ) -> Option<StringResponse> {
-    get_config().ai.as_ref()?;
+    if std::env::var("OLLAMA_URL").is_err() {
+        return None;
+    }
 
     let page = page.unwrap_or(1);
 

@ -286,14 +220,13 @@ pub async fn vector_search(
                 EmbedStore::search_vector(&input, limit as i64, offset as i64).await
             })
         },
-        1500,
+        5,
     )
     .pager(page as u64, vector)
     .await;
 
     // API Route
     if query.ends_with(".json") {
-        // TODO : Better search API
         return Some(respond_json(&json!(&results.page(page as u64))));
     }
 
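One pattern worth calling out from the hunks above: main reads the typed `DOWNLOAD_ON_DEMAND` flag from its config, while ollama string-compares the raw environment variable. If the env-var approach is kept, a tiny helper along these lines (hypothetical, not present in either branch) keeps the route readable:

    /// Hypothetical helper: interpret an environment variable as a boolean switch.
    fn env_flag(name: &str) -> bool {
        std::env::var(name)
            .map(|v| matches!(v.trim().to_ascii_lowercase().as_str(), "1" | "true" | "yes"))
            .unwrap_or(false)
    }

    // In render_website this would replace the unwrap_or/as_str chain:
    // } else if env_flag("DOWNLOAD_ON_DEMAND") && time.is_none() {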