Compare commits


No commits in common. "main" and "ollama" have entirely different histories.
main ... ollama

20 changed files with 476 additions and 1383 deletions

Cargo.lock generated

@ -59,55 +59,6 @@ dependencies = [
"libc",
]
[[package]]
name = "anstream"
version = "0.6.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b"
dependencies = [
"anstyle",
"anstyle-parse",
"anstyle-query",
"anstyle-wincon",
"colorchoice",
"is_terminal_polyfill",
"utf8parse",
]
[[package]]
name = "anstyle"
version = "1.0.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9"
[[package]]
name = "anstyle-parse"
version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9"
dependencies = [
"utf8parse",
]
[[package]]
name = "anstyle-query"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c"
dependencies = [
"windows-sys 0.59.0",
]
[[package]]
name = "anstyle-wincon"
version = "3.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125"
dependencies = [
"anstyle",
"windows-sys 0.59.0",
]
[[package]]
name = "async-stream"
version = "0.3.6"
@ -362,52 +313,6 @@ dependencies = [
"inout",
]
[[package]]
name = "clap"
version = "4.5.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3135e7ec2ef7b10c6ed8950f0f792ed96ee093fa088608f1c76e569722700c84"
dependencies = [
"clap_builder",
"clap_derive",
]
[[package]]
name = "clap_builder"
version = "4.5.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "30582fc632330df2bd26877bde0c1f4470d57c582bbc070376afcd04d8cb4838"
dependencies = [
"anstream",
"anstyle",
"clap_lex",
"strsim",
]
[[package]]
name = "clap_derive"
version = "4.5.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab"
dependencies = [
"heck",
"proc-macro2",
"quote",
"syn 2.0.93",
]
[[package]]
name = "clap_lex"
version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6"
[[package]]
name = "colorchoice"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990"
[[package]]
name = "combine"
version = "4.6.7"
@ -1446,12 +1351,6 @@ dependencies = [
"windows-sys 0.52.0",
]
[[package]]
name = "is_terminal_polyfill"
version = "1.70.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
[[package]]
name = "itoa"
version = "1.0.14"
@ -3043,12 +2942,6 @@ dependencies = [
"unicode-properties",
]
[[package]]
name = "strsim"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
[[package]]
name = "subtle"
version = "2.6.1"
@ -3569,15 +3462,6 @@ dependencies = [
"percent-encoding",
]
[[package]]
name = "url-escape"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "44e0ce4d1246d075ca5abec4b41d33e87a6054d08e2366b63205665e950db218"
dependencies = [
"percent-encoding",
]
[[package]]
name = "utf-8"
version = "0.7.6"
@ -3596,12 +3480,6 @@ version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
[[package]]
name = "utf8parse"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
[[package]]
name = "uuid"
version = "1.11.0"
@ -3744,7 +3622,6 @@ version = "0.1.0"
dependencies = [
"based",
"chrono",
"clap",
"env_logger",
"futures",
"html2md",
@ -3759,9 +3636,7 @@ dependencies = [
"serde_json",
"sqlx",
"tokio",
"toml",
"url",
"url-escape",
"uuid",
]


@ -22,6 +22,3 @@ reqwest = "0.12.11"
ollama-rs = "0.2.2"
pgvector = { version = "0.4", features = ["sqlx"] }
html2md = "0.2.14"
clap = { version = "4.5.23", features = ["cargo", "derive"] }
toml = "0.8.19"
url-escape = "0.1.1"


@ -1,13 +1,13 @@
FROM rust:buster AS builder
RUN rustup default nightly
RUN git clone "https://github.com/Y2Z/monolith" /monolith
WORKDIR /monolith
RUN cargo build --release
FROM rust:buster as builder
COPY . /app
WORKDIR /app
RUN rustup default nightly
RUN cargo build --release
RUN git clone "https://github.com/Y2Z/monolith" /monolith
WORKDIR /monolith
RUN cargo build --release
FROM debian:buster


@ -1,47 +1,14 @@
# WebArc
`webarc` is a local website archive based on [monolith](https://github.com/Y2Z/monolith).
## Archive Format
A web archive is defined as a directory containing domains in this structure:
## Configuration
You can configure the application using environment variables:
```
web_archive/
├─ domain.com/
│ ├─ sub/
│ │ ├─ path/
│ │ │ ├─ index_YYYY_MM_DD.html
├─ sub.domain.com/
```
Every document of this web archive can then be found at `archive/domain/paths.../index_YYYY_MM_DD.html`.
- `$ROUTE_INTERNAL` : Rewrite links to point back to the archive itself
- `$DOWNLOAD_ON_DEMAND` : Download missing routes with monolith on demand
- `$BLACKLIST_DOMAINS` : Blacklisted domains (Comma-separated regex, example: `google.com,.*.youtube.com`)
## Usage
webarc provides a CLI tool to work with the archive structure.
```sh
# List domains in archive
webarc [--dir ARCHIVE] archive list [-j, --json]
# List all paths on a domain
webarc [--dir ARCHIVE] archive list [-j, --json] [DOMAIN]
# List all versions of a document
webarc [--dir ARCHIVE] archive versions [-j, --json] [DOMAIN] [PATH]
# Get a document
# `--md` will return a markdown version
webarc [--dir ARCHIVE] archive get [--md] [DOMAIN] [PATH] [VERSION]
# Archive a website
webarc [--dir ARCHIVE] archive download [URL]
```
## Configuration
You can configure the application using a config file. Look at the [config.toml](config.toml) file for more information.
## Web Server
You can start a webserver serving an archive with `webarc serve`.
Archived pages can be viewed at `/s/<domain>/<path..>`.
For example, `/s/en.wikipedia.org/wiki/Website` will serve `en.wikipedia.org` at `/wiki/Website`.
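
A minimal sketch, not part of either branch, of how a domain, path, and timestamp resolve to a stored document under the layout above. The values are hypothetical; the archiver code later in this diff formats the timestamp as `%Y-%m-%d`:

```rust
use std::path::PathBuf;

/// Resolve the on-disk file for an archived page, following the
/// `archive/<domain>/<path...>/index_<date>.html` layout described above.
fn archived_file(archive_dir: &str, domain: &str, path: &str, date: &str) -> PathBuf {
    let mut p = PathBuf::from(archive_dir).join(domain);
    for segment in path.split('/').filter(|s| !s.is_empty()) {
        p = p.join(segment);
    }
    p.join(format!("index_{date}.html"))
}

fn main() {
    // Hypothetical values for illustration only.
    let f = archived_file("./websites", "en.wikipedia.org", "/wiki/Website", "2024-01-15");
    println!("{}", f.display()); // ./websites/en.wikipedia.org/wiki/Website/index_2024-01-15.html
}
```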


@ -1,73 +0,0 @@
# Rewrite links to point back to the archive itself
ROUTE_INTERNAL=true
# Download missing routes on demand
DOWNLOAD_ON_DEMAND=true
[websites]
# You can blacklist sites which won't work well
BLACKLIST_DOMAINS = [
"^gitlab", # All domains starting with gitlab
"youtube" # YouTube
]
# Domain configuration (Example)
[[websites.domains]]
# The domain the config applies to
domain = "example.com"
# Blacklisted Path (Regexes)
blacklist_paths = ["/.*"]
# Exclude <audio> tags
no_audio = false
# Exclude <video> tags
no_video = false
# Exclude <img> tags
no_image = false
# Exclude CSS
no_css = false
# Exclude Javascript
no_javascript = false
# Exclude fonts
no_fonts = false
# Exclude iframes
no_frames = false
# User Agent
user_agent = "Safari"
[ai]
# Ollama URL (Enables vector search)
OLLAMA_URL="127.0.0.1:11434"
# --- Website Config
[[websites.domains]]
domain = "developer.mozilla.org"
no_javascript = true
[[websites.domains]]
domain = "github.com"
no_javascript = true
[[websites.domains]]
domain = "en.wikipedia.org"
no_javascript = true
[[websites.domains]]
domain = "api.flutter.dev"
no_javascript = true
no_video = true
[[websites.domains]]
domain = "docs.flutter.dev"
no_javascript = true
no_video = true


@ -6,12 +6,7 @@ services:
volumes:
- ./websites:/websites
- ./favicon:/favicon
- ./config.toml:/config.toml
environment:
- "RUST_LOG=info"
- "ROCKET_ADDRESS=0.0.0.0"
- "DATABASE_URL=postgres://user:pass@postgres/webarc"
command: "/webarc serve"
env_file: env
postgres:
# Any Postgres with support for pgvector

env Normal file

@ -0,0 +1,18 @@
# Logging
RUST_LOG=info
ROCKET_ADDRESS=0.0.0.0
# Rewrite links to point back to the archive itself
ROUTE_INTERNAL=true
# Download missing routes on demand
DOWNLOAD_ON_DEMAND=true
# Blacklisted domains (Comma-separated regex)
BLACKLIST_DOMAINS="google.com,.*.youtube.com"
# Database
DATABASE_URL=postgres://user:pass@postgres/webarc
# Ollama URL (Enables vector search)
OLLAMA_URL=127.0.0.1:11434
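
A small sketch of how an `OLLAMA_URL` value of the form `host:port` can be split, mirroring the `generate_embedding` code further down in this diff; the fallback message is illustrative:

```rust
/// Split an `OLLAMA_URL` value such as "127.0.0.1:11434" into host and port.
fn ollama_host_port() -> Option<(String, u16)> {
    let url = std::env::var("OLLAMA_URL").ok()?;
    let (host, port) = url.split_once(':')?;
    Some((format!("http://{host}"), port.parse().ok()?))
}

fn main() {
    match ollama_host_port() {
        Some((host, port)) => println!("embeddings via {host}:{port}"),
        None => println!("OLLAMA_URL unset; vector search disabled"),
    }
}
```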


@ -5,9 +5,6 @@ CREATE TABLE doc_embedding (
domain VARCHAR(500) NOT NULL,
path VARCHAR(1000) NOT NULL,
ver VARCHAR(10) NOT NULL,
chunk INTEGER NOT NULL,
embed_mxbai_embed_large vector(1024) NOT NULL,
PRIMARY KEY (domain, path, ver, chunk)
);
CREATE INDEX ON doc_embedding USING ivfflat (embed_mxbai_embed_large vector_cosine_ops) WITH (lists = 200);
PRIMARY KEY (domain, path, ver)
)

src/ai.rs

@ -1,4 +1,4 @@
use std::collections::{HashMap, VecDeque};
use std::collections::VecDeque;
use based::{get_pg, request::api::ToAPI, result::LogNoneAndPass};
use ollama_rs::generation::embeddings::request::{EmbeddingsInput, GenerateEmbeddingsRequest};
@ -6,21 +6,14 @@ use serde::Serialize;
use serde_json::json;
use sqlx::FromRow;
use crate::{
archive::{Document, Domain, WebsiteArchive},
conf::get_config,
};
// TODO : Cite found chunks in search res?
use crate::archive::{Document, Domain, WebsiteArchive};
#[derive(Debug, Clone, FromRow, Serialize)]
pub struct DocEmbedding {
pub domain: String,
pub path: String,
pub ver: String,
pub chunk: i32,
#[allow(dead_code)]
#[serde(skip)]
embed_mxbai_embed_large: pgvector::Vector,
@ -28,73 +21,6 @@ pub struct DocEmbedding {
pub similarity: f64,
}
impl DocEmbedding {
pub async fn total_chunks(&self) -> i64 {
let res: (i64,) = sqlx::query_as(
"SELECT COUNT(chunk) FROM doc_embedding WHERE domain = $1 AND path = $2",
)
.bind(&self.domain)
.bind(&self.path)
.fetch_one(get_pg!())
.await
.unwrap();
res.0
}
}
#[derive(Debug, Clone, Serialize)]
pub struct SearchResult {
pub domain: String,
pub path: String,
pub total_chunks: i64,
pub chunks: Vec<DocEmbedding>,
}
impl SearchResult {
pub fn new(domain: String, path: String, total_chunks: i64) -> Self {
Self {
domain,
path,
total_chunks,
chunks: vec![],
}
}
pub fn similarity(&self) -> f64 {
let chunks = f64::from(self.chunks.len() as u32);
let total = f64::from(self.total_chunks as i32);
let match_percent = chunks / total;
total_score(&self.chunks) * match_percent
}
}
pub fn avg_sim(e: &[DocEmbedding]) -> f64 {
let mut score = 0.0;
for e in e {
score += e.similarity;
}
score / e.len() as f64
}
pub fn max_sim(e: &[DocEmbedding]) -> f64 {
let mut score = 0.0;
for e in e {
if e.similarity > score {
score = e.similarity;
}
}
score
}
pub fn total_score(e: &[DocEmbedding]) -> f64 {
(avg_sim(e) + max_sim(e)) / 2.0
}
impl ToAPI for DocEmbedding {
async fn api(&self) -> serde_json::Value {
json!({
@ -107,30 +33,12 @@ impl ToAPI for DocEmbedding {
}
pub trait Embedding {
fn embedding(
&self,
ver: Option<String>,
) -> impl std::future::Future<Output = Option<Vec<Vec<f32>>>>;
}
pub fn chunked(s: &str) -> Vec<String> {
const CHUNK_SIZE: usize = 2500;
s.chars()
.collect::<Vec<char>>()
.chunks(CHUNK_SIZE)
.map(|chunk| chunk.iter().collect())
.collect()
}
pub fn remove_data_urls(input: &str) -> String {
let re = regex::Regex::new("data:(.*?)(;base64)?,(.*)").unwrap();
// Replace all occurrences of data URLs with an empty string
re.replace_all(input, "").to_string()
fn embedding(&self, ver: Option<String>)
-> impl std::future::Future<Output = Option<Vec<f32>>>;
}
impl Embedding for Document {
async fn embedding(&self, ver: Option<String>) -> Option<Vec<Vec<f32>>> {
async fn embedding(&self, ver: Option<String>) -> Option<Vec<f32>> {
let latest = "latest".to_string();
log::info!(
"Generating Vector embeddings for {} / {} @ {}",
@ -139,32 +47,14 @@ impl Embedding for Document {
ver.as_ref().unwrap_or(&latest)
);
let content_html = self.render_local(ver.clone()).await?;
let content = remove_data_urls(&html2md::parse_html(&content_html));
let mut embeddings = Vec::new();
let content = chunked(&content);
let len = content.len();
for (index, c) in content.into_iter().enumerate() {
log::info!(
"Generating Vector embeddings for {} / {} @ {} [ {} / {} ]",
self.domain,
self.path,
ver.as_ref().unwrap_or(&latest),
index + 1,
len
);
embeddings.push(generate_embedding(c).await?);
}
Some(embeddings)
let content_html = self.render_local(ver).await?;
let content = html2md::parse_html(&content_html);
generate_embedding(content).await
}
}
pub async fn generate_embedding(mut input: String) -> Option<Vec<f32>> {
// TODO : Ollama load balancing
if let Some(ollama_url) = get_config().ai.as_ref().map(|x| x.OLLAMA_URL.clone()) {
if let Ok(ollama_url) = std::env::var("OLLAMA_URL") {
let (host, port) = ollama_url.split_once(':')?;
let ollama = ollama_rs::Ollama::new(format!("http://{host}"), port.parse().ok()?);
@ -239,17 +129,14 @@ impl EmbedStore {
.execute(get_pg!())
.await;
for (index, embed) in embed.iter().enumerate() {
sqlx::query("INSERT INTO doc_embedding VALUES ($1, $2, $3, $4, $5)")
.bind(&doc.domain)
.bind(&doc.path)
.bind(ver)
.bind(index as i64)
.bind(embed)
.execute(get_pg!())
.await
.unwrap();
}
sqlx::query("INSERT INTO doc_embedding VALUES ($1, $2, $3, $4)")
.bind(&doc.domain)
.bind(&doc.path)
.bind(ver)
.bind(embed)
.execute(get_pg!())
.await
.unwrap();
}
}
@ -261,42 +148,16 @@ impl EmbedStore {
}
}
pub async fn search_vector(v: &pgvector::Vector, limit: i64, offset: i64) -> Vec<SearchResult> {
// limit should cover SearchResults not the query -> rework
let results: Vec<DocEmbedding> = sqlx::query_as(
"SELECT *, 1 / (1 + (embed_mxbai_embed_large <-> $1)) AS similarity FROM doc_embedding ORDER BY embed_mxbai_embed_large <=> $1 LIMIT $2 OFFSET $3",
pub async fn search_vector(v: &pgvector::Vector, limit: i64, offset: i64) -> Vec<DocEmbedding> {
sqlx::query_as(
"SELECT *, 1 / (1 + (embed_mxbai_embed_large <-> $1)) AS similarity FROM doc_embedding ORDER BY embed_mxbai_embed_large <-> $1 LIMIT $2 OFFSET $3",
)
.bind(v)
.bind(limit)
.bind(offset)
.fetch_all(get_pg!())
.await
.unwrap();
let mut search_res: HashMap<String, HashMap<String, SearchResult>> = HashMap::new();
for res in results {
let domain = search_res.entry(res.domain.clone()).or_default();
let doc = domain.entry(res.path.clone()).or_insert(SearchResult::new(
res.domain.clone(),
res.path.clone(),
res.total_chunks().await,
));
doc.chunks.push(res);
}
let mut flat = search_res
.into_values()
.flat_map(|x| x.into_values().collect::<Vec<SearchResult>>())
.collect::<Vec<SearchResult>>();
flat.sort_by(|a, b| {
b.similarity()
.partial_cmp(&a.similarity())
.unwrap_or(std::cmp::Ordering::Equal)
});
flat
.unwrap()
}
pub async fn generate_embeddings_for(arc: &WebsiteArchive) {
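
The `search_vector` query above converts a pgvector distance into a similarity with `1 / (1 + distance)`, and the chunk-based ranking removed on the `ollama` side combined per-chunk similarities as `(avg + max) / 2` scaled by the fraction of chunks that matched. A standalone sketch of that arithmetic, with illustrative distances that are not taken from either branch:

```rust
/// Map a pgvector distance to a similarity score, as in the SQL above.
fn similarity_from_distance(distance: f64) -> f64 {
    1.0 / (1.0 + distance)
}

/// Combine per-chunk similarities into a document score, as in the removed
/// `avg_sim`, `max_sim`, `total_score`, and `SearchResult::similarity`.
fn chunk_score(chunk_similarities: &[f64], total_chunks: usize) -> f64 {
    let avg = chunk_similarities.iter().sum::<f64>() / chunk_similarities.len() as f64;
    let max = chunk_similarities.iter().cloned().fold(0.0_f64, f64::max);
    let match_percent = chunk_similarities.len() as f64 / total_chunks as f64;
    (avg + max) / 2.0 * match_percent
}

fn main() {
    // Hypothetical distances for three matching chunks of a five-chunk document.
    let sims: Vec<f64> = [0.4, 0.9, 1.6]
        .iter()
        .map(|d| similarity_from_distance(*d))
        .collect();
    println!("{:.3}", chunk_score(&sims, 5)); // document ranking score
}
```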

src/archive.rs Normal file

@ -0,0 +1,335 @@
use std::path::PathBuf;
use based::request::RequestContext;
use maud::html;
use crate::{blacklist::check_blacklist, favicon::download_fav_for, pages::component::render_page};
/// Read directory entries into `Vec<String>`
pub fn read_dir(dir: &PathBuf) -> Vec<String> {
let mut list = Vec::new();
if let Ok(entries) = std::fs::read_dir(dir) {
for entry in entries.flatten() {
if let Some(file_name) = entry.file_name().to_str() {
list.push(file_name.to_string());
}
}
}
list
}
/// Rewrite all URLs in `input` to the format `/s/<domain>/<path..>`
fn internalize_urls(input: &str) -> String {
let url_pattern = r"https?://([a-zA-Z0-9.-]+)(/[\w./-]*)";
let re = regex::Regex::new(url_pattern).unwrap();
re.replace_all(input, |caps: &regex::Captures| {
format!(
"/s/{}/{}",
&caps[1].trim_start_matches("www."), // Domain
&caps[2] // Path
)
})
.to_string()
}
/// Represents a directory containing archived websites
#[derive(Debug, Clone)]
pub struct WebsiteArchive {
pub dir: PathBuf,
}
/// Represents a domain within the website archive
pub struct Domain {
/// Domain name
pub name: String,
dir: PathBuf,
}
impl Domain {
/// Creates a new `Domain` instance.
///
/// If the domain name is not blacklisted, a directory is created.
///
/// # Parameters
/// - `name`: The name of the domain.
/// - `dir`: The directory path for the domain.
///
/// # Returns
/// A new `Domain` instance.
pub fn new(name: &str, dir: PathBuf) -> Self {
if !check_blacklist(name) {
std::fs::create_dir_all(&dir).unwrap();
}
Self {
name: name.to_string(),
dir,
}
}
/// Resolves a specific path within the domain and returns a `Document` representing it.
///
/// # Parameters
/// - `path`: The path to resolve within the domain.
///
/// # Returns
/// A `Document` instance corresponding to the given path.
pub fn path(&self, path: &str) -> Document {
Document::new(&self.name, path, self.dir.parent().unwrap().to_path_buf())
}
/// Retrieves entries and metadata for a given path within the domain.
///
/// # Parameters
/// - `path`: The path to inspect.
///
/// # Returns
/// A tuple containing:
/// - A vector of `PathEntry` instances representing the contents of the path.
/// - A boolean indicating whether the path is itself a `Document`
pub fn paths(&self, path: &str) -> (Vec<PathEntry>, bool) {
let mut base_path = self.dir.clone();
for p in path.split('/') {
base_path = base_path.join(p);
}
let dir_content = read_dir(&base_path);
let mut ret = Vec::new();
let mut is_doc = false;
for entry in dir_content {
let url_path = format!("{path}/{entry}");
if entry.starts_with("index_") && entry.ends_with(".html") {
is_doc = true;
continue;
}
ret.push(PathEntry(self.name.clone(), url_path));
}
(ret, is_doc)
}
}
/// Represents an entry within a domain's path, containing its name and URL path.
pub struct PathEntry(String, String);
impl PathEntry {
pub fn url(&self) -> String {
format!("/d/{}/{}", self.0, self.1)
}
pub fn path(&self) -> &String {
&self.1
}
}
/// Represents a document within a domain
pub struct Document {
/// The domain associated with the document.
pub domain: String,
/// The path of the document within the domain.
pub path: String,
base_dir: PathBuf,
}
impl Document {
/// Creates a new `Document` instance.
///
/// # Parameters
/// - `domain`: The domain to which the document belongs.
/// - `path`: The path of the document within the domain.
/// - `base_dir`: The base directory of the archive storage.
///
/// # Returns
/// A new `Document` instance.
pub fn new(domain: &str, path: &str, base_dir: PathBuf) -> Self {
Self {
domain: domain.to_string(),
path: path
.split('/')
.filter(|x| !x.is_empty())
.collect::<Vec<&str>>()
.join("/"),
base_dir,
}
}
/// Renders the document, returning its content as a string.
///
/// If the environment variable `$ROUTE_INTERNAL` is set to `true`, all links will be rewritten to point to internal archived routes.
///
/// # Parameters
/// - `version`: An optional version of the document to render in the format `YYYY-MM-DD`.
///
/// # Returns
/// An `Option` containing the rendered content as a string, or `None` if nothing could be rendered.
pub async fn render_local(&self, version: Option<String>) -> Option<String> {
if check_blacklist(&self.domain) {
let content = html! {
h3 { "This site is blacklisted" };
};
return Some(render_page(content, RequestContext::default()).await.1 .1);
}
let mut file_path = self.doc_dir();
let latest_version = if let Some(version) = version {
format!("index_{version}.html")
} else {
let versions = self.versions();
format!("index_{}.html", versions.first()?)
};
file_path = file_path.join(latest_version);
let content = std::fs::read_to_string(file_path).ok()?;
if std::env::var("ROUTE_INTERNAL").unwrap_or("false".to_string()) == "true" {
Some(internalize_urls(&content))
} else {
Some(content)
}
}
/// Determines the directory where the document is stored.
///
/// # Returns
/// A `PathBuf` representing the document directory.
pub fn doc_dir(&self) -> PathBuf {
let mut file_path = self.base_dir.join(&self.domain);
for p in self.path.split('/') {
file_path = file_path.join(p);
}
file_path
}
/// Retrieves available versions of the document.
///
/// # Returns
/// A vector of strings representing the available versions of the document, sorted in descending order.
pub fn versions(&self) -> Vec<String> {
let mut res: Vec<String> = read_dir(&self.doc_dir())
.into_iter()
.filter_map(|x| {
if x.starts_with("index_") && x.ends_with(".html") {
return Some(
x.trim_start_matches("index_")
.trim_end_matches(".html")
.to_string(),
);
}
None
})
.collect();
res.sort();
res.reverse();
res
}
}
impl WebsiteArchive {
/// Creates a new `WebsiteArchive` instance.
///
/// # Parameters
/// - `dir`: The directory path where the archive will be stored.
///
/// # Returns
/// A new `WebsiteArchive` instance.
pub fn new(dir: &str) -> Self {
Self {
dir: PathBuf::from(dir),
}
}
/// Retrieves the list of domain names stored in the archive.
///
/// # Returns
/// A vector of domain names as strings.
pub fn domains(&self) -> Vec<String> {
read_dir(&self.dir)
}
/// Retrieves a `Domain` instance for a specified domain name.
///
/// # Parameters
/// - `domain`: The name of the domain to retrieve.
///
/// # Returns
/// A `Domain` instance corresponding to the specified domain.
pub fn get_domain(&self, domain: &str) -> Domain {
Domain::new(domain, self.dir.join(domain))
}
/// Archives a URL by downloading and storing its content.
///
/// If the URL does not pass the blacklist check, it will not be archived.
///
/// # Parameters
/// - `url`: The URL to archive.
///
/// This function downloads the content of the URL, processes it, and saves it to the archive.
pub async fn archive_url(&self, url: &str) {
let parsed_url = url::Url::parse(url).unwrap();
let domain = parsed_url.domain().unwrap().trim_start_matches("www.");
// Deny blacklist
if check_blacklist(domain) {
return;
}
let path = parsed_url.path();
let mut folder_name = self.dir.join(domain);
download_fav_for(domain).await;
for paths in path.split('/') {
if !paths.is_empty() {
folder_name = folder_name.join(paths);
}
}
std::fs::create_dir_all(&folder_name).unwrap();
let timestamp = chrono::Utc::now().format("%Y-%m-%d").to_string();
let filename = folder_name.join(format!("index_{timestamp}.html"));
log::info!("Archiving {url} to {}", filename.to_str().unwrap());
run_command(&[
"monolith",
"-I",
"-o",
filename.to_str().unwrap(),
&format!("https://{}/{}", domain, path),
]);
}
}
// full text search
// add new sites?
// transparent auto page downloading
// redownload after threshold
fn run_command(cmd: &[&str]) {
let mut cmd_setup = std::process::Command::new(cmd[0]);
let cmd_setup = cmd_setup
.args(cmd.iter().skip(1).collect::<Vec<_>>())
.stdout(std::process::Stdio::inherit())
.stderr(std::process::Stdio::inherit());
let child = cmd_setup.spawn().unwrap();
let status = child.wait_with_output().unwrap();
assert!(status.status.success());
}
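
The `versions()` method above relies on the timestamp format to find the latest version. A short sketch of that reasoning, with hypothetical dates: zero-padded `YYYY-MM-DD` strings sort lexicographically in chronological order, so a plain sort followed by a reverse puts the newest version first.

```rust
/// Return the newest version string, mirroring how `versions()` sorts
/// the `index_<YYYY-MM-DD>.html` file-name stems.
fn latest_version(mut versions: Vec<String>) -> Option<String> {
    versions.sort();
    versions.reverse();
    versions.first().cloned()
}

fn main() {
    // Hypothetical version strings as `versions()` would return them.
    let versions = vec![
        "2023-11-02".to_string(),
        "2024-01-15".to_string(),
        "2023-12-31".to_string(),
    ];
    assert_eq!(latest_version(versions).as_deref(), Some("2024-01-15"));
}
```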


@ -1,126 +0,0 @@
use std::{io::Read, path::PathBuf};
use based::request::RequestContext;
use maud::html;
use crate::{blacklist::check_blacklist, conf::get_config, render_page};
use super::{internalize_urls, read_dir};
/// Represents a document within a domain
pub struct Document {
/// The domain associated with the document.
pub domain: String,
/// The path of the document within the domain.
pub path: String,
base_dir: PathBuf,
}
impl Document {
/// Creates a new `Document` instance.
///
/// # Parameters
/// - `domain`: The domain to which the document belongs.
/// - `path`: The path of the document within the domain.
/// - `base_dir`: The base directory of the archive storage.
///
/// # Returns
/// A new `Document` instance.
pub fn new(domain: &str, path: &str, base_dir: PathBuf) -> Self {
let split = path
.split('/')
.filter(|x| !x.is_empty())
.collect::<Vec<&str>>();
Self {
domain: domain.to_string(),
path: if split.is_empty() {
"/".to_string()
} else {
split.join("/")
},
base_dir,
}
}
/// Renders the document, returning its content as a string.
///
/// If the environment variable `$ROUTE_INTERNAL` is set to `true`, all links will be rewritten to point to internal archived routes.
///
/// # Parameters
/// - `version`: An optional version of the document to render in the format `YYYY-MM-DD`.
///
/// # Returns
/// An `Option` containing the rendered content as a string, or `None` if nothing could be rendered.
pub async fn render_local(&self, version: Option<String>) -> Option<String> {
if check_blacklist(&self.domain) {
let content = html! {
h3 { "This site is blacklisted" };
};
return Some(render_page(content, RequestContext::default()).await.1 .1);
}
let mut file_path = self.doc_dir();
let latest_version = if let Some(version) = version {
format!("index_{version}.html")
} else {
let versions = self.versions();
let version = versions.first().cloned()?;
format!("index_{version}.html")
};
file_path = file_path.join(latest_version);
let mut buf = Vec::new();
std::fs::File::open(file_path)
.ok()?
.read_to_end(&mut buf)
.unwrap();
let content = String::from_utf8_lossy(&buf);
if get_config().ROUTE_INTERNAL {
Some(internalize_urls(&content, &self.domain))
} else {
Some(content.to_string())
}
}
/// Determines the directory where the document is stored.
///
/// # Returns
/// A `PathBuf` representing the document directory.
pub fn doc_dir(&self) -> PathBuf {
let mut file_path = self.base_dir.join(&self.domain);
for p in self.path.split('/').filter(|x| !x.is_empty()) {
file_path = file_path.join(p);
}
file_path
}
/// Retrieves available versions of the document.
///
/// # Returns
/// A vector of strings representing the available versions of the document, sorted in descending order.
pub fn versions(&self) -> Vec<String> {
let mut res: Vec<String> = read_dir(&self.doc_dir())
.into_iter()
.filter_map(|x| {
if x.starts_with("index_") && x.ends_with(".html") {
return Some(
x.trim_start_matches("index_")
.trim_end_matches(".html")
.to_string(),
);
}
None
})
.collect();
res.sort();
res.reverse();
res
}
}


@ -1,125 +0,0 @@
use std::path::PathBuf;
use based::result::LogAndIgnore;
use crate::blacklist::check_blacklist;
use super::{read_dir, Document};
/// Represents a domain within the website archive
pub struct Domain {
/// Domain name
pub name: String,
dir: PathBuf,
}
impl Domain {
/// Creates a new `Domain` instance.
///
/// If the domain name is not blacklisted, a directory is created.
///
/// # Parameters
/// - `name`: The name of the domain.
/// - `dir`: The directory path for the domain.
///
/// # Returns
/// A new `Domain` instance.
pub fn new(name: &str, dir: PathBuf) -> Self {
if !check_blacklist(name) {
std::fs::create_dir_all(&dir)
.log_err_and_ignore(&format!("Could not create domain dir {name}"));
}
Self {
name: name.to_string(),
dir,
}
}
/// Resolves a specific path within the domain and returns a `Document` representing it.
///
/// # Parameters
/// - `path`: The path to resolve within the domain.
///
/// # Returns
/// A `Document` instance corresponding to the given path.
pub fn path(&self, path: &str) -> Document {
Document::new(&self.name, path, self.dir.parent().unwrap().to_path_buf())
}
/// Get all paths associated with the domain
pub fn all_paths(&self) -> Vec<PathEntry> {
let mut queue = self.paths("/").0;
let mut ret = Vec::new();
ret.push(PathEntry(self.name.clone(), "/".to_string()));
while let Some(el) = queue.pop() {
ret.push(el.clone());
let paths = self.paths(&el.1).0;
queue.extend(paths);
}
ret
}
/// Retrieves entries and metadata for a given path within the domain.
///
/// # Parameters
/// - `path`: The path to inspect.
///
/// # Returns
/// A tuple containing:
/// - A vector of `PathEntry` instances representing the contents of the path.
/// - A boolean indicating whether the path is itself a `Document`
pub fn paths(&self, path: &str) -> (Vec<PathEntry>, bool) {
let mut base_path = self.dir.clone();
for p in path.split('/') {
base_path = base_path.join(p);
}
let path = path
.split("/")
.filter(|x| !x.is_empty())
.collect::<Vec<&str>>()
.join("/");
let dir_content = read_dir(&base_path);
let mut ret = Vec::new();
let mut is_doc = false;
for entry in dir_content {
let url_path = format!("{path}/{entry}");
let url_path = url_path
.split("/")
.filter(|x| !x.is_empty())
.collect::<Vec<&str>>()
.join("/");
if entry.starts_with("index_") && entry.ends_with(".html") {
is_doc = true;
continue;
}
ret.push(PathEntry(self.name.clone(), url_path));
}
(ret, is_doc)
}
}
/// Represents an entry within a domain's path, containing its name and URL path.
#[derive(Debug, Clone)]
pub struct PathEntry(String, String);
impl PathEntry {
pub fn url(&self) -> String {
format!("/d/{}/{}", self.0, self.1)
}
pub fn path(&self) -> &String {
&self.1
}
}


@ -1,232 +0,0 @@
use std::{collections::HashSet, path::PathBuf};
use crate::{
blacklist::{check_blacklist, check_blacklist_path},
conf::get_config,
favicon::download_fav_for,
};
mod document;
mod domain;
pub use document::Document;
pub use domain::*;
/// Read directory entries into `Vec<String>`
pub fn read_dir(dir: &PathBuf) -> Vec<String> {
let mut list = Vec::new();
if let Ok(entries) = std::fs::read_dir(dir) {
for entry in entries.flatten() {
if let Some(file_name) = entry.file_name().to_str() {
list.push(file_name.to_string());
}
}
}
list
}
/// Rewrite all URLs in `input` to the format `/s/<domain>/<path..>`
fn internalize_urls(input: &str, base: &str) -> String {
let url_pattern = r#"(\ |"|')(?:(<?)(https?:\/\/([a-zA-Z0-9.-]+))?(\/[\w./-]*))"#;
let re = regex::Regex::new(url_pattern).unwrap();
re.replace_all(input, |caps: &regex::Captures| {
if caps.get(2).map(|x| x.as_str()).unwrap_or_default() == "<" {
return caps.get(0).unwrap().as_str().to_string();
}
if caps.get(0).unwrap().as_str() == " //" {
return " //".to_string();
}
let wrap = caps.get(1).map(|x| x.as_str()).unwrap_or_default();
if let Some(domain) = caps.get(3) {
let domain = domain.as_str();
let (protocol, domain) = if domain.starts_with("https://") {
("https", domain.trim_start_matches("https://"))
} else {
("http", domain.trim_start_matches("http://"))
};
let domain = domain.trim_start_matches("www.");
let path = caps.get(5).map_or("", |m| m.as_str());
// Skip transformation if the domain is in the blacklist
if check_blacklist(domain) {
format!("{wrap}{protocol}://{domain}{path}")
} else {
format!("{wrap}/s/{domain}{path}")
}
} else if let Some(path) = caps.get(5) {
// Handle relative paths
format!("{wrap}/s/{base}{}", path.as_str())
} else {
// Default fallback
caps[0].to_string()
}
})
.to_string()
}
/// Extract all domains
pub fn extract_domains(input: &str) -> Vec<String> {
let url_pattern = r"https?://([a-zA-Z0-9.-]+)(/[\w./-]*)?";
let re = regex::Regex::new(url_pattern).unwrap();
let mut domains = HashSet::new();
for caps in re.captures_iter(input) {
let domain = caps[1].trim_start_matches("www.");
domains.insert(domain.to_string());
}
let mut domains: Vec<_> = domains.into_iter().collect();
domains.sort();
domains
}
/// Represents a directory containing archived websites
#[derive(Debug, Clone)]
pub struct WebsiteArchive {
pub dir: PathBuf,
}
impl WebsiteArchive {
/// Creates a new `WebsiteArchive` instance.
///
/// # Parameters
/// - `dir`: The directory path where the archive will be stored.
///
/// # Returns
/// A new `WebsiteArchive` instance.
pub fn new(dir: &str) -> Self {
Self {
dir: PathBuf::from(dir),
}
}
/// Retrieves the list of domain names stored in the archive.
///
/// # Returns
/// A vector of domain names as strings.
pub fn domains(&self) -> Vec<String> {
read_dir(&self.dir)
}
/// Retrieves a `Domain` instance for a specified domain name.
///
/// # Parameters
/// - `domain`: The name of the domain to retrieve.
///
/// # Returns
/// A `Domain` instance corresponding to the specified domain.
pub fn get_domain(&self, domain: &str) -> Domain {
Domain::new(domain, self.dir.join(domain))
}
/// Archives a URL by downloading and storing its content.
///
/// If the URL does not pass the blacklist check, it will not be archived.
///
/// # Parameters
/// - `url`: The URL to archive.
///
/// This function downloads the content of the URL, processes it, and saves it to the archive.
pub async fn archive_url(&self, url: &str) {
let parsed_url = url::Url::parse(url).unwrap();
let domain = parsed_url.domain().unwrap().trim_start_matches("www.");
// Deny blacklist
if check_blacklist(domain) {
return;
}
let path = parsed_url.path();
if check_blacklist_path(domain, path) {
return;
}
let mut folder_name = self.dir.join(domain);
download_fav_for(domain).await;
for paths in path.split('/') {
let paths = url_escape::decode(paths).to_string();
if !paths.is_empty() {
folder_name = folder_name.join(paths);
}
}
std::fs::create_dir_all(&folder_name).unwrap();
let timestamp = chrono::Utc::now().format("%Y-%m-%d").to_string();
let filename = folder_name.join(format!("index_{timestamp}.html"));
log::info!("Archiving {url} to {}", filename.to_str().unwrap());
let conf = get_config()
.get_domain_config(domain)
.cloned()
.unwrap_or_default();
let mut cmd = vec!["monolith", "--isolate", "-o", filename.to_str().unwrap()];
if conf.no_audio.unwrap_or_default() {
cmd.push("--no-audio");
}
if conf.no_css.unwrap_or_default() {
cmd.push("--no-css");
}
if conf.no_frames.unwrap_or_default() {
cmd.push("--no-frames");
}
if conf.no_fonts.unwrap_or_default() {
cmd.push("--no-frames");
}
if conf.no_image.unwrap_or_default() {
cmd.push("--no-images");
}
if conf.no_javascript.unwrap_or_default() {
cmd.push("--no-js");
cmd.push("--unwrap-noscript");
}
if conf.no_video.unwrap_or_default() {
cmd.push("--no-video");
}
if let Some(ua) = &conf.user_agent {
cmd.push("--user-agent");
cmd.push(ua.as_str());
}
let mut url = url::Url::parse(&format!("https://{domain}")).unwrap();
url = url.join(path).unwrap();
let url = url.to_string();
cmd.push(&url);
run_command(&cmd);
}
}
fn run_command(cmd: &[&str]) {
let mut cmd_setup = std::process::Command::new(cmd[0]);
let cmd_setup = cmd_setup
.args(cmd.iter().skip(1).collect::<Vec<_>>())
.stdout(std::process::Stdio::inherit())
.stderr(std::process::Stdio::inherit());
let child = cmd_setup.spawn().unwrap();
let status = child.wait_with_output().unwrap();
assert!(status.status.success());
}


@ -1,61 +0,0 @@
use clap::{arg, command};
pub fn get_args() -> clap::ArgMatches {
command!()
.about("Web Archive")
.arg(
arg!(-d --dir <dir> "Web archive directory")
.required(false)
.default_value("./websites"),
)
.subcommand(
command!()
.name("serve")
.about("Start web archive server")
.arg(
arg!(-c --config <config> "Web archive config file")
.required(false)
.default_value("./config.toml"),
),
)
.subcommand(
command!()
.name("archive")
.about("Work with web archives")
.subcommand(
command!()
.name("download")
.about("Download a new URL into the archive")
.arg(
arg!(-c --config <config> "Web archive config file")
.required(false)
.default_value("./config.toml"),
)
.arg(arg!([URL] "The URL to download").required(true))
)
.subcommand(
command!()
.name("list")
.about("List domains contained in the archive. If a domain is provided all paths of this domain will be listed.")
.arg(arg!([DOMAIN] "A domain to list").required(false))
.arg(arg!(-j --json "Ouput JSON").required(false)),
)
.subcommand(
command!()
.name("versions")
.about("List saved versions of a document")
.arg(arg!(-j --json "Ouput JSON").required(false))
.arg(arg!([DOMAIN] "A domain").required(true))
.arg(arg!([PATH] "A path").required(false))
)
.subcommand(
command!()
.name("get")
.about("Get a saved document")
.arg(arg!(--md "Ouput Markdown").required(false))
.arg(arg!([DOMAIN] "A domain").required(true))
.arg(arg!([PATH] "A path").required(false))
.arg(arg!([VERSION] "A version").required(false))
))
.get_matches()
}


@ -1,43 +1,19 @@
use crate::conf::get_config;
/// Checks if a domain is present in the blacklist of unwanted domains.
///
/// This function checks the `$BLACKLIST_DOMAINS` environment variable for a comma-separated list of regular expressions to match against.
/// If a match is found, it immediately returns `true`. Otherwise, it returns `false`.
pub fn check_blacklist(domain: &str) -> bool {
let conf = get_config();
let conf = conf.websites.as_ref();
let blacklist_raw = std::env::var("BLACKLIST_DOMAINS").unwrap_or_default();
// TODO : Block IPs
// TODO : Test SSRF
let blacklisted_domains = conf
.map(|x| x.BLACKLIST_DOMAINS.as_ref())
.unwrap_or_default();
check_regex(domain, blacklisted_domains.unwrap_or(&Vec::new()))
}
pub fn check_blacklist_path(domain: &str, path: &str) -> bool {
let conf = get_config();
let conf = conf.websites.as_ref();
if let Some(website) = conf {
let empty = Vec::new();
let domain_conf = website.domains.as_ref().unwrap_or(&empty);
if let Some(domain_conf) = domain_conf.iter().find(|x| x.domain == domain) {
let empty = Vec::new();
let blacklist = domain_conf.blacklist_paths.as_ref().unwrap_or(&empty);
return check_regex(path, blacklist);
}
if blacklist_raw.is_empty() {
return false;
}
false
}
let blacklist: Vec<&str> = blacklist_raw.split(',').collect();
pub fn check_regex(input: &str, regexes: &Vec<String>) -> bool {
for regex in regexes {
let rgx = regex::Regex::new(regex).unwrap();
if rgx.is_match(input) {
for domain_regex in blacklist {
let rgx = regex::Regex::new(domain_regex).unwrap();
if rgx.is_match(domain) {
return true;
}
}
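
A standalone sketch of the environment-variable blacklist logic shown above: `BLACKLIST_DOMAINS` holds comma-separated regexes and a domain is rejected as soon as one of them matches. The sample pattern list mirrors the `env` file earlier in this diff; this sketch skips invalid patterns where the code above unwraps.

```rust
/// Return true if `domain` matches any regex in the comma-separated list.
fn is_blacklisted(domain: &str, blacklist_raw: &str) -> bool {
    if blacklist_raw.is_empty() {
        return false;
    }
    blacklist_raw
        .split(',')
        .filter_map(|pattern| regex::Regex::new(pattern).ok())
        .any(|rgx| rgx.is_match(domain))
}

fn main() {
    // Sample value taken from the env file above.
    let raw = "google.com,.*.youtube.com";
    println!("{}", is_blacklisted("music.youtube.com", raw)); // true
    println!("{}", is_blacklisted("example.com", raw));       // false
}
```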


@ -1,107 +0,0 @@
use std::sync::Arc;
use serde::Deserialize;
use tokio::sync::OnceCell;
pub static CONFIG: OnceCell<Arc<Config>> = OnceCell::const_new();
/// Get a reference to global config
pub fn get_config() -> &'static Arc<Config> {
crate::conf::CONFIG.get().unwrap()
}
/// Load a global config
pub fn load_config(path: &str) {
// TODO : Other load locations
if let Ok(file_content) = std::fs::read_to_string(path) {
let conf: Config =
toml::from_str(&file_content).expect("Could not deserialize config file");
crate::conf::CONFIG.set(std::sync::Arc::new(conf)).unwrap();
}
}
/// Load a default global config
pub fn load_default_config() {
if crate::conf::CONFIG.get().is_none() {
crate::conf::CONFIG
.set(std::sync::Arc::new(Config::default()))
.unwrap();
}
}
#[allow(non_snake_case)]
#[derive(Debug, Clone, Deserialize)]
pub struct Config {
pub ROUTE_INTERNAL: bool,
pub DOWNLOAD_ON_DEMAND: bool,
pub ai: Option<AIConfig>,
pub websites: Option<WebsiteConfig>,
}
impl Config {
pub fn get_domain_config(&self, domain: &str) -> Option<&DomainConfig> {
if let Some(websites) = &self.websites {
if let Some(domains) = &websites.domains {
let domain = domains.iter().find(|x| x.domain == domain);
return domain;
}
}
None
}
}
#[allow(non_snake_case)]
#[derive(Debug, Clone, Deserialize)]
pub struct AIConfig {
pub OLLAMA_URL: String,
}
#[allow(non_snake_case)]
#[derive(Debug, Clone, Deserialize)]
pub struct WebsiteConfig {
pub BLACKLIST_DOMAINS: Option<Vec<String>>,
pub domains: Option<Vec<DomainConfig>>,
}
#[derive(Debug, Clone, Deserialize)]
pub struct DomainConfig {
pub domain: String,
pub blacklist_paths: Option<Vec<String>>,
pub no_audio: Option<bool>,
pub no_video: Option<bool>,
pub no_image: Option<bool>,
pub no_css: Option<bool>,
pub no_javascript: Option<bool>,
pub no_fonts: Option<bool>,
pub no_frames: Option<bool>,
pub user_agent: Option<String>,
}
impl Default for DomainConfig {
fn default() -> Self {
Self {
domain: String::new(),
blacklist_paths: None,
no_audio: Some(false),
no_video: Some(false),
no_image: Some(false),
no_css: Some(false),
no_javascript: Some(false),
no_fonts: Some(false),
no_frames: Some(false),
user_agent: None,
}
}
}
impl Default for Config {
fn default() -> Self {
Self {
ROUTE_INTERNAL: false,
DOWNLOAD_ON_DEMAND: false,
ai: None,
websites: None,
}
}
}


@ -1,29 +0,0 @@
use based::{
page::Shell,
request::{RequestContext, StringResponse},
};
use maud::{html, PreEscaped};
pub mod ai;
pub mod archive;
pub mod blacklist;
pub mod conf;
pub mod favicon;
pub async fn render_page(content: PreEscaped<String>, ctx: RequestContext) -> StringResponse {
based::page::render_page(
content,
"Website Archive",
ctx,
&Shell::new(
html! {
script src="https://cdn.tailwindcss.com" {};
meta name="viewport" content="width=device-width, initial-scale=1.0" {};
script src="/assets/htmx.min.js" {};
},
html! {},
Some("bg-zinc-950 text-white min-h-screen flex pt-8 justify-center".to_string()),
),
)
.await
}


@ -1,170 +1,46 @@
use ai::EmbedStore;
use archive::WebsiteArchive;
use based::get_pg;
use rocket::routes;
use webarc::ai::EmbedStore;
use webarc::archive::WebsiteArchive;
use webarc::conf::{get_config, load_config, load_default_config};
mod args;
mod ai;
mod archive;
mod blacklist;
mod favicon;
mod pages;
#[tokio::main]
async fn main() {
#[rocket::launch]
async fn launch() -> _ {
env_logger::init();
let args = args::get_args();
let arc = WebsiteArchive::new("./websites");
let archive_dir: &String = args.get_one("dir").unwrap();
match args.subcommand() {
Some(("serve", serve_args)) => {
let config: &String = serve_args.get_one("config").unwrap();
load_config(config);
let arc = WebsiteArchive::new(archive_dir);
if std::env::var("DATABASE_URL").is_ok() {
let pg = get_pg!();
sqlx::migrate!("./migrations").run(pg).await.unwrap();
}
let archive = arc.clone();
if get_config().ai.is_some() {
tokio::spawn(async move {
EmbedStore::generate_embeddings_for(&archive).await;
});
}
let archive = arc.clone();
tokio::spawn(async move {
webarc::favicon::download_favicons_for_sites(&archive.domains()).await;
});
rocket::build()
.mount(
"/",
routes![
based::htmx::htmx_script_route,
pages::index,
pages::render_website,
pages::domain_info_route,
pages::favicon_route,
pages::vector_search,
pages::render_txt_website
],
)
.manage(arc)
.launch()
.await
.unwrap();
}
Some(("archive", archive_args)) => {
let arc = WebsiteArchive::new(archive_dir);
match archive_args.subcommand() {
Some(("list", list_args)) => {
let json = list_args.get_flag("json");
load_default_config();
let elements = if let Some(domain) = list_args.get_one::<String>("DOMAIN") {
arc.get_domain(domain)
.all_paths()
.into_iter()
.map(|x| x.path().clone())
.collect()
} else {
arc.domains()
};
if json {
println!(
"{}",
serde_json::to_string(&serde_json::json!(elements)).unwrap()
);
} else {
if let Some(domain) = list_args.get_one::<String>("DOMAIN") {
println!("Paths in {domain}:");
} else {
println!("Domains in {}:", archive_dir);
}
if elements.is_empty() {
println!("No domains");
}
for d in elements {
println!("- {d}");
}
}
}
Some(("download", dl_args)) => {
let url: &String = dl_args.get_one("URL").unwrap();
let config: &String = dl_args.get_one("config").unwrap();
load_config(config);
arc.archive_url(url).await;
println!("Saved {url} to archive");
}
Some(("versions", ver_args)) => {
load_default_config();
let domain: &String = ver_args.get_one("DOMAIN").unwrap();
let path: String = if let Some(path) = ver_args.get_one::<String>("PATH") {
path.clone()
} else {
"/".to_string()
};
let versions = arc.get_domain(domain).path(&path).versions();
let json = ver_args.get_flag("json");
if json {
println!("{}", serde_json::to_string(&versions).unwrap());
} else {
println!("Versions for {domain} / {path}:");
for v in versions {
println!("- {v}");
}
}
}
Some(("get", get_args)) => {
load_default_config();
let domain: &String = get_args.get_one("DOMAIN").unwrap();
let path = if let Some(path) = get_args.get_one::<String>("PATH") {
path.clone()
} else {
"/".to_string()
};
let doc = arc.get_domain(domain).path(&path);
let ver = if let Some(ver) = get_args.get_one::<String>("VERSION") {
ver.clone()
} else {
doc.versions().first().unwrap().clone()
};
let md = get_args.get_flag("md");
let content = doc.render_local(Some(ver)).await;
if content.is_none() {
println!("No document found");
std::process::exit(1);
}
if md {
let markdown = html2md::parse_html(&content.unwrap());
println!("{markdown}");
} else {
println!("{}", content.unwrap());
}
}
Some((&_, _)) => {}
None => {}
};
}
Some((&_, _)) => {}
None => {}
if std::env::var("DATABASE_URL").is_ok() {
let pg = get_pg!();
sqlx::migrate!("./migrations").run(pg).await.unwrap();
}
if std::env::var("OLLAMA_URL").is_ok() {
EmbedStore::generate_embeddings_for(&arc).await;
}
let archive = arc.clone();
tokio::spawn(async move {
favicon::download_favicons_for_sites(&archive.domains()).await;
});
rocket::build()
.mount(
"/",
routes![
based::htmx::htmx_script_route,
pages::index,
pages::render_website,
pages::domain_info_route,
pages::favicon_route,
pages::vector_search
],
)
.manage(arc)
}


@ -1,3 +1,7 @@
use based::{
page::Shell,
request::{RequestContext, StringResponse},
};
use maud::{html, PreEscaped};
/// Generates an SVG arrow icon with the specified color.
@ -74,8 +78,20 @@ pub fn gen_path_header(
}
}
pub fn favicon(site: &str) -> PreEscaped<String> {
html! {
img class="h-8 w-8 m-2" src=(format!("/favicon/{site}")) {};
}
pub async fn render_page(content: PreEscaped<String>, ctx: RequestContext) -> StringResponse {
based::page::render_page(
content,
"Website Archive",
ctx,
&Shell::new(
html! {
script src="https://cdn.tailwindcss.com" {};
meta name="viewport" content="width=device-width, initial-scale=1.0" {};
script src="/assets/htmx.min.js" {};
},
html! {},
Some("bg-zinc-950 text-white min-h-screen flex pt-8 justify-center".to_string()),
),
)
.await
}


@ -7,20 +7,18 @@ use based::{
},
};
use maud::{html, PreEscaped};
use rocket::{get, request::FromSegments, State};
use rocket::{get, State};
pub mod component;
use component::*;
use serde_json::json;
use webarc::{
ai::{generate_embedding, remove_data_urls, EmbedStore, SearchResult},
archive::{extract_domains, WebsiteArchive},
conf::get_config,
render_page,
use crate::{
ai::{generate_embedding, DocEmbedding, EmbedStore},
archive::WebsiteArchive,
};
const SEARCH_BAR_STYLE: &str = "w-full px-4 mb-4 py-2 text-white bg-black border-2 border-neon-blue placeholder-neon-blue focus:ring-2 focus:ring-neon-pink focus:outline-none font-mono text-lg";
const SEARCH_BAR_STYLE: &'static str = "w-full px-4 mb-4 py-2 text-white bg-black border-2 border-neon-blue placeholder-neon-blue focus:ring-2 focus:ring-neon-pink focus:outline-none font-mono text-lg";
/// Get the favicon of a domain
#[get("/favicon/<domain>")]
@ -31,8 +29,6 @@ pub async fn favicon_route(domain: &str) -> Option<DataResponse> {
.read_to_end(&mut buf)
.ok()?;
// TODO : Default favicon
Some(DataResponse::new(
buf,
"image/x-icon".to_string(),
@ -61,7 +57,7 @@ pub async fn index(ctx: RequestContext, arc: &State<WebsiteArchive>) -> StringRe
@for site in websites {
a href=(format!("/d/{site}")) class="bg-neutral-900 shadow-md rounded-lg hover:bg-neutral-800 bg-gray-1 hover:cursor-pointer transition-all duration-300 flex flex-col items-center justify-center aspect-square max-w-60" {
div class="bg-blue-500 text-white rounded-full p-4" {
(favicon(&site))
img class="h-8 w-8" src=(format!("/favicon/{site}")) {};
};
p class="mt-4 text-base font-medium" { (site) };
};
@ -88,7 +84,7 @@ pub async fn domain_info_route(
let (path_entries, is_doc) = domain.paths(paths.to_str().unwrap());
let path_seperations: Vec<&str> = paths.to_str().unwrap().split('/').collect();
let domains = extract_domains(&document.render_local(None).await.unwrap_or_default());
// TODO : Show domains being linked on the page
let content = html! {
h2 class="text-xl font-bold mb-4 flex items-center" {
@ -134,89 +130,20 @@ pub async fn domain_info_route(
};
};
};
@if !domains.is_empty() {
div class="max-w-md mx-auto p-4 bg-neutral-900 rounded-lg shadow-md" {
h3 class="font-bold mb-2" { "Domains linked on this page:" };
ul class="space-y-2 p-4" {
@for domain in domains {
a href=(format!("/d/{domain}")) class="flex items-center gap-2 p-3 border bg-neutral-800 rounded hover:shadow-lg transition" {
(favicon(&domain));
span class="font-medium" { (domain) };
};
};
};
};
};
};
render_page(content, ctx).await
}
#[get("/txt/<domain>/<path..>?<time>&<no_data_urls>")]
pub async fn render_txt_website(
domain: &str,
path: PathBuf,
time: Option<&str>,
no_data_urls: Option<&str>,
arc: &State<WebsiteArchive>,
) -> Option<String> {
let document = arc.get_domain(domain).path(path.to_str().unwrap());
let mut content = document
.render_local(time.map(|time| time.to_string()))
.await?;
if no_data_urls.is_some() {
content = remove_data_urls(&content);
}
Some(html2md::parse_html(&content))
}
pub struct PathSegment {
segments: Vec<String>,
}
impl PathSegment {
pub fn to_str(&self) -> String {
self.segments.join("/")
}
}
impl<'r> FromSegments<'r> for PathSegment {
type Error = ();
fn from_segments(
segments: rocket::http::uri::Segments<'r, rocket::http::uri::fmt::Path>,
) -> Result<Self, Self::Error> {
let paths: Vec<_> = segments
.filter_map(|x| {
if x == "." {
return None;
}
if x == ".." {
return None;
}
Some(x.to_string())
})
.collect();
Ok(PathSegment { segments: paths })
}
}
/// Return archived version of `domain` / `path` at `time`
#[get("/s/<domain>/<path..>?<time>")]
pub async fn render_website(
domain: &str,
path: PathSegment,
path: PathBuf,
time: Option<&str>,
arc: &State<WebsiteArchive>,
) -> Option<DataResponse> {
let document = arc.get_domain(domain).path(&path.to_str());
let document = arc.get_domain(domain).path(path.to_str().unwrap());
let content = document
.render_local(time.map(|time| time.to_string()))
@ -228,8 +155,13 @@ pub async fn render_website(
"text/html".to_string(),
Some(60 * 60 * 24),
));
} else if get_config().DOWNLOAD_ON_DEMAND && time.is_none() {
arc.archive_url(&format!("https://{domain}/{}", path.to_str()))
} else if std::env::var("DOWNLOAD_ON_DEMAND")
.unwrap_or("false".to_string())
.as_str()
== "true"
&& time.is_none()
{
arc.archive_url(&format!("https://{domain}/{}", path.to_str().unwrap()))
.await;
let content = document.render_local(None).await?;
@ -244,17 +176,17 @@ pub async fn render_website(
None
}
pub fn gen_search_element(x: &SearchResult) -> PreEscaped<String> {
pub fn gen_search_element(x: &DocEmbedding) -> PreEscaped<String> {
html! {
div class="text-xl font-bold mt-4 p-4 flex items-center w-full max-w-4xl max-h-40 mx-auto bg-neutral-800 shadow-md rounded-lg overflow-hidden border border-neutral-900 hover:cursor-pointer"
hx-get=(format!("/d/{}/{}", x.domain, x.path))
hx-target="#main_content" hx-push-url="true" hx-swap="innerHTML"
{
(favicon(&x.domain))
img class="p-2" src=(format!("/favicon/{}", &x.domain));
a { (x.domain) };
(slash_seperator());
(gen_path_header(x.path.split('/').collect(), &x.domain, false));
p class="font-bold p-2 text-stone-400" { (format!("{:.2} % [{} matches]", x.similarity() * 100.0, x.chunks.len())) };
p class="font-bold p-2 text-stone-400" { (format!("{:.2} %", x.similarity * 100.0)) };
};
}
}
@ -265,7 +197,9 @@ pub async fn vector_search(
page: Option<i64>,
ctx: RequestContext,
) -> Option<StringResponse> {
get_config().ai.as_ref()?;
if std::env::var("OLLAMA_URL").is_err() {
return None;
}
let page = page.unwrap_or(1);
@ -286,14 +220,13 @@ pub async fn vector_search(
EmbedStore::search_vector(&input, limit as i64, offset as i64).await
})
},
1500,
5,
)
.pager(page as u64, vector)
.await;
// API Route
if query.ends_with(".json") {
// TODO : Better search API
return Some(respond_json(&json!(&results.page(page as u64))));
}