parent
b530ae4dc3
commit
3e77ec5008
11 changed files with 476 additions and 346 deletions
630
Cargo.lock
generated
630
Cargo.lock
generated
File diff suppressed because it is too large
Load diff
|
@ -16,7 +16,7 @@ tokio = { version = "1.35.1", features = ["full"] }
|
|||
uuid = { version = "1.8.0", features = ["v4", "serde"] }
|
||||
sqlx = { version = "0.8", features = ["postgres", "runtime-tokio-native-tls", "derive", "uuid", "chrono", "json"] }
|
||||
maud = "0.26.0"
|
||||
based = { git = "https://git.hydrar.de/jmarya/based", features = ["htmx"] }
|
||||
based = { git = "https://git.hydrar.de/jmarya/based", branch = "ui" }
|
||||
url = "2.5.4"
|
||||
reqwest = "0.12.11"
|
||||
ollama-rs = "0.2.2"
|
||||
|
|
10
config.toml
10
config.toml
|
@ -17,6 +17,12 @@ BLACKLIST_DOMAINS = [
|
|||
# The domain the config applies to
|
||||
domain = "example.com"
|
||||
|
||||
# Consider a document outdated and due for a refresh after this many days
|
||||
outdated = 30
|
||||
|
||||
# Keep last n versions
|
||||
keep_n = 5
|
||||
|
||||
# Blacklisted paths (regexes)
|
||||
blacklist_paths = ["/.*"]
|
||||
|
||||
|
@ -71,3 +77,7 @@ no_video = true
|
|||
domain = "docs.flutter.dev"
|
||||
no_javascript = true
|
||||
no_video = true
|
||||
|
||||
[[websites.domains]]
|
||||
domain = "home-assistant.io"
|
||||
no_javascript = true
|
||||
|
|
25
src/ai.rs
25
src/ai.rs
|
@ -1,6 +1,6 @@
|
|||
use std::collections::{HashMap, VecDeque};
|
||||
|
||||
use based::{get_pg, request::api::ToAPI, result::LogNoneAndPass};
|
||||
use based::{get_pg, request::api::ToAPI, result::LogNoneAndPass, ui::components::Shell};
|
||||
use ollama_rs::generation::embeddings::request::{EmbeddingsInput, GenerateEmbeddingsRequest};
|
||||
use serde::Serialize;
|
||||
use serde_json::json;
|
||||
|
@ -110,6 +110,7 @@ pub trait Embedding {
|
|||
fn embedding(
|
||||
&self,
|
||||
ver: Option<String>,
|
||||
shell: &Shell,
|
||||
) -> impl std::future::Future<Output = Option<Vec<Vec<f32>>>>;
|
||||
}
|
||||
|
||||
|
@ -130,7 +131,7 @@ pub fn remove_data_urls(input: &str) -> String {
|
|||
}
|
||||
|
||||
impl Embedding for Document {
|
||||
async fn embedding(&self, ver: Option<String>) -> Option<Vec<Vec<f32>>> {
|
||||
async fn embedding(&self, ver: Option<String>, shell: &Shell) -> Option<Vec<Vec<f32>>> {
|
||||
let latest = "latest".to_string();
|
||||
log::info!(
|
||||
"Generating Vector embeddings for {} / {} @ {}",
|
||||
|
@ -139,7 +140,7 @@ impl Embedding for Document {
|
|||
ver.as_ref().unwrap_or(&latest)
|
||||
);
|
||||
|
||||
let content_html = self.render_local(ver.clone()).await?;
|
||||
let content_html = self.render_local(ver.clone(), shell).await?;
|
||||
let content = remove_data_urls(&html2md::parse_html(&content_html));
|
||||
|
||||
let mut embeddings = Vec::new();
|
||||
|
@ -219,9 +220,9 @@ impl EmbedStore {
|
|||
.unwrap()
|
||||
}
|
||||
|
||||
pub async fn embed_document(doc: &Document, ver: &str) {
|
||||
pub async fn embed_document(doc: &Document, ver: &str, shell: &Shell) {
|
||||
if let Some(embed) = doc
|
||||
.embedding(Some(ver.to_string()))
|
||||
.embedding(Some(ver.to_string()), shell)
|
||||
.await
|
||||
.log_warn_none_and_pass(|| {
|
||||
format!(
|
||||
|
@ -253,10 +254,10 @@ impl EmbedStore {
|
|||
}
|
||||
}
|
||||
|
||||
pub async fn ensure_embedding(doc: &Document) {
|
||||
pub async fn ensure_embedding(doc: &Document, shell: &Shell) {
|
||||
for ver in doc.versions() {
|
||||
if Self::get_embedding(doc, Some(ver.as_str())).await.is_none() {
|
||||
Self::embed_document(doc, &ver).await;
|
||||
Self::embed_document(doc, &ver, shell).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -299,25 +300,25 @@ impl EmbedStore {
|
|||
flat
|
||||
}
|
||||
|
||||
pub async fn generate_embeddings_for(arc: &WebsiteArchive) {
|
||||
pub async fn generate_embeddings_for(arc: &WebsiteArchive, shell: &Shell) {
|
||||
log::info!("Generating embeddings");
|
||||
|
||||
for dom in arc.domains() {
|
||||
let dom = arc.get_domain(&dom);
|
||||
embed_path(&dom, "/").await;
|
||||
embed_path(&dom, "/", shell).await;
|
||||
}
|
||||
|
||||
log::info!("Done generating embeddings");
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn embed_path(dom: &Domain, path: &str) {
|
||||
pub async fn embed_path(dom: &Domain, path: &str, shell: &Shell) {
|
||||
let (paths, is_doc) = dom.paths(path);
|
||||
|
||||
// If the path is a document, process the root path.
|
||||
if is_doc {
|
||||
let doc = dom.path("/");
|
||||
EmbedStore::ensure_embedding(&doc).await;
|
||||
EmbedStore::ensure_embedding(&doc, shell).await;
|
||||
}
|
||||
|
||||
// Create a queue to process paths iteratively
|
||||
|
@ -331,7 +332,7 @@ pub async fn embed_path(dom: &Domain, path: &str) {
|
|||
|
||||
if is_doc {
|
||||
let doc = dom.path(next_path.path());
|
||||
EmbedStore::ensure_embedding(&doc).await;
|
||||
EmbedStore::ensure_embedding(&doc, shell).await;
|
||||
}
|
||||
|
||||
queue.extend(next_paths);
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
use std::{io::Read, path::PathBuf};
|
||||
|
||||
use based::request::RequestContext;
|
||||
use based::{request::RequestContext, ui::components::Shell};
|
||||
use maud::html;
|
||||
|
||||
use crate::{blacklist::check_blacklist, conf::get_config, render_page};
|
||||
|
@ -52,12 +52,17 @@ impl Document {
|
|||
///
|
||||
/// # Returns
|
||||
/// An `Option` containing the rendered content as a string, or `None` if nothing could be rendered.
|
||||
pub async fn render_local(&self, version: Option<String>) -> Option<String> {
|
||||
pub async fn render_local(&self, version: Option<String>, shell: &Shell) -> Option<String> {
|
||||
if check_blacklist(&self.domain) {
|
||||
let content = html! {
|
||||
h3 { "This site is blacklisted" };
|
||||
};
|
||||
return Some(render_page(content, RequestContext::default()).await.1 .1);
|
||||
return Some(
|
||||
render_page(content, RequestContext::default(), shell)
|
||||
.await
|
||||
.1
|
||||
.1,
|
||||
);
|
||||
}
|
||||
|
||||
let mut file_path = self.doc_dir();
|
||||
|
@ -100,6 +105,20 @@ impl Document {
|
|||
file_path
|
||||
}
|
||||
|
||||
fn latest_time_since(&self) -> Option<usize> {
|
||||
if let Some(t_str) = self.versions().first() {
|
||||
let given_date = chrono::NaiveDate::parse_from_str(t_str, "%Y-%m-%d")
|
||||
.expect("Invalid date format. Expected yyyy-mm-dd.");
|
||||
|
||||
let today = chrono::Local::now().date_naive();
|
||||
|
||||
let duration = today.signed_duration_since(given_date);
|
||||
return Some(duration.num_days() as usize);
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
/// Retrieves available versions of the document.
|
||||
///
|
||||
/// # Returns
|
||||
|
|
|
@ -28,6 +28,7 @@ pub fn read_dir(dir: &PathBuf) -> Vec<String> {
|
|||
|
||||
/// Rewrite all URLs in `input` to the format `/s/<domain>/<path..>`
|
||||
fn internalize_urls(input: &str, base: &str) -> String {
|
||||
// TODO : fix regex, URLs that consist only of a domain (no path) are not captured
|
||||
let url_pattern = r#"(\ |"|')(?:(<?)(https?:\/\/([a-zA-Z0-9.-]+))?(\/[\w./-]*))"#;
|
||||
let re = regex::Regex::new(url_pattern).unwrap();
|
||||
|
||||
|
|
|
@ -67,6 +67,8 @@ pub struct WebsiteConfig {
|
|||
#[derive(Debug, Clone, Deserialize)]
|
||||
pub struct DomainConfig {
|
||||
pub domain: String,
|
||||
pub outdated: Option<usize>,
|
||||
pub keep_n: Option<usize>,
|
||||
pub blacklist_paths: Option<Vec<String>>,
|
||||
pub no_audio: Option<bool>,
|
||||
pub no_video: Option<bool>,
|
||||
|
@ -83,6 +85,8 @@ impl Default for DomainConfig {
|
|||
Self {
|
||||
domain: String::new(),
|
||||
blacklist_paths: None,
|
||||
outdated: None,
|
||||
keep_n: None,
|
||||
no_audio: Some(false),
|
||||
no_video: Some(false),
|
||||
no_image: Some(false),
|
||||
|
|
28
src/lib.rs
28
src/lib.rs
|
@ -1,7 +1,5 @@
|
|||
use based::{
|
||||
page::Shell,
|
||||
request::{RequestContext, StringResponse},
|
||||
};
|
||||
use based::request::{RequestContext, StringResponse};
|
||||
use based::ui::components::Shell;
|
||||
use maud::{html, PreEscaped};
|
||||
|
||||
pub mod ai;
|
||||
|
@ -10,20 +8,10 @@ pub mod blacklist;
|
|||
pub mod conf;
|
||||
pub mod favicon;
|
||||
|
||||
pub async fn render_page(content: PreEscaped<String>, ctx: RequestContext) -> StringResponse {
|
||||
based::page::render_page(
|
||||
content,
|
||||
"Website Archive",
|
||||
ctx,
|
||||
&Shell::new(
|
||||
html! {
|
||||
script src="https://cdn.tailwindcss.com" {};
|
||||
meta name="viewport" content="width=device-width, initial-scale=1.0" {};
|
||||
script src="/assets/htmx.min.js" {};
|
||||
},
|
||||
html! {},
|
||||
Some("bg-zinc-950 text-white min-h-screen flex pt-8 justify-center".to_string()),
|
||||
),
|
||||
)
|
||||
.await
|
||||
pub async fn render_page(
|
||||
content: PreEscaped<String>,
|
||||
ctx: RequestContext,
|
||||
shell: &Shell,
|
||||
) -> StringResponse {
|
||||
shell.render_page(content, "Website Archive", ctx).await
|
||||
}
|
||||
|
|
30
src/main.rs
30
src/main.rs
|
@ -1,5 +1,9 @@
|
|||
use based::asset::AssetRoutes;
|
||||
use based::get_pg;
|
||||
use based::ui::components::Shell;
|
||||
use based::ui::prelude::*;
|
||||
use rocket::routes;
|
||||
use rocket::time::format_description::modifier::Padding;
|
||||
use webarc::ai::EmbedStore;
|
||||
use webarc::archive::WebsiteArchive;
|
||||
use webarc::conf::{get_config, load_config, load_default_config};
|
||||
|
@ -15,6 +19,8 @@ async fn main() {
|
|||
|
||||
let archive_dir: &String = args.get_one("dir").unwrap();
|
||||
|
||||
let shell = get_shell();
|
||||
|
||||
match args.subcommand() {
|
||||
Some(("serve", serve_args)) => {
|
||||
let config: &String = serve_args.get_one("config").unwrap();
|
||||
|
@ -30,7 +36,7 @@ async fn main() {
|
|||
let archive = arc.clone();
|
||||
if get_config().ai.is_some() {
|
||||
tokio::spawn(async move {
|
||||
EmbedStore::generate_embeddings_for(&archive).await;
|
||||
EmbedStore::generate_embeddings_for(&archive, &get_shell()).await;
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -40,10 +46,10 @@ async fn main() {
|
|||
});
|
||||
|
||||
rocket::build()
|
||||
.mount_assets()
|
||||
.mount(
|
||||
"/",
|
||||
routes![
|
||||
based::htmx::htmx_script_route,
|
||||
pages::index,
|
||||
pages::render_website,
|
||||
pages::domain_info_route,
|
||||
|
@ -53,6 +59,7 @@ async fn main() {
|
|||
],
|
||||
)
|
||||
.manage(arc)
|
||||
.manage(shell)
|
||||
.launch()
|
||||
.await
|
||||
.unwrap();
|
||||
|
@ -146,7 +153,7 @@ async fn main() {
|
|||
|
||||
let md = get_args.get_flag("md");
|
||||
|
||||
let content = doc.render_local(Some(ver)).await;
|
||||
let content = doc.render_local(Some(ver), &shell).await;
|
||||
|
||||
if content.is_none() {
|
||||
println!("No document found");
|
||||
|
@ -168,3 +175,20 @@ async fn main() {
|
|||
None => {}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_shell() -> Shell {
|
||||
Shell::new(
|
||||
Nothing(),
|
||||
Nothing(),
|
||||
Background(MinHeight(
|
||||
ScreenValue::screen,
|
||||
Padding(Text("").white()).top(ScreenValue::_8),
|
||||
))
|
||||
.color(Zinc::_950),
|
||||
)
|
||||
.use_ui()
|
||||
}
|
||||
|
||||
// TODO : redownload after threshold
|
||||
// TODO : keep n versions
|
||||
// TODO : archive cleanup code
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
use maud::{html, PreEscaped};
|
||||
use based::ui::prelude::*;
|
||||
use maud::{html, PreEscaped, Render};
|
||||
|
||||
/// Generates an SVG arrow icon with the specified color.
|
||||
///
|
||||
|
@ -20,9 +21,9 @@ pub fn arrow_icon(color: &str) -> PreEscaped<String> {
|
|||
/// # Returns
|
||||
/// A `PreEscaped<String>` containing the HTML markup for a slash separator.
|
||||
pub fn slash_seperator() -> PreEscaped<String> {
|
||||
html! {
|
||||
p class="font-bold p-2 text-gray-400" { " / " };
|
||||
}
|
||||
Padding(Text("/").bold().color(&Gray::_400))
|
||||
.all(ScreenValue::_2)
|
||||
.render()
|
||||
}
|
||||
|
||||
/// Generates a hyperlink for a specific path within a domain.
|
||||
|
@ -42,9 +43,12 @@ pub fn gen_path_link(
|
|||
domain: &str,
|
||||
) -> PreEscaped<String> {
|
||||
let upto: Vec<&str> = path_seperations.iter().take(index + 1).cloned().collect();
|
||||
html! {
|
||||
a href=(format!("/d/{}/{}", domain, upto.join("/"))) { (path)}
|
||||
}
|
||||
Link(
|
||||
&format!("/d/{}/{}", domain, upto.join("/")),
|
||||
path.to_string(),
|
||||
)
|
||||
.use_htmx()
|
||||
.render()
|
||||
}
|
||||
|
||||
/// Generates a breadcrumb-like header for a path within a domain.
|
||||
|
|
|
@ -1,10 +1,11 @@
|
|||
use std::{io::Read, path::PathBuf};
|
||||
use std::{io::Read, path::PathBuf, sync::Arc};
|
||||
|
||||
use based::ui::prelude::*;
|
||||
use based::{
|
||||
page::search::Search,
|
||||
request::{
|
||||
api::GeneratedPager, assets::DataResponse, respond_json, RequestContext, StringResponse,
|
||||
},
|
||||
ui::components::{Search, Shell},
|
||||
};
|
||||
use maud::{html, PreEscaped};
|
||||
use rocket::{get, request::FromSegments, State};
|
||||
|
@ -20,6 +21,9 @@ use webarc::{
|
|||
render_page,
|
||||
};
|
||||
|
||||
// TODO : Implement archive timeline page (chrono sorted documents)
|
||||
// TODO : impl archive index to db
|
||||
|
||||
const SEARCH_BAR_STYLE: &str = "w-full px-4 mb-4 py-2 text-white bg-black border-2 border-neon-blue placeholder-neon-blue focus:ring-2 focus:ring-neon-pink focus:outline-none font-mono text-lg";
|
||||
|
||||
/// Get the favicon of a domain
|
||||
|
@ -42,7 +46,11 @@ pub async fn favicon_route(domain: &str) -> Option<DataResponse> {
|
|||
|
||||
/// Websites Overview
|
||||
#[get("/")]
|
||||
pub async fn index(ctx: RequestContext, arc: &State<WebsiteArchive>) -> StringResponse {
|
||||
pub async fn index(
|
||||
ctx: RequestContext,
|
||||
arc: &State<WebsiteArchive>,
|
||||
shell: &State<Shell>,
|
||||
) -> StringResponse {
|
||||
let websites = arc.domains();
|
||||
|
||||
let content = html! {
|
||||
|
@ -71,7 +79,7 @@ pub async fn index(ctx: RequestContext, arc: &State<WebsiteArchive>) -> StringRe
|
|||
}
|
||||
};
|
||||
|
||||
render_page(content, ctx).await
|
||||
render_page(content, ctx, &shell).await
|
||||
}
|
||||
|
||||
/// Overview on <domain> / <path>
|
||||
|
@ -81,6 +89,7 @@ pub async fn domain_info_route(
|
|||
domain: &str,
|
||||
paths: PathBuf,
|
||||
arc: &State<WebsiteArchive>,
|
||||
shell: &State<Shell>,
|
||||
) -> StringResponse {
|
||||
let domain = arc.get_domain(domain);
|
||||
let document = domain.path(paths.to_str().unwrap());
|
||||
|
@ -88,10 +97,15 @@ pub async fn domain_info_route(
|
|||
let (path_entries, is_doc) = domain.paths(paths.to_str().unwrap());
|
||||
let path_seperations: Vec<&str> = paths.to_str().unwrap().split('/').collect();
|
||||
|
||||
let domains = extract_domains(&document.render_local(None).await.unwrap_or_default());
|
||||
let domains = extract_domains(
|
||||
&document
|
||||
.render_local(None, &shell)
|
||||
.await
|
||||
.unwrap_or_default(),
|
||||
);
|
||||
|
||||
let content = html! {
|
||||
h2 class="text-xl font-bold mb-4 flex items-center" {
|
||||
h2 class="text-xl font-bold mb-4 flex items-center w-fit mx-auto" {
|
||||
img class="p-2" src=(format!("/favicon/{}", &domain.name)) {};
|
||||
a href=(format!("/d/{}", &domain.name)) { (domain.name) };
|
||||
(slash_seperator())
|
||||
|
@ -150,7 +164,7 @@ pub async fn domain_info_route(
|
|||
};
|
||||
};
|
||||
|
||||
render_page(content, ctx).await
|
||||
render_page(content, ctx, &shell).await
|
||||
}
|
||||
|
||||
#[get("/txt/<domain>/<path..>?<time>&<no_data_urls>")]
|
||||
|
@ -160,11 +174,12 @@ pub async fn render_txt_website(
|
|||
time: Option<&str>,
|
||||
no_data_urls: Option<&str>,
|
||||
arc: &State<WebsiteArchive>,
|
||||
shell: &State<Shell>,
|
||||
) -> Option<String> {
|
||||
let document = arc.get_domain(domain).path(path.to_str().unwrap());
|
||||
|
||||
let mut content = document
|
||||
.render_local(time.map(|time| time.to_string()))
|
||||
.render_local(time.map(|time| time.to_string()), &shell)
|
||||
.await?;
|
||||
|
||||
if no_data_urls.is_some() {
|
||||
|
@ -215,11 +230,12 @@ pub async fn render_website(
|
|||
path: PathSegment,
|
||||
time: Option<&str>,
|
||||
arc: &State<WebsiteArchive>,
|
||||
shell: &State<Shell>,
|
||||
) -> Option<DataResponse> {
|
||||
let document = arc.get_domain(domain).path(&path.to_str());
|
||||
|
||||
let content = document
|
||||
.render_local(time.map(|time| time.to_string()))
|
||||
.render_local(time.map(|time| time.to_string()), &shell)
|
||||
.await;
|
||||
|
||||
if let Some(content) = content {
|
||||
|
@ -232,7 +248,7 @@ pub async fn render_website(
|
|||
arc.archive_url(&format!("https://{domain}/{}", path.to_str()))
|
||||
.await;
|
||||
|
||||
let content = document.render_local(None).await?;
|
||||
let content = document.render_local(None, &shell).await?;
|
||||
|
||||
return Some(DataResponse::new(
|
||||
content.as_bytes().to_vec(),
|
||||
|
@ -264,6 +280,7 @@ pub async fn vector_search(
|
|||
query: Option<&str>,
|
||||
page: Option<i64>,
|
||||
ctx: RequestContext,
|
||||
shell: &State<Shell>,
|
||||
) -> Option<StringResponse> {
|
||||
get_config().ai.as_ref()?;
|
||||
|
||||
|
@ -278,10 +295,12 @@ pub async fn vector_search(
|
|||
let real_query = query.trim_end_matches(".json");
|
||||
|
||||
// Search Results
|
||||
let vector = pgvector::Vector::from(generate_embedding(real_query.to_string()).await?);
|
||||
let vector = Arc::new(pgvector::Vector::from(
|
||||
generate_embedding(real_query.to_string()).await?,
|
||||
));
|
||||
|
||||
let results = GeneratedPager::new(
|
||||
|input, offset, limit| {
|
||||
|input: Arc<pgvector::Vector>, offset, limit| {
|
||||
Box::pin(async move {
|
||||
EmbedStore::search_vector(&input, limit as i64, offset as i64).await
|
||||
})
|
||||
|
@ -299,10 +318,10 @@ pub async fn vector_search(
|
|||
|
||||
let content = search.build_response(&ctx, results, page, real_query, gen_search_element);
|
||||
|
||||
return Some(render_page(content, ctx).await);
|
||||
return Some(render_page(content, ctx, &shell).await);
|
||||
}
|
||||
|
||||
// Return new search site
|
||||
let content = search.build("", html! {});
|
||||
Some(render_page(content, ctx).await)
|
||||
Some(render_page(content, ctx, &shell).await)
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue