update
Some checks failed
ci/woodpecker/push/build Pipeline failed

This commit is contained in:
JMARyA 2025-02-09 00:07:28 +01:00
parent b530ae4dc3
commit 3e77ec5008
Signed by: jmarya
GPG key ID: 901B2ADDF27C2263
11 changed files with 476 additions and 346 deletions

630
Cargo.lock generated

File diff suppressed because it is too large Load diff

View file

@ -16,7 +16,7 @@ tokio = { version = "1.35.1", features = ["full"] }
uuid = { version = "1.8.0", features = ["v4", "serde"] } uuid = { version = "1.8.0", features = ["v4", "serde"] }
sqlx = { version = "0.8", features = ["postgres", "runtime-tokio-native-tls", "derive", "uuid", "chrono", "json"] } sqlx = { version = "0.8", features = ["postgres", "runtime-tokio-native-tls", "derive", "uuid", "chrono", "json"] }
maud = "0.26.0" maud = "0.26.0"
based = { git = "https://git.hydrar.de/jmarya/based", features = ["htmx"] } based = { git = "https://git.hydrar.de/jmarya/based", branch = "ui" }
url = "2.5.4" url = "2.5.4"
reqwest = "0.12.11" reqwest = "0.12.11"
ollama-rs = "0.2.2" ollama-rs = "0.2.2"

View file

@ -17,6 +17,12 @@ BLACKLIST_DOMAINS = [
# The domain the config applies to # The domain the config applies to
domain = "example.com" domain = "example.com"
# Consider an archived page outdated and due for a refresh after this interval (in days)
outdated = 30
# Keep last n versions
keep_n = 5
# Blacklisted Path (Regexes) # Blacklisted Path (Regexes)
blacklist_paths = ["/.*"] blacklist_paths = ["/.*"]
@ -70,4 +76,8 @@ no_video = true
[[websites.domains]] [[websites.domains]]
domain = "docs.flutter.dev" domain = "docs.flutter.dev"
no_javascript = true no_javascript = true
no_video = true no_video = true
[[websites.domains]]
domain = "home-assistant.io"
no_javascript = true

View file

@ -1,6 +1,6 @@
use std::collections::{HashMap, VecDeque}; use std::collections::{HashMap, VecDeque};
use based::{get_pg, request::api::ToAPI, result::LogNoneAndPass}; use based::{get_pg, request::api::ToAPI, result::LogNoneAndPass, ui::components::Shell};
use ollama_rs::generation::embeddings::request::{EmbeddingsInput, GenerateEmbeddingsRequest}; use ollama_rs::generation::embeddings::request::{EmbeddingsInput, GenerateEmbeddingsRequest};
use serde::Serialize; use serde::Serialize;
use serde_json::json; use serde_json::json;
@ -110,6 +110,7 @@ pub trait Embedding {
fn embedding( fn embedding(
&self, &self,
ver: Option<String>, ver: Option<String>,
shell: &Shell,
) -> impl std::future::Future<Output = Option<Vec<Vec<f32>>>>; ) -> impl std::future::Future<Output = Option<Vec<Vec<f32>>>>;
} }
@ -130,7 +131,7 @@ pub fn remove_data_urls(input: &str) -> String {
} }
impl Embedding for Document { impl Embedding for Document {
async fn embedding(&self, ver: Option<String>) -> Option<Vec<Vec<f32>>> { async fn embedding(&self, ver: Option<String>, shell: &Shell) -> Option<Vec<Vec<f32>>> {
let latest = "latest".to_string(); let latest = "latest".to_string();
log::info!( log::info!(
"Generating Vector embeddings for {} / {} @ {}", "Generating Vector embeddings for {} / {} @ {}",
@ -139,7 +140,7 @@ impl Embedding for Document {
ver.as_ref().unwrap_or(&latest) ver.as_ref().unwrap_or(&latest)
); );
let content_html = self.render_local(ver.clone()).await?; let content_html = self.render_local(ver.clone(), shell).await?;
let content = remove_data_urls(&html2md::parse_html(&content_html)); let content = remove_data_urls(&html2md::parse_html(&content_html));
let mut embeddings = Vec::new(); let mut embeddings = Vec::new();
@ -219,9 +220,9 @@ impl EmbedStore {
.unwrap() .unwrap()
} }
pub async fn embed_document(doc: &Document, ver: &str) { pub async fn embed_document(doc: &Document, ver: &str, shell: &Shell) {
if let Some(embed) = doc if let Some(embed) = doc
.embedding(Some(ver.to_string())) .embedding(Some(ver.to_string()), shell)
.await .await
.log_warn_none_and_pass(|| { .log_warn_none_and_pass(|| {
format!( format!(
@ -253,10 +254,10 @@ impl EmbedStore {
} }
} }
pub async fn ensure_embedding(doc: &Document) { pub async fn ensure_embedding(doc: &Document, shell: &Shell) {
for ver in doc.versions() { for ver in doc.versions() {
if Self::get_embedding(doc, Some(ver.as_str())).await.is_none() { if Self::get_embedding(doc, Some(ver.as_str())).await.is_none() {
Self::embed_document(doc, &ver).await; Self::embed_document(doc, &ver, shell).await;
} }
} }
} }
@ -299,25 +300,25 @@ impl EmbedStore {
flat flat
} }
pub async fn generate_embeddings_for(arc: &WebsiteArchive) { pub async fn generate_embeddings_for(arc: &WebsiteArchive, shell: &Shell) {
log::info!("Generating embeddings"); log::info!("Generating embeddings");
for dom in arc.domains() { for dom in arc.domains() {
let dom = arc.get_domain(&dom); let dom = arc.get_domain(&dom);
embed_path(&dom, "/").await; embed_path(&dom, "/", shell).await;
} }
log::info!("Done generating embeddings"); log::info!("Done generating embeddings");
} }
} }
pub async fn embed_path(dom: &Domain, path: &str) { pub async fn embed_path(dom: &Domain, path: &str, shell: &Shell) {
let (paths, is_doc) = dom.paths(path); let (paths, is_doc) = dom.paths(path);
// If the path is a document, process the root path. // If the path is a document, process the root path.
if is_doc { if is_doc {
let doc = dom.path("/"); let doc = dom.path("/");
EmbedStore::ensure_embedding(&doc).await; EmbedStore::ensure_embedding(&doc, shell).await;
} }
// Create a queue to process paths iteratively // Create a queue to process paths iteratively
@ -331,7 +332,7 @@ pub async fn embed_path(dom: &Domain, path: &str) {
if is_doc { if is_doc {
let doc = dom.path(next_path.path()); let doc = dom.path(next_path.path());
EmbedStore::ensure_embedding(&doc).await; EmbedStore::ensure_embedding(&doc, shell).await;
} }
queue.extend(next_paths); queue.extend(next_paths);

View file

@ -1,6 +1,6 @@
use std::{io::Read, path::PathBuf}; use std::{io::Read, path::PathBuf};
use based::request::RequestContext; use based::{request::RequestContext, ui::components::Shell};
use maud::html; use maud::html;
use crate::{blacklist::check_blacklist, conf::get_config, render_page}; use crate::{blacklist::check_blacklist, conf::get_config, render_page};
@ -52,12 +52,17 @@ impl Document {
/// ///
/// # Returns /// # Returns
/// An `Option` containing the rendered content as a string, or `None` if nothing could be rendered. /// An `Option` containing the rendered content as a string, or `None` if nothing could be rendered.
pub async fn render_local(&self, version: Option<String>) -> Option<String> { pub async fn render_local(&self, version: Option<String>, shell: &Shell) -> Option<String> {
if check_blacklist(&self.domain) { if check_blacklist(&self.domain) {
let content = html! { let content = html! {
h3 { "This site is blacklisted" }; h3 { "This site is blacklisted" };
}; };
return Some(render_page(content, RequestContext::default()).await.1 .1); return Some(
render_page(content, RequestContext::default(), shell)
.await
.1
.1,
);
} }
let mut file_path = self.doc_dir(); let mut file_path = self.doc_dir();
@ -100,6 +105,20 @@ impl Document {
file_path file_path
} }
/// Returns the number of whole days elapsed between the date of the first
/// entry of `self.versions()` and today's local date.
///
/// NOTE(review): the name implies the first entry of `versions()` is the most
/// recent one — confirm the ordering `versions()` guarantees.
///
/// # Returns
/// `None` if there are no versions, if the version string is not in
/// `yyyy-mm-dd` format, or if the version date lies in the future.
fn latest_time_since(&self) -> Option<usize> {
    // Bind the list first so the borrow taken by `first()` outlives the `?`.
    let versions = self.versions();
    let newest = versions.first()?;

    // Version names are not guaranteed to be well-formed dates; report
    // "unknown age" instead of panicking on a malformed entry.
    let given_date = chrono::NaiveDate::parse_from_str(newest, "%Y-%m-%d").ok()?;

    let today = chrono::Local::now().date_naive();
    let elapsed_days = today.signed_duration_since(given_date).num_days();

    // A future-dated version yields a negative day count; a plain `as usize`
    // cast would wrap it into a huge value, so convert with a range check.
    usize::try_from(elapsed_days).ok()
}
/// Retrieves available versions of the document. /// Retrieves available versions of the document.
/// ///
/// # Returns /// # Returns

View file

@ -28,6 +28,7 @@ pub fn read_dir(dir: &PathBuf) -> Vec<String> {
/// Rewrite all URLs in `input` to the format `/s/<domain>/<path..>` /// Rewrite all URLs in `input` to the format `/s/<domain>/<path..>`
fn internalize_urls(input: &str, base: &str) -> String { fn internalize_urls(input: &str, base: &str) -> String {
// TODO: fix the regex; URLs that consist of a domain without a path are not captured
let url_pattern = r#"(\ |"|')(?:(<?)(https?:\/\/([a-zA-Z0-9.-]+))?(\/[\w./-]*))"#; let url_pattern = r#"(\ |"|')(?:(<?)(https?:\/\/([a-zA-Z0-9.-]+))?(\/[\w./-]*))"#;
let re = regex::Regex::new(url_pattern).unwrap(); let re = regex::Regex::new(url_pattern).unwrap();

View file

@ -67,6 +67,8 @@ pub struct WebsiteConfig {
#[derive(Debug, Clone, Deserialize)] #[derive(Debug, Clone, Deserialize)]
pub struct DomainConfig { pub struct DomainConfig {
pub domain: String, pub domain: String,
pub outdated: Option<usize>,
pub keep_n: Option<usize>,
pub blacklist_paths: Option<Vec<String>>, pub blacklist_paths: Option<Vec<String>>,
pub no_audio: Option<bool>, pub no_audio: Option<bool>,
pub no_video: Option<bool>, pub no_video: Option<bool>,
@ -83,6 +85,8 @@ impl Default for DomainConfig {
Self { Self {
domain: String::new(), domain: String::new(),
blacklist_paths: None, blacklist_paths: None,
outdated: None,
keep_n: None,
no_audio: Some(false), no_audio: Some(false),
no_video: Some(false), no_video: Some(false),
no_image: Some(false), no_image: Some(false),

View file

@ -1,7 +1,5 @@
use based::{ use based::request::{RequestContext, StringResponse};
page::Shell, use based::ui::components::Shell;
request::{RequestContext, StringResponse},
};
use maud::{html, PreEscaped}; use maud::{html, PreEscaped};
pub mod ai; pub mod ai;
@ -10,20 +8,10 @@ pub mod blacklist;
pub mod conf; pub mod conf;
pub mod favicon; pub mod favicon;
pub async fn render_page(content: PreEscaped<String>, ctx: RequestContext) -> StringResponse { pub async fn render_page(
based::page::render_page( content: PreEscaped<String>,
content, ctx: RequestContext,
"Website Archive", shell: &Shell,
ctx, ) -> StringResponse {
&Shell::new( shell.render_page(content, "Website Archive", ctx).await
html! {
script src="https://cdn.tailwindcss.com" {};
meta name="viewport" content="width=device-width, initial-scale=1.0" {};
script src="/assets/htmx.min.js" {};
},
html! {},
Some("bg-zinc-950 text-white min-h-screen flex pt-8 justify-center".to_string()),
),
)
.await
} }

View file

@ -1,5 +1,9 @@
use based::asset::AssetRoutes;
use based::get_pg; use based::get_pg;
use based::ui::components::Shell;
use based::ui::prelude::*;
use rocket::routes; use rocket::routes;
use rocket::time::format_description::modifier::Padding;
use webarc::ai::EmbedStore; use webarc::ai::EmbedStore;
use webarc::archive::WebsiteArchive; use webarc::archive::WebsiteArchive;
use webarc::conf::{get_config, load_config, load_default_config}; use webarc::conf::{get_config, load_config, load_default_config};
@ -15,6 +19,8 @@ async fn main() {
let archive_dir: &String = args.get_one("dir").unwrap(); let archive_dir: &String = args.get_one("dir").unwrap();
let shell = get_shell();
match args.subcommand() { match args.subcommand() {
Some(("serve", serve_args)) => { Some(("serve", serve_args)) => {
let config: &String = serve_args.get_one("config").unwrap(); let config: &String = serve_args.get_one("config").unwrap();
@ -30,7 +36,7 @@ async fn main() {
let archive = arc.clone(); let archive = arc.clone();
if get_config().ai.is_some() { if get_config().ai.is_some() {
tokio::spawn(async move { tokio::spawn(async move {
EmbedStore::generate_embeddings_for(&archive).await; EmbedStore::generate_embeddings_for(&archive, &get_shell()).await;
}); });
} }
@ -40,10 +46,10 @@ async fn main() {
}); });
rocket::build() rocket::build()
.mount_assets()
.mount( .mount(
"/", "/",
routes![ routes![
based::htmx::htmx_script_route,
pages::index, pages::index,
pages::render_website, pages::render_website,
pages::domain_info_route, pages::domain_info_route,
@ -53,6 +59,7 @@ async fn main() {
], ],
) )
.manage(arc) .manage(arc)
.manage(shell)
.launch() .launch()
.await .await
.unwrap(); .unwrap();
@ -146,7 +153,7 @@ async fn main() {
let md = get_args.get_flag("md"); let md = get_args.get_flag("md");
let content = doc.render_local(Some(ver)).await; let content = doc.render_local(Some(ver), &shell).await;
if content.is_none() { if content.is_none() {
println!("No document found"); println!("No document found");
@ -168,3 +175,20 @@ async fn main() {
None => {} None => {}
} }
} }
/// Builds the `Shell` used to render every page of the application.
///
/// The shell is created from two `Nothing()` values and a `Zinc::_950`
/// coloured background wrapping a screen-height container, whose content is
/// an empty white text element with top padding `ScreenValue::_8`.
/// `use_ui()` is applied to the finished shell.
///
/// NOTE(review): this file also imports
/// `rocket::time::format_description::modifier::Padding`; confirm that
/// `Padding` below resolves to the `based` UI widget and not to that type.
pub fn get_shell() -> Shell {
    let first_slot = Nothing();
    let second_slot = Nothing();

    // Innermost element: empty white text, padded at the top.
    let padded_text = Padding(Text("").white()).top(ScreenValue::_8);

    // Stretch the content to at least the screen height and colour the backdrop.
    let full_height = MinHeight(ScreenValue::screen, padded_text);
    let backdrop = Background(full_height).color(Zinc::_950);

    Shell::new(first_slot, second_slot, backdrop).use_ui()
}
// TODO : redownload after threshold
// TODO : keep n versions
// TODO : archive cleanup code

View file

@ -1,4 +1,5 @@
use maud::{html, PreEscaped}; use based::ui::prelude::*;
use maud::{html, PreEscaped, Render};
/// Generates an SVG arrow icon with the specified color. /// Generates an SVG arrow icon with the specified color.
/// ///
@ -20,9 +21,9 @@ pub fn arrow_icon(color: &str) -> PreEscaped<String> {
/// # Returns /// # Returns
/// A `PreEscaped<String>` containing the HTML markup for a slash separator. /// A `PreEscaped<String>` containing the HTML markup for a slash separator.
pub fn slash_seperator() -> PreEscaped<String> { pub fn slash_seperator() -> PreEscaped<String> {
html! { Padding(Text("/").bold().color(&Gray::_400))
p class="font-bold p-2 text-gray-400" { " / " }; .all(ScreenValue::_2)
} .render()
} }
/// Generates a hyperlink for a specific path within a domain. /// Generates a hyperlink for a specific path within a domain.
@ -42,9 +43,12 @@ pub fn gen_path_link(
domain: &str, domain: &str,
) -> PreEscaped<String> { ) -> PreEscaped<String> {
let upto: Vec<&str> = path_seperations.iter().take(index + 1).cloned().collect(); let upto: Vec<&str> = path_seperations.iter().take(index + 1).cloned().collect();
html! { Link(
a href=(format!("/d/{}/{}", domain, upto.join("/"))) { (path)} &format!("/d/{}/{}", domain, upto.join("/")),
} path.to_string(),
)
.use_htmx()
.render()
} }
/// Generates a breadcrumb-like header for a path within a domain. /// Generates a breadcrumb-like header for a path within a domain.

View file

@ -1,10 +1,11 @@
use std::{io::Read, path::PathBuf}; use std::{io::Read, path::PathBuf, sync::Arc};
use based::ui::prelude::*;
use based::{ use based::{
page::search::Search,
request::{ request::{
api::GeneratedPager, assets::DataResponse, respond_json, RequestContext, StringResponse, api::GeneratedPager, assets::DataResponse, respond_json, RequestContext, StringResponse,
}, },
ui::components::{Search, Shell},
}; };
use maud::{html, PreEscaped}; use maud::{html, PreEscaped};
use rocket::{get, request::FromSegments, State}; use rocket::{get, request::FromSegments, State};
@ -20,6 +21,9 @@ use webarc::{
render_page, render_page,
}; };
// TODO : Implement archive timeline page (chrono sorted documents)
// TODO : impl archive index to db
const SEARCH_BAR_STYLE: &str = "w-full px-4 mb-4 py-2 text-white bg-black border-2 border-neon-blue placeholder-neon-blue focus:ring-2 focus:ring-neon-pink focus:outline-none font-mono text-lg"; const SEARCH_BAR_STYLE: &str = "w-full px-4 mb-4 py-2 text-white bg-black border-2 border-neon-blue placeholder-neon-blue focus:ring-2 focus:ring-neon-pink focus:outline-none font-mono text-lg";
/// Get the favicon of a domain /// Get the favicon of a domain
@ -42,7 +46,11 @@ pub async fn favicon_route(domain: &str) -> Option<DataResponse> {
/// Websites Overview /// Websites Overview
#[get("/")] #[get("/")]
pub async fn index(ctx: RequestContext, arc: &State<WebsiteArchive>) -> StringResponse { pub async fn index(
ctx: RequestContext,
arc: &State<WebsiteArchive>,
shell: &State<Shell>,
) -> StringResponse {
let websites = arc.domains(); let websites = arc.domains();
let content = html! { let content = html! {
@ -71,7 +79,7 @@ pub async fn index(ctx: RequestContext, arc: &State<WebsiteArchive>) -> StringRe
} }
}; };
render_page(content, ctx).await render_page(content, ctx, &shell).await
} }
/// Overview on <domain> / <path> /// Overview on <domain> / <path>
@ -81,6 +89,7 @@ pub async fn domain_info_route(
domain: &str, domain: &str,
paths: PathBuf, paths: PathBuf,
arc: &State<WebsiteArchive>, arc: &State<WebsiteArchive>,
shell: &State<Shell>,
) -> StringResponse { ) -> StringResponse {
let domain = arc.get_domain(domain); let domain = arc.get_domain(domain);
let document = domain.path(paths.to_str().unwrap()); let document = domain.path(paths.to_str().unwrap());
@ -88,10 +97,15 @@ pub async fn domain_info_route(
let (path_entries, is_doc) = domain.paths(paths.to_str().unwrap()); let (path_entries, is_doc) = domain.paths(paths.to_str().unwrap());
let path_seperations: Vec<&str> = paths.to_str().unwrap().split('/').collect(); let path_seperations: Vec<&str> = paths.to_str().unwrap().split('/').collect();
let domains = extract_domains(&document.render_local(None).await.unwrap_or_default()); let domains = extract_domains(
&document
.render_local(None, &shell)
.await
.unwrap_or_default(),
);
let content = html! { let content = html! {
h2 class="text-xl font-bold mb-4 flex items-center" { h2 class="text-xl font-bold mb-4 flex items-center w-fit mx-auto" {
img class="p-2" src=(format!("/favicon/{}", &domain.name)) {}; img class="p-2" src=(format!("/favicon/{}", &domain.name)) {};
a href=(format!("/d/{}", &domain.name)) { (domain.name) }; a href=(format!("/d/{}", &domain.name)) { (domain.name) };
(slash_seperator()) (slash_seperator())
@ -150,7 +164,7 @@ pub async fn domain_info_route(
}; };
}; };
render_page(content, ctx).await render_page(content, ctx, &shell).await
} }
#[get("/txt/<domain>/<path..>?<time>&<no_data_urls>")] #[get("/txt/<domain>/<path..>?<time>&<no_data_urls>")]
@ -160,11 +174,12 @@ pub async fn render_txt_website(
time: Option<&str>, time: Option<&str>,
no_data_urls: Option<&str>, no_data_urls: Option<&str>,
arc: &State<WebsiteArchive>, arc: &State<WebsiteArchive>,
shell: &State<Shell>,
) -> Option<String> { ) -> Option<String> {
let document = arc.get_domain(domain).path(path.to_str().unwrap()); let document = arc.get_domain(domain).path(path.to_str().unwrap());
let mut content = document let mut content = document
.render_local(time.map(|time| time.to_string())) .render_local(time.map(|time| time.to_string()), &shell)
.await?; .await?;
if no_data_urls.is_some() { if no_data_urls.is_some() {
@ -215,11 +230,12 @@ pub async fn render_website(
path: PathSegment, path: PathSegment,
time: Option<&str>, time: Option<&str>,
arc: &State<WebsiteArchive>, arc: &State<WebsiteArchive>,
shell: &State<Shell>,
) -> Option<DataResponse> { ) -> Option<DataResponse> {
let document = arc.get_domain(domain).path(&path.to_str()); let document = arc.get_domain(domain).path(&path.to_str());
let content = document let content = document
.render_local(time.map(|time| time.to_string())) .render_local(time.map(|time| time.to_string()), &shell)
.await; .await;
if let Some(content) = content { if let Some(content) = content {
@ -232,7 +248,7 @@ pub async fn render_website(
arc.archive_url(&format!("https://{domain}/{}", path.to_str())) arc.archive_url(&format!("https://{domain}/{}", path.to_str()))
.await; .await;
let content = document.render_local(None).await?; let content = document.render_local(None, &shell).await?;
return Some(DataResponse::new( return Some(DataResponse::new(
content.as_bytes().to_vec(), content.as_bytes().to_vec(),
@ -264,6 +280,7 @@ pub async fn vector_search(
query: Option<&str>, query: Option<&str>,
page: Option<i64>, page: Option<i64>,
ctx: RequestContext, ctx: RequestContext,
shell: &State<Shell>,
) -> Option<StringResponse> { ) -> Option<StringResponse> {
get_config().ai.as_ref()?; get_config().ai.as_ref()?;
@ -278,10 +295,12 @@ pub async fn vector_search(
let real_query = query.trim_end_matches(".json"); let real_query = query.trim_end_matches(".json");
// Search Results // Search Results
let vector = pgvector::Vector::from(generate_embedding(real_query.to_string()).await?); let vector = Arc::new(pgvector::Vector::from(
generate_embedding(real_query.to_string()).await?,
));
let results = GeneratedPager::new( let results = GeneratedPager::new(
|input, offset, limit| { |input: Arc<pgvector::Vector>, offset, limit| {
Box::pin(async move { Box::pin(async move {
EmbedStore::search_vector(&input, limit as i64, offset as i64).await EmbedStore::search_vector(&input, limit as i64, offset as i64).await
}) })
@ -299,10 +318,10 @@ pub async fn vector_search(
let content = search.build_response(&ctx, results, page, real_query, gen_search_element); let content = search.build_response(&ctx, results, page, real_query, gen_search_element);
return Some(render_page(content, ctx).await); return Some(render_page(content, ctx, &shell).await);
} }
// Return new search site // Return new search site
let content = search.build("", html! {}); let content = search.build("", html! {});
Some(render_page(content, ctx).await) Some(render_page(content, ctx, &shell).await)
} }