update
Some checks failed
ci/woodpecker/push/build Pipeline failed

This commit is contained in:
JMARyA 2025-02-09 00:07:28 +01:00
parent b530ae4dc3
commit 3e77ec5008
Signed by: jmarya
GPG key ID: 901B2ADDF27C2263
11 changed files with 476 additions and 346 deletions

630
Cargo.lock generated

File diff suppressed because it is too large Load diff

View file

@ -16,7 +16,7 @@ tokio = { version = "1.35.1", features = ["full"] }
uuid = { version = "1.8.0", features = ["v4", "serde"] }
sqlx = { version = "0.8", features = ["postgres", "runtime-tokio-native-tls", "derive", "uuid", "chrono", "json"] }
maud = "0.26.0"
based = { git = "https://git.hydrar.de/jmarya/based", features = ["htmx"] }
based = { git = "https://git.hydrar.de/jmarya/based", branch = "ui" }
url = "2.5.4"
reqwest = "0.12.11"
ollama-rs = "0.2.2"

View file

@ -17,6 +17,12 @@ BLACKLIST_DOMAINS = [
# The domain the config applies to
domain = "example.com"
# Consider a refresh after interval (in days)
outdated = 30
# Keep last n versions
keep_n = 5
# Blacklisted Path (Regexes)
blacklist_paths = ["/.*"]
@ -70,4 +76,8 @@ no_video = true
[[websites.domains]]
domain = "docs.flutter.dev"
no_javascript = true
no_video = true
no_video = true
[[websites.domains]]
domain = "home-assistant.io"
no_javascript = true

View file

@ -1,6 +1,6 @@
use std::collections::{HashMap, VecDeque};
use based::{get_pg, request::api::ToAPI, result::LogNoneAndPass};
use based::{get_pg, request::api::ToAPI, result::LogNoneAndPass, ui::components::Shell};
use ollama_rs::generation::embeddings::request::{EmbeddingsInput, GenerateEmbeddingsRequest};
use serde::Serialize;
use serde_json::json;
@ -110,6 +110,7 @@ pub trait Embedding {
fn embedding(
&self,
ver: Option<String>,
shell: &Shell,
) -> impl std::future::Future<Output = Option<Vec<Vec<f32>>>>;
}
@ -130,7 +131,7 @@ pub fn remove_data_urls(input: &str) -> String {
}
impl Embedding for Document {
async fn embedding(&self, ver: Option<String>) -> Option<Vec<Vec<f32>>> {
async fn embedding(&self, ver: Option<String>, shell: &Shell) -> Option<Vec<Vec<f32>>> {
let latest = "latest".to_string();
log::info!(
"Generating Vector embeddings for {} / {} @ {}",
@ -139,7 +140,7 @@ impl Embedding for Document {
ver.as_ref().unwrap_or(&latest)
);
let content_html = self.render_local(ver.clone()).await?;
let content_html = self.render_local(ver.clone(), shell).await?;
let content = remove_data_urls(&html2md::parse_html(&content_html));
let mut embeddings = Vec::new();
@ -219,9 +220,9 @@ impl EmbedStore {
.unwrap()
}
pub async fn embed_document(doc: &Document, ver: &str) {
pub async fn embed_document(doc: &Document, ver: &str, shell: &Shell) {
if let Some(embed) = doc
.embedding(Some(ver.to_string()))
.embedding(Some(ver.to_string()), shell)
.await
.log_warn_none_and_pass(|| {
format!(
@ -253,10 +254,10 @@ impl EmbedStore {
}
}
pub async fn ensure_embedding(doc: &Document) {
pub async fn ensure_embedding(doc: &Document, shell: &Shell) {
for ver in doc.versions() {
if Self::get_embedding(doc, Some(ver.as_str())).await.is_none() {
Self::embed_document(doc, &ver).await;
Self::embed_document(doc, &ver, shell).await;
}
}
}
@ -299,25 +300,25 @@ impl EmbedStore {
flat
}
pub async fn generate_embeddings_for(arc: &WebsiteArchive) {
pub async fn generate_embeddings_for(arc: &WebsiteArchive, shell: &Shell) {
log::info!("Generating embeddings");
for dom in arc.domains() {
let dom = arc.get_domain(&dom);
embed_path(&dom, "/").await;
embed_path(&dom, "/", shell).await;
}
log::info!("Done generating embeddings");
}
}
pub async fn embed_path(dom: &Domain, path: &str) {
pub async fn embed_path(dom: &Domain, path: &str, shell: &Shell) {
let (paths, is_doc) = dom.paths(path);
// If the path is a document, process the root path.
if is_doc {
let doc = dom.path("/");
EmbedStore::ensure_embedding(&doc).await;
EmbedStore::ensure_embedding(&doc, shell).await;
}
// Create a queue to process paths iteratively
@ -331,7 +332,7 @@ pub async fn embed_path(dom: &Domain, path: &str) {
if is_doc {
let doc = dom.path(next_path.path());
EmbedStore::ensure_embedding(&doc).await;
EmbedStore::ensure_embedding(&doc, shell).await;
}
queue.extend(next_paths);

View file

@ -1,6 +1,6 @@
use std::{io::Read, path::PathBuf};
use based::request::RequestContext;
use based::{request::RequestContext, ui::components::Shell};
use maud::html;
use crate::{blacklist::check_blacklist, conf::get_config, render_page};
@ -52,12 +52,17 @@ impl Document {
///
/// # Returns
/// An `Option` containing the rendered content as a string, or `None` if nothing could be rendered.
pub async fn render_local(&self, version: Option<String>) -> Option<String> {
pub async fn render_local(&self, version: Option<String>, shell: &Shell) -> Option<String> {
if check_blacklist(&self.domain) {
let content = html! {
h3 { "This site is blacklisted" };
};
return Some(render_page(content, RequestContext::default()).await.1 .1);
return Some(
render_page(content, RequestContext::default(), shell)
.await
.1
.1,
);
}
let mut file_path = self.doc_dir();
@ -100,6 +105,20 @@ impl Document {
file_path
}
/// Days elapsed since the most recent archived version of this document.
///
/// # Returns
/// `None` when the document has no versions, or when the newest version's
/// directory name is not a valid `yyyy-mm-dd` date (previously this
/// panicked via `.expect()` on a malformed name — version names come from
/// the filesystem and are not guaranteed well-formed).
fn latest_time_since(&self) -> Option<usize> {
    let versions = self.versions();
    let t_str = versions.first()?;
    // Version directories are named by capture date, e.g. "2025-02-09".
    let given_date = chrono::NaiveDate::parse_from_str(t_str, "%Y-%m-%d").ok()?;
    let today = chrono::Local::now().date_naive();
    // Clamp future-dated versions to 0 instead of letting a negative
    // day count wrap to a huge number through the `as usize` cast.
    let days = today.signed_duration_since(given_date).num_days().max(0);
    Some(days as usize)
}
/// Retrieves available versions of the document.
///
/// # Returns

View file

@ -28,6 +28,7 @@ pub fn read_dir(dir: &PathBuf) -> Vec<String> {
/// Rewrite all URLs in `input` to the format `/s/<domain>/<path..>`
fn internalize_urls(input: &str, base: &str) -> String {
// todo : fix regex, domains without path are not captured
let url_pattern = r#"(\ |"|')(?:(<?)(https?:\/\/([a-zA-Z0-9.-]+))?(\/[\w./-]*))"#;
let re = regex::Regex::new(url_pattern).unwrap();

View file

@ -67,6 +67,8 @@ pub struct WebsiteConfig {
#[derive(Debug, Clone, Deserialize)]
pub struct DomainConfig {
pub domain: String,
pub outdated: Option<usize>,
pub keep_n: Option<usize>,
pub blacklist_paths: Option<Vec<String>>,
pub no_audio: Option<bool>,
pub no_video: Option<bool>,
@ -83,6 +85,8 @@ impl Default for DomainConfig {
Self {
domain: String::new(),
blacklist_paths: None,
outdated: None,
keep_n: None,
no_audio: Some(false),
no_video: Some(false),
no_image: Some(false),

View file

@ -1,7 +1,5 @@
use based::{
page::Shell,
request::{RequestContext, StringResponse},
};
use based::request::{RequestContext, StringResponse};
use based::ui::components::Shell;
use maud::{html, PreEscaped};
pub mod ai;
@ -10,20 +8,10 @@ pub mod blacklist;
pub mod conf;
pub mod favicon;
pub async fn render_page(content: PreEscaped<String>, ctx: RequestContext) -> StringResponse {
based::page::render_page(
content,
"Website Archive",
ctx,
&Shell::new(
html! {
script src="https://cdn.tailwindcss.com" {};
meta name="viewport" content="width=device-width, initial-scale=1.0" {};
script src="/assets/htmx.min.js" {};
},
html! {},
Some("bg-zinc-950 text-white min-h-screen flex pt-8 justify-center".to_string()),
),
)
.await
/// Renders `content` into a complete HTML page via the shared UI `shell`.
///
/// The page title is always `"Website Archive"`. `ctx` is the per-request
/// context passed through to the shell renderer — presumably used for
/// htmx/partial-render detection; confirm against `based::ui::components::Shell`.
pub async fn render_page(
    content: PreEscaped<String>,
    ctx: RequestContext,
    shell: &Shell,
) -> StringResponse {
    shell.render_page(content, "Website Archive", ctx).await
}

View file

@ -1,5 +1,9 @@
use based::asset::AssetRoutes;
use based::get_pg;
use based::ui::components::Shell;
use based::ui::prelude::*;
use rocket::routes;
use rocket::time::format_description::modifier::Padding;
use webarc::ai::EmbedStore;
use webarc::archive::WebsiteArchive;
use webarc::conf::{get_config, load_config, load_default_config};
@ -15,6 +19,8 @@ async fn main() {
let archive_dir: &String = args.get_one("dir").unwrap();
let shell = get_shell();
match args.subcommand() {
Some(("serve", serve_args)) => {
let config: &String = serve_args.get_one("config").unwrap();
@ -30,7 +36,7 @@ async fn main() {
let archive = arc.clone();
if get_config().ai.is_some() {
tokio::spawn(async move {
EmbedStore::generate_embeddings_for(&archive).await;
EmbedStore::generate_embeddings_for(&archive, &get_shell()).await;
});
}
@ -40,10 +46,10 @@ async fn main() {
});
rocket::build()
.mount_assets()
.mount(
"/",
routes![
based::htmx::htmx_script_route,
pages::index,
pages::render_website,
pages::domain_info_route,
@ -53,6 +59,7 @@ async fn main() {
],
)
.manage(arc)
.manage(shell)
.launch()
.await
.unwrap();
@ -146,7 +153,7 @@ async fn main() {
let md = get_args.get_flag("md");
let content = doc.render_local(Some(ver)).await;
let content = doc.render_local(Some(ver), &shell).await;
if content.is_none() {
println!("No document found");
@ -168,3 +175,20 @@ async fn main() {
None => {}
}
}
/// Constructs the application-wide UI [`Shell`] used to render every page.
///
/// NOTE(review): the builder types here (`Nothing`, `Background`, `MinHeight`,
/// `Padding`, `Text`, `ScreenValue`, `Zinc`) come from `based::ui::prelude`;
/// their exact semantics are assumed from naming — confirm against the
/// `based` crate (branch "ui").
pub fn get_shell() -> Shell {
    Shell::new(
        // First two arguments appear to be empty head/header slots — verify.
        Nothing(),
        Nothing(),
        // Body styling: full-screen min height, top padding, zinc-950 background.
        Background(MinHeight(
            ScreenValue::screen,
            Padding(Text("").white()).top(ScreenValue::_8),
        ))
        .color(Zinc::_950),
    )
    .use_ui()
}
// TODO : redownload after threshold
// TODO : keep n versions
// TODO : archive cleanup code

View file

@ -1,4 +1,5 @@
use maud::{html, PreEscaped};
use based::ui::prelude::*;
use maud::{html, PreEscaped, Render};
/// Generates an SVG arrow icon with the specified color.
///
@ -20,9 +21,9 @@ pub fn arrow_icon(color: &str) -> PreEscaped<String> {
/// # Returns
/// A `PreEscaped<String>` containing the HTML markup for a slash separator.
pub fn slash_seperator() -> PreEscaped<String> {
html! {
p class="font-bold p-2 text-gray-400" { " / " };
}
Padding(Text("/").bold().color(&Gray::_400))
.all(ScreenValue::_2)
.render()
}
/// Generates a hyperlink for a specific path within a domain.
@ -42,9 +43,12 @@ pub fn gen_path_link(
domain: &str,
) -> PreEscaped<String> {
let upto: Vec<&str> = path_seperations.iter().take(index + 1).cloned().collect();
html! {
a href=(format!("/d/{}/{}", domain, upto.join("/"))) { (path)}
}
Link(
&format!("/d/{}/{}", domain, upto.join("/")),
path.to_string(),
)
.use_htmx()
.render()
}
/// Generates a breadcrumb-like header for a path within a domain.

View file

@ -1,10 +1,11 @@
use std::{io::Read, path::PathBuf};
use std::{io::Read, path::PathBuf, sync::Arc};
use based::ui::prelude::*;
use based::{
page::search::Search,
request::{
api::GeneratedPager, assets::DataResponse, respond_json, RequestContext, StringResponse,
},
ui::components::{Search, Shell},
};
use maud::{html, PreEscaped};
use rocket::{get, request::FromSegments, State};
@ -20,6 +21,9 @@ use webarc::{
render_page,
};
// TODO : Implement archive timeline page (chrono sorted documents)
// TODO : impl archive index to db
const SEARCH_BAR_STYLE: &str = "w-full px-4 mb-4 py-2 text-white bg-black border-2 border-neon-blue placeholder-neon-blue focus:ring-2 focus:ring-neon-pink focus:outline-none font-mono text-lg";
/// Get the favicon of a domain
@ -42,7 +46,11 @@ pub async fn favicon_route(domain: &str) -> Option<DataResponse> {
/// Websites Overview
#[get("/")]
pub async fn index(ctx: RequestContext, arc: &State<WebsiteArchive>) -> StringResponse {
pub async fn index(
ctx: RequestContext,
arc: &State<WebsiteArchive>,
shell: &State<Shell>,
) -> StringResponse {
let websites = arc.domains();
let content = html! {
@ -71,7 +79,7 @@ pub async fn index(ctx: RequestContext, arc: &State<WebsiteArchive>) -> StringRe
}
};
render_page(content, ctx).await
render_page(content, ctx, &shell).await
}
/// Overview on <domain> / <path>
@ -81,6 +89,7 @@ pub async fn domain_info_route(
domain: &str,
paths: PathBuf,
arc: &State<WebsiteArchive>,
shell: &State<Shell>,
) -> StringResponse {
let domain = arc.get_domain(domain);
let document = domain.path(paths.to_str().unwrap());
@ -88,10 +97,15 @@ pub async fn domain_info_route(
let (path_entries, is_doc) = domain.paths(paths.to_str().unwrap());
let path_seperations: Vec<&str> = paths.to_str().unwrap().split('/').collect();
let domains = extract_domains(&document.render_local(None).await.unwrap_or_default());
let domains = extract_domains(
&document
.render_local(None, &shell)
.await
.unwrap_or_default(),
);
let content = html! {
h2 class="text-xl font-bold mb-4 flex items-center" {
h2 class="text-xl font-bold mb-4 flex items-center w-fit mx-auto" {
img class="p-2" src=(format!("/favicon/{}", &domain.name)) {};
a href=(format!("/d/{}", &domain.name)) { (domain.name) };
(slash_seperator())
@ -150,7 +164,7 @@ pub async fn domain_info_route(
};
};
render_page(content, ctx).await
render_page(content, ctx, &shell).await
}
#[get("/txt/<domain>/<path..>?<time>&<no_data_urls>")]
@ -160,11 +174,12 @@ pub async fn render_txt_website(
time: Option<&str>,
no_data_urls: Option<&str>,
arc: &State<WebsiteArchive>,
shell: &State<Shell>,
) -> Option<String> {
let document = arc.get_domain(domain).path(path.to_str().unwrap());
let mut content = document
.render_local(time.map(|time| time.to_string()))
.render_local(time.map(|time| time.to_string()), &shell)
.await?;
if no_data_urls.is_some() {
@ -215,11 +230,12 @@ pub async fn render_website(
path: PathSegment,
time: Option<&str>,
arc: &State<WebsiteArchive>,
shell: &State<Shell>,
) -> Option<DataResponse> {
let document = arc.get_domain(domain).path(&path.to_str());
let content = document
.render_local(time.map(|time| time.to_string()))
.render_local(time.map(|time| time.to_string()), &shell)
.await;
if let Some(content) = content {
@ -232,7 +248,7 @@ pub async fn render_website(
arc.archive_url(&format!("https://{domain}/{}", path.to_str()))
.await;
let content = document.render_local(None).await?;
let content = document.render_local(None, &shell).await?;
return Some(DataResponse::new(
content.as_bytes().to_vec(),
@ -264,6 +280,7 @@ pub async fn vector_search(
query: Option<&str>,
page: Option<i64>,
ctx: RequestContext,
shell: &State<Shell>,
) -> Option<StringResponse> {
get_config().ai.as_ref()?;
@ -278,10 +295,12 @@ pub async fn vector_search(
let real_query = query.trim_end_matches(".json");
// Search Results
let vector = pgvector::Vector::from(generate_embedding(real_query.to_string()).await?);
let vector = Arc::new(pgvector::Vector::from(
generate_embedding(real_query.to_string()).await?,
));
let results = GeneratedPager::new(
|input, offset, limit| {
|input: Arc<pgvector::Vector>, offset, limit| {
Box::pin(async move {
EmbedStore::search_vector(&input, limit as i64, offset as i64).await
})
@ -299,10 +318,10 @@ pub async fn vector_search(
let content = search.build_response(&ctx, results, page, real_query, gen_search_element);
return Some(render_page(content, ctx).await);
return Some(render_page(content, ctx, &shell).await);
}
// Return new search site
let content = search.build("", html! {});
Some(render_page(content, ctx).await)
Some(render_page(content, ctx, &shell).await)
}