From 5b545780a4d5edd75d648a621d9eedf1633a938b Mon Sep 17 00:00:00 2001
From: JMARyA
Date: Sun, 9 Feb 2025 22:03:33 +0100
Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20archive=20index?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 config.toml                   |  6 ++++
 migrations/0002_doc_index.sql |  7 ++++
 src/archive/mod.rs            | 61 +++++++++++++++++++++++++++++++++++
 src/conf.rs                   | 16 +++++++++
 src/lib.rs                    |  2 +-
 src/main.rs                   |  8 ++---
 src/pages/mod.rs              | 60 +++++++++++++++++++++++++++-------
 7 files changed, 143 insertions(+), 17 deletions(-)
 create mode 100644 migrations/0002_doc_index.sql

diff --git a/config.toml b/config.toml
index b35ace6..de3dc5b 100644
--- a/config.toml
+++ b/config.toml
@@ -12,6 +12,12 @@ BLACKLIST_DOMAINS = [
     "youtube" # YouTube
 ]
 
+# Consider an archived page outdated after this interval (in days) (Global)
+outdated = 3
+
+# Keep only the last n versions (Global)
+keep_n = 5
+
 # Domain configuration (Example)
 [[websites.domains]]
 # The domain the config applies to
diff --git a/migrations/0002_doc_index.sql b/migrations/0002_doc_index.sql
new file mode 100644
index 0000000..505e0f2
--- /dev/null
+++ b/migrations/0002_doc_index.sql
@@ -0,0 +1,7 @@
+
+CREATE TABLE IF NOT EXISTS document_index (
+    domain TEXT NOT NULL,
+    path TEXT NOT NULL,
+    version DATE NOT NULL,
+    PRIMARY KEY (domain, path, version)
+);
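The new document_index table keys each archived snapshot by (domain, path, version), so listing the versions of a page becomes a single indexed query instead of a filesystem scan. A rough sketch of such a lookup, not part of this patch: latest_versions is a hypothetical helper, and get_pg!() is the pool macro from the based crate used elsewhere in this diff.

// Sketch only: list the archived versions of one document, newest first,
// by querying the document_index table created in this migration.
async fn latest_versions(domain: &str, path: &str) -> Vec<chrono::NaiveDate> {
    sqlx::query_scalar(
        "SELECT version FROM document_index \
         WHERE domain = $1 AND path = $2 \
         ORDER BY version DESC",
    )
    .bind(domain)
    .bind(path)
    .fetch_all(get_pg!())
    .await
    .unwrap_or_default()
}

The composite primary key also makes the ON CONFLICT ... DO NOTHING insert in index_document (below) idempotent, so re-running the indexer over an existing archive is safe.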
diff --git a/src/archive/mod.rs b/src/archive/mod.rs
index adb6e5c..5888407 100644
--- a/src/archive/mod.rs
+++ b/src/archive/mod.rs
@@ -8,6 +8,7 @@ use crate::{
 mod document;
 mod domain;
 
+use based::get_pg;
 pub use document::Document;
 pub use domain::*;
 
@@ -88,6 +89,8 @@ pub fn extract_domains(input: &str) -> Vec<String> {
     domains
 }
 
+// TODO : impl archive index to db
+
 /// Represents a directory containing archived websites
 #[derive(Debug, Clone)]
 pub struct WebsiteArchive {
@@ -231,3 +234,61 @@ fn run_command(cmd: &[&str]) {
     let status = child.wait_with_output().unwrap();
     assert!(status.status.success());
 }
+
+pub async fn index_archive_db(arc: &WebsiteArchive) {
+    log::info!("Indexing archive");
+
+    for dom in arc.domains() {
+        let dom = arc.get_domain(&dom);
+        index_path(&dom, "/").await;
+    }
+
+    log::info!("Done indexing archive");
+}
+
+pub async fn index_path(dom: &Domain, path: &str) {
+    let (paths, is_doc) = dom.paths(path);
+
+    // If the given path is itself a document, index it directly.
+    if is_doc {
+        let doc = dom.path(path);
+        index_document(&doc).await;
+    }
+
+    // Process child paths iteratively with a queue instead of recursion
+    let mut queue = std::collections::VecDeque::new();
+
+    // Seed the queue with the initial paths
+    queue.extend(paths);
+
+    while let Some(next_path) = queue.pop_front() {
+        let (next_paths, is_doc) = dom.paths(next_path.path());
+
+        if is_doc {
+            let doc = dom.path(next_path.path());
+            index_document(&doc).await;
+        }
+
+        queue.extend(next_paths);
+    }
+}
+
+pub async fn index_document(doc: &Document) {
+    for version_str in &doc.versions() {
+        if let Ok(version) = chrono::NaiveDate::parse_from_str(version_str, "%Y-%m-%d") {
+            sqlx::query(
+                r#"
+            INSERT INTO document_index (domain, path, version)
+            VALUES ($1, $2, $3)
+            ON CONFLICT (domain, path, version) DO NOTHING
+            "#,
+            )
+            .bind(&doc.domain)
+            .bind(&doc.path)
+            .bind(version)
+            .execute(get_pg!())
+            .await
+            .unwrap();
+        }
+    }
+}
diff --git a/src/conf.rs b/src/conf.rs
index 753b3cc..a2c824f 100644
--- a/src/conf.rs
+++ b/src/conf.rs
@@ -49,6 +49,20 @@ impl Config {
 
         None
     }
+
+    pub fn get_outdated(&self, domain: &str) -> Option<usize> {
+        if let Some(conf) = self.get_domain_config(domain) {
+            if let Some(outdated) = conf.outdated {
+                return Some(outdated);
+            }
+        }
+
+        if let Some(outdated) = self.websites.as_ref().map(|x| x.outdated) {
+            return outdated;
+        }
+
+        None
+    }
 }
 
 #[allow(non_snake_case)]
@@ -61,6 +75,8 @@ pub struct AIConfig {
 #[derive(Debug, Clone, Deserialize)]
 pub struct WebsiteConfig {
     pub BLACKLIST_DOMAINS: Option<Vec<String>>,
+    pub outdated: Option<usize>,
+    pub keep_n: Option<usize>,
     pub domains: Option<Vec<DomainConfig>>,
 }
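get_outdated resolves the per-domain outdated setting first and only then falls back to the global websites value, mirroring how the rest of the config is layered. A minimal sketch of how a handler might use it, assuming the is_older_than and redownload helpers added in the src/pages/mod.rs hunk below; latest_version, arc, document, and shell are hypothetical local bindings, not part of this diff.

// Sketch only: re-archive a document once its newest snapshot is older
// than the configured per-domain (or global) threshold in days.
if let Some(days) = get_config().get_outdated(domain) {
    if is_older_than(&latest_version, days) {
        // Fetch a fresh copy and serve the re-rendered result.
        return redownload(&arc, domain, path, &document, &shell).await;
    }
}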
diff --git a/src/lib.rs b/src/lib.rs
index 4aae351..8b4eb5b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,6 +1,6 @@
 use based::request::{RequestContext, StringResponse};
 use based::ui::components::Shell;
-use maud::{html, PreEscaped};
+use maud::PreEscaped;
 
 pub mod ai;
 pub mod archive;
diff --git a/src/main.rs b/src/main.rs
index cd04e07..91487fe 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -3,9 +3,8 @@ use based::get_pg;
 use based::ui::components::Shell;
 use based::ui::prelude::*;
 use rocket::routes;
-use rocket::time::format_description::modifier::Padding;
 use webarc::ai::EmbedStore;
-use webarc::archive::WebsiteArchive;
+use webarc::archive::{index_archive_db, WebsiteArchive};
 use webarc::conf::{get_config, load_config, load_default_config};
 
 mod args;
@@ -45,6 +44,9 @@ async fn main() {
         webarc::favicon::download_favicons_for_sites(&archive.domains()).await;
     });
 
+    let archive = arc.clone();
+    tokio::spawn(async move { index_archive_db(&archive).await });
+
     rocket::build()
         .mount_assets()
         .mount(
@@ -189,6 +191,4 @@ pub fn get_shell() -> Shell {
         .use_ui()
 }
 
-// TODO : redownload after threshold
-// TODO : keep n versions
 // TODO : archive cleanup code
diff --git a/src/pages/mod.rs b/src/pages/mod.rs
index cab43bb..f186d53 100644
--- a/src/pages/mod.rs
+++ b/src/pages/mod.rs
@@ -1,6 +1,5 @@
 use std::{io::Read, path::PathBuf, sync::Arc};
 
-use based::ui::prelude::*;
 use based::{
     request::{
         api::GeneratedPager, assets::DataResponse, respond_json, RequestContext, StringResponse,
@@ -14,6 +13,7 @@ pub mod component;
 use component::*;
 use serde_json::json;
 
+use webarc::archive::Document;
 use webarc::{
     ai::{generate_embedding, remove_data_urls, EmbedStore, SearchResult},
     archive::{extract_domains, WebsiteArchive},
@@ -22,7 +22,6 @@ use webarc::{
 };
 
 // TODO : Implement archive timeline page (chrono sorted documents)
-// TODO : impl archive index to db
 
 const SEARCH_BAR_STYLE: &str = "w-full px-4 mb-4 py-2 text-white bg-black border-2 border-neon-blue placeholder-neon-blue focus:ring-2 focus:ring-neon-pink focus:outline-none font-mono text-lg";
@@ -193,6 +192,12 @@ pub struct PathSegment {
     segments: Vec<String>,
 }
 
+impl std::fmt::Display for PathSegment {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(&self.to_str())
+    }
+}
+
 impl PathSegment {
     pub fn to_str(&self) -> String {
         self.segments.join("/")
@@ -223,6 +228,33 @@ impl<'r> FromSegments<'r> for PathSegment {
     }
 }
 
+fn is_older_than(date_str: &str, num_days: usize) -> bool {
+    let date =
+        chrono::NaiveDate::parse_from_str(date_str, "%Y-%m-%d").expect("Invalid date format");
+    let today = chrono::Utc::now().date_naive();
+    let diff = today - date;
+    diff.num_days() > num_days as i64
+}
+
+pub async fn redownload(
+    arc: &WebsiteArchive,
+    domain: &str,
+    path: PathSegment,
+    document: &Document,
+    shell: &Shell,
+) -> Option<DataResponse> {
+    arc.archive_url(&format!("https://{domain}/{}", path.to_str()))
+        .await;
+
+    let content = document.render_local(None, shell).await?;
+
+    Some(DataResponse::new(
+        content.as_bytes().to_vec(),
+        "text/html".to_string(),
+        Some(60 * 60 * 24),
+    ))
+}
+
 /// Return archived version of `domain` / `path` at `time`
 #[get("/s/<domain>/<path..>?