archive index

JMARyA 2025-02-09 22:03:33 +01:00
parent 3e77ec5008
commit 5b545780a4
Signed by: jmarya
GPG key ID: 901B2ADDF27C2263
7 changed files with 143 additions and 17 deletions

View file

@@ -12,6 +12,12 @@ BLACKLIST_DOMAINS = [
"youtube" # YouTube
]
# Consider a document outdated after this interval (in days) (Global)
outdated = 3
# Keep last n versions (Global)
keep_n = 5
# Domain configuration (Example)
[[websites.domains]]
# The domain the config applies to
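These two globals feed the lookup logic added further down: Config::get_outdated consults a matching [[websites.domains]] entry first, so a per-domain value overrides the global default.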

View file

@@ -0,0 +1,7 @@
CREATE TABLE IF NOT EXISTS document_index (
domain TEXT NOT NULL,
path TEXT NOT NULL,
version DATE NOT NULL,
PRIMARY KEY (domain, path, version)
);
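For reference, a minimal sketch of how this table could be read back from the Rust side with sqlx (the helper name is hypothetical and assumes sqlx's chrono feature, so that the DATE column maps to chrono::NaiveDate):

// Hypothetical helper, not part of this commit: newest archived version of a page.
// Uses the same `get_pg!()` pool macro as the indexing code below.
async fn latest_version(domain: &str, path: &str) -> Option<chrono::NaiveDate> {
    sqlx::query_scalar(
        "SELECT version FROM document_index
         WHERE domain = $1 AND path = $2
         ORDER BY version DESC LIMIT 1",
    )
    .bind(domain)
    .bind(path)
    .fetch_optional(get_pg!())
    .await
    .ok()
    .flatten()
}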

View file

@@ -8,6 +8,7 @@ use crate::{
mod document;
mod domain;
use based::get_pg;
pub use document::Document;
pub use domain::*;
@@ -88,6 +89,8 @@ pub fn extract_domains(input: &str) -> Vec<String> {
domains
}
// TODO : impl archive index to db
/// Represents a directory containing archived websites
#[derive(Debug, Clone)]
pub struct WebsiteArchive {
@@ -231,3 +234,61 @@ fn run_command(cmd: &[&str]) {
let status = child.wait_with_output().unwrap();
assert!(status.status.success());
}
pub async fn index_archive_db(arc: &WebsiteArchive) {
log::info!("Indexing archive");
for dom in arc.domains() {
let dom = arc.get_domain(&dom);
index_path(&dom, "/").await;
}
log::info!("Done indexing archive");
}
pub async fn index_path(dom: &Domain, path: &str) {
let (paths, is_doc) = dom.paths(path);
// If the starting path is itself a document, index it directly.
if is_doc {
let doc = dom.path(path);
index_document(&doc).await;
}
// Create a queue to process paths iteratively
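// (Recursive async calls would need boxed futures, so traversal uses an explicit queue instead.)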
let mut queue = std::collections::VecDeque::new();
// Add the initial paths to the queue
queue.extend(paths);
while let Some(next_path) = queue.pop_front() {
let (next_paths, is_doc) = dom.paths(next_path.path());
if is_doc {
let doc = dom.path(next_path.path());
index_document(&doc).await;
}
queue.extend(next_paths);
}
}
pub async fn index_document(doc: &Document) {
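// Versions are YYYY-MM-DD date strings; entries that fail to parse are skipped.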
for version_str in &doc.versions() {
if let Ok(version) = chrono::NaiveDate::parse_from_str(version_str, "%Y-%m-%d") {
sqlx::query(
r#"
INSERT INTO document_index (domain, path, version)
VALUES ($1, $2, $3)
ON CONFLICT (domain, path, version) DO NOTHING
"#,
)
.bind(&doc.domain)
.bind(&doc.path)
.bind(version)
.execute(get_pg!())
.await
.unwrap();
}
}
}
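Thanks to the ON CONFLICT ... DO NOTHING clause, index_document is idempotent: re-running the indexer over an existing archive simply skips versions that are already recorded.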

View file

@@ -49,6 +49,20 @@ impl Config {
None
}
pub fn get_outdated(&self, domain: &str) -> Option<usize> {
if let Some(conf) = self.get_domain_config(domain) {
if let Some(outdated) = conf.outdated {
return Some(outdated);
}
}
self.websites.as_ref().and_then(|conf| conf.outdated)
}
}
#[allow(non_snake_case)]
@@ -61,6 +75,8 @@ pub struct AIConfig {
#[derive(Debug, Clone, Deserialize)]
pub struct WebsiteConfig {
pub BLACKLIST_DOMAINS: Option<Vec<String>>,
pub outdated: Option<usize>,
pub keep_n: Option<usize>,
pub domains: Option<Vec<DomainConfig>>,
}
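keep_n gains a config field here but no accessor yet (the version-cleanup TODOs later in this commit are still open). If one lands later it would presumably mirror get_outdated; a minimal sketch under that assumption:

// Hypothetical accessor, not part of this commit; assumes DomainConfig
// also gains a keep_n override alongside outdated.
pub fn get_keep_n(&self, domain: &str) -> Option<usize> {
    if let Some(conf) = self.get_domain_config(domain) {
        if let Some(keep_n) = conf.keep_n {
            return Some(keep_n);
        }
    }
    self.websites.as_ref().and_then(|conf| conf.keep_n)
}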

View file

@@ -1,6 +1,6 @@
use based::request::{RequestContext, StringResponse};
use based::ui::components::Shell;
use maud::{html, PreEscaped};
use maud::PreEscaped;
pub mod ai;
pub mod archive;

View file

@@ -3,9 +3,8 @@ use based::get_pg;
use based::ui::components::Shell;
use based::ui::prelude::*;
use rocket::routes;
use rocket::time::format_description::modifier::Padding;
use webarc::ai::EmbedStore;
use webarc::archive::WebsiteArchive;
use webarc::archive::{index_archive_db, WebsiteArchive};
use webarc::conf::{get_config, load_config, load_default_config};
mod args;
@@ -45,6 +44,9 @@ async fn main() {
webarc::favicon::download_favicons_for_sites(&archive.domains()).await;
});
let archive = arc.clone();
tokio::spawn(async move { index_archive_db(&archive).await });
rocket::build()
.mount_assets()
.mount(
@@ -189,6 +191,4 @@ pub fn get_shell() -> Shell {
.use_ui()
}
// TODO : redownload after threshold
// TODO : keep n versions
// TODO : archive cleanup code

View file

@@ -1,6 +1,5 @@
use std::{io::Read, path::PathBuf, sync::Arc};
use based::ui::prelude::*;
use based::{
request::{
api::GeneratedPager, assets::DataResponse, respond_json, RequestContext, StringResponse,
@@ -14,6 +13,7 @@ pub mod component;
use component::*;
use serde_json::json;
use webarc::archive::Document;
use webarc::{
ai::{generate_embedding, remove_data_urls, EmbedStore, SearchResult},
archive::{extract_domains, WebsiteArchive},
@@ -22,7 +22,6 @@ use webarc::{
};
// TODO : Implement archive timeline page (chrono sorted documents)
// TODO : impl archive index to db
const SEARCH_BAR_STYLE: &str = "w-full px-4 mb-4 py-2 text-white bg-black border-2 border-neon-blue placeholder-neon-blue focus:ring-2 focus:ring-neon-pink focus:outline-none font-mono text-lg";
@@ -193,6 +192,12 @@ pub struct PathSegment {
segments: Vec<String>,
}
impl std::fmt::Display for PathSegment {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.write_str(&self.to_str())
}
}
impl PathSegment {
pub fn to_str(&self) -> String {
self.segments.join("/")
@@ -223,6 +228,33 @@ impl<'r> FromSegments<'r> for PathSegment {
}
}
fn is_older_than(date_str: &str, num_days: usize) -> bool {
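// A document counts as outdated only when strictly more than `num_days` days (UTC) have passed.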
let date =
chrono::NaiveDate::parse_from_str(date_str, "%Y-%m-%d").expect("Invalid date format");
let today = chrono::Utc::now().date_naive();
let diff = today - date;
diff.num_days() > num_days as i64
}
pub async fn redownload(
arc: &WebsiteArchive,
domain: &str,
path: PathSegment,
document: &Document,
shell: &Shell,
) -> Option<DataResponse> {
arc.archive_url(&format!("https://{domain}/{}", path.to_str()))
.await;
let content = document.render_local(None, shell).await?;
Some(DataResponse::new(
content.as_bytes().to_vec(),
"text/html".to_string(),
Some(60 * 60 * 24),
))
}
/// Return archived version of `domain` / `path` at `time`
#[get("/s/<domain>/<path..>?<time>")]
pub async fn render_website(
@@ -234,10 +266,23 @@ pub async fn render_website(
) -> Option<DataResponse> {
let document = arc.get_domain(domain).path(&path.to_str());
if time.is_none() {
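// Freshness is only checked when the caller did not request a specific version.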
let versions = document.versions();
let latest_version = versions.first()?;
if let Some(outdated) = get_config().get_outdated(domain) {
if is_older_than(latest_version, outdated) {
log::info!("Document {domain} / {path} is outdated, redownloading");
return redownload(&arc, domain, path, &document, &shell).await;
}
}
}
let content = document
.render_local(time.map(|time| time.to_string()), &shell)
.await;
// TODO : keep n versions
if let Some(content) = content {
return Some(DataResponse::new(
content.as_bytes().to_vec(),
@@ -245,16 +290,7 @@
Some(60 * 60 * 24),
));
} else if get_config().DOWNLOAD_ON_DEMAND && time.is_none() {
arc.archive_url(&format!("https://{domain}/{}", path.to_str()))
.await;
let content = document.render_local(None, &shell).await?;
return Some(DataResponse::new(
content.as_bytes().to_vec(),
"text/html".to_string(),
Some(60 * 60 * 24),
));
return redownload(&arc, domain, path, &document, &shell).await;
}
None