✨ archive index
commit 5b545780a4
parent 3e77ec5008
7 changed files with 143 additions and 17 deletions
@@ -12,6 +12,12 @@ BLACKLIST_DOMAINS = [
     "youtube" # YouTube
 ]
 
+# Consider a refresh after interval (in days) (Global)
+outdated = 3
+
+# Keep last n versions (Global)
+keep_n = 5
+
 # Domain configuration (Example)
 [[websites.domains]]
 # The domain the config applies to
migrations/0002_doc_index.sql (new file, 7 lines)
@@ -0,0 +1,7 @@
+CREATE TABLE IF NOT EXISTS document_index (
+    domain TEXT NOT NULL,
+    path TEXT NOT NULL,
+    version DATE NOT NULL,
+    PRIMARY KEY (domain, path, version)
+);
+
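For context, a minimal sketch of how the new table can be read back, assuming the same sqlx/Postgres pool exposed by `based::get_pg!()` that `index_document` below uses; `fetch_versions` is a hypothetical helper, not part of this commit:

```rust
use based::get_pg;

// Hypothetical helper: list the indexed versions of one (domain, path)
// pair, newest first. Assumes sqlx's `chrono` feature so the DATE
// column decodes into chrono::NaiveDate.
pub async fn fetch_versions(domain: &str, path: &str) -> Vec<chrono::NaiveDate> {
    sqlx::query_scalar(
        "SELECT version FROM document_index
         WHERE domain = $1 AND path = $2
         ORDER BY version DESC",
    )
    .bind(domain)
    .bind(path)
    .fetch_all(get_pg!())
    .await
    .unwrap_or_default()
}
```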
@@ -8,6 +8,7 @@ use crate::{
 
 mod document;
 mod domain;
+use based::get_pg;
 pub use document::Document;
 pub use domain::*;
 
@@ -88,6 +89,8 @@ pub fn extract_domains(input: &str) -> Vec<String> {
     domains
 }
 
+// TODO : impl archive index to db
+
 /// Represents a directory containing archived websites
 #[derive(Debug, Clone)]
 pub struct WebsiteArchive {
@ -231,3 +234,61 @@ fn run_command(cmd: &[&str]) {
|
||||||
let status = child.wait_with_output().unwrap();
|
let status = child.wait_with_output().unwrap();
|
||||||
assert!(status.status.success());
|
assert!(status.status.success());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub async fn index_archive_db(arc: &WebsiteArchive) {
|
||||||
|
log::info!("Indexing archive");
|
||||||
|
|
||||||
|
for dom in arc.domains() {
|
||||||
|
let dom = arc.get_domain(&dom);
|
||||||
|
index_path(&dom, "/").await;
|
||||||
|
}
|
||||||
|
|
||||||
|
log::info!("Done indexing archive");
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn index_path(dom: &Domain, path: &str) {
|
||||||
|
let (paths, is_doc) = dom.paths(path);
|
||||||
|
|
||||||
|
// If the path is a document, process the root path.
|
||||||
|
if is_doc {
|
||||||
|
let doc = dom.path("/");
|
||||||
|
index_document(&doc).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create a queue to process paths iteratively
|
||||||
|
let mut queue = std::collections::VecDeque::new();
|
||||||
|
|
||||||
|
// Add the initial paths to the queue
|
||||||
|
queue.extend(paths);
|
||||||
|
|
||||||
|
while let Some(next_path) = queue.pop_front() {
|
||||||
|
let (next_paths, is_doc) = dom.paths(next_path.path());
|
||||||
|
|
||||||
|
if is_doc {
|
||||||
|
let doc = dom.path(next_path.path());
|
||||||
|
index_document(&doc).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
queue.extend(next_paths);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn index_document(doc: &Document) {
|
||||||
|
for version_str in &doc.versions() {
|
||||||
|
if let Ok(version) = chrono::NaiveDate::parse_from_str(version_str, "%Y-%m-%d") {
|
||||||
|
sqlx::query(
|
||||||
|
r#"
|
||||||
|
INSERT INTO document_index (domain, path, version)
|
||||||
|
VALUES ($1, $2, $3)
|
||||||
|
ON CONFLICT (domain, path, version) DO NOTHING
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(&doc.domain)
|
||||||
|
.bind(&doc.path)
|
||||||
|
.bind(version)
|
||||||
|
.execute(get_pg!())
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
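The `// TODO : keep n versions` left in the route code below hints at pruning; one way the new index could support it is to select everything beyond the newest `keep_n` versions as deletion candidates. A hypothetical sketch, not part of this commit:

```rust
use based::get_pg;

// Hypothetical pruning query (not in this commit): versions of a
// document beyond the newest `keep_n`, as candidates for cleanup.
pub async fn versions_to_prune(domain: &str, path: &str, keep_n: i64) -> Vec<chrono::NaiveDate> {
    sqlx::query_scalar(
        "SELECT version FROM document_index
         WHERE domain = $1 AND path = $2
         ORDER BY version DESC
         OFFSET $3",
    )
    .bind(domain)
    .bind(path)
    .bind(keep_n)
    .fetch_all(get_pg!())
    .await
    .unwrap_or_default()
}
```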
src/conf.rs (16 changes)
@@ -49,6 +49,20 @@ impl Config {
 
         None
     }
 
+    pub fn get_outdated(&self, domain: &str) -> Option<usize> {
+        if let Some(conf) = self.get_domain_config(domain) {
+            if let Some(outdated) = conf.outdated {
+                return Some(outdated);
+            }
+        }
+
+        if let Some(outdated) = self.websites.as_ref().map(|x| x.outdated) {
+            return outdated;
+        }
+
+        None
+    }
 }
 
 #[allow(non_snake_case)]
@@ -61,6 +75,8 @@ pub struct AIConfig {
 #[derive(Debug, Clone, Deserialize)]
 pub struct WebsiteConfig {
     pub BLACKLIST_DOMAINS: Option<Vec<String>>,
+    pub outdated: Option<usize>,
+    pub keep_n: Option<usize>,
     pub domains: Option<Vec<DomainConfig>>,
 }
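A quick illustration of the lookup order `get_outdated` implements, with hypothetical values: a per-domain `outdated` setting wins over the global `websites.outdated`.

```rust
// Hypothetical: global `outdated = 3`, and "example.com" overriding it with 7.
fn demo(config: &webarc::conf::Config) {
    assert_eq!(config.get_outdated("example.com"), Some(7)); // domain override
    assert_eq!(config.get_outdated("other.org"), Some(3)); // global fallback
}
```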
@@ -1,6 +1,6 @@
 use based::request::{RequestContext, StringResponse};
 use based::ui::components::Shell;
-use maud::{html, PreEscaped};
+use maud::PreEscaped;
 
 pub mod ai;
 pub mod archive;
@@ -3,9 +3,8 @@ use based::get_pg;
 use based::ui::components::Shell;
 use based::ui::prelude::*;
 use rocket::routes;
-use rocket::time::format_description::modifier::Padding;
 use webarc::ai::EmbedStore;
-use webarc::archive::WebsiteArchive;
+use webarc::archive::{index_archive_db, WebsiteArchive};
 use webarc::conf::{get_config, load_config, load_default_config};
 
 mod args;
@@ -45,6 +44,9 @@ async fn main() {
         webarc::favicon::download_favicons_for_sites(&archive.domains()).await;
     });
 
+    let archive = arc.clone();
+    tokio::spawn(async move { index_archive_db(&archive).await });
+
     rocket::build()
         .mount_assets()
         .mount(
@@ -189,6 +191,4 @@ pub fn get_shell() -> Shell {
         .use_ui()
 }
 
-// TODO : redownload after threshold
-// TODO : keep n versions
 // TODO : archive cleanup code
@@ -1,6 +1,5 @@
 use std::{io::Read, path::PathBuf, sync::Arc};
 
-use based::ui::prelude::*;
 use based::{
     request::{
         api::GeneratedPager, assets::DataResponse, respond_json, RequestContext, StringResponse,
@@ -14,6 +13,7 @@ pub mod component;
 use component::*;
 use serde_json::json;
 
+use webarc::archive::Document;
 use webarc::{
     ai::{generate_embedding, remove_data_urls, EmbedStore, SearchResult},
     archive::{extract_domains, WebsiteArchive},
@@ -22,7 +22,6 @@ use webarc::{
 };
 
 // TODO : Implement archive timeline page (chrono sorted documents)
-// TODO : impl archive index to db
 
 const SEARCH_BAR_STYLE: &str = "w-full px-4 mb-4 py-2 text-white bg-black border-2 border-neon-blue placeholder-neon-blue focus:ring-2 focus:ring-neon-pink focus:outline-none font-mono text-lg";
 
@@ -193,6 +192,12 @@ pub struct PathSegment {
     segments: Vec<String>,
 }
 
+impl std::fmt::Display for PathSegment {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(&self.to_str())
+    }
+}
+
 impl PathSegment {
     pub fn to_str(&self) -> String {
         self.segments.join("/")
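The `Display` impl is what lets `render_website` below interpolate a `PathSegment` directly into its log message. A minimal illustration (module-internal, since `segments` is a private field; the value is made up):

```rust
// Inside this module, for illustration only:
let path = PathSegment {
    segments: vec!["blog".to_string(), "post1".to_string()],
};
assert_eq!(format!("{path}"), "blog/post1"); // uses the new Display impl
```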
@@ -223,6 +228,33 @@ impl<'r> FromSegments<'r> for PathSegment {
     }
 }
 
+fn is_older_than(date_str: &str, num_days: usize) -> bool {
+    let date =
+        chrono::NaiveDate::parse_from_str(date_str, "%Y-%m-%d").expect("Invalid date format");
+    let threshold_date = chrono::Utc::now().date_naive();
+    let diff = threshold_date - date;
+    diff.num_days() > num_days as i64
+}
+
+pub async fn redownload(
+    arc: &WebsiteArchive,
+    domain: &str,
+    path: PathSegment,
+    document: &Document,
+    shell: &Shell,
+) -> Option<DataResponse> {
+    arc.archive_url(&format!("https://{domain}/{}", path.to_str()))
+        .await;
+
+    let content = document.render_local(None, &shell).await?;
+
+    return Some(DataResponse::new(
+        content.as_bytes().to_vec(),
+        "text/html".to_string(),
+        Some(60 * 60 * 24),
+    ));
+}
+
 /// Return archived version of `domain` / `path` at `time`
 #[get("/s/<domain>/<path..>?<time>")]
 pub async fn render_website(
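Note that `is_older_than` uses a strict greater-than, so a snapshot exactly `num_days` old still counts as fresh. With hypothetical dates:

```rust
// Assuming today is 2024-05-10, "2024-05-01" is 9 days old:
assert!(is_older_than("2024-05-01", 3)); // 9 > 3: outdated
assert!(!is_older_than("2024-05-01", 9)); // 9 > 9 is false: still fresh
```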
@@ -234,10 +266,23 @@ pub async fn render_website(
 ) -> Option<DataResponse> {
     let document = arc.get_domain(domain).path(&path.to_str());
 
+    if time.is_none() {
+        let versions = document.versions();
+        let latest_version = versions.first()?;
+        if let Some(outdated) = get_config().get_outdated(domain) {
+            if is_older_than(latest_version, outdated) {
+                log::info!("Document {domain} / {path} is outdated, redownloading");
+                return redownload(&arc, domain, path, &document, &shell).await;
+            }
+        }
+    }
+
     let content = document
         .render_local(time.map(|time| time.to_string()), &shell)
         .await;
 
+    // TODO : keep n versions
+
     if let Some(content) = content {
         return Some(DataResponse::new(
             content.as_bytes().to_vec(),
@@ -245,16 +290,7 @@ pub async fn render_website(
             Some(60 * 60 * 24),
         ));
     } else if get_config().DOWNLOAD_ON_DEMAND && time.is_none() {
-        arc.archive_url(&format!("https://{domain}/{}", path.to_str()))
-            .await;
-
-        let content = document.render_local(None, &shell).await?;
-
-        return Some(DataResponse::new(
-            content.as_bytes().to_vec(),
-            "text/html".to_string(),
-            Some(60 * 60 * 24),
-        ));
+        return redownload(&arc, domain, path, &document, &shell).await;
     }
 
     None