archive index

This commit is contained in:
JMARyA 2025-02-09 22:03:33 +01:00
parent 3e77ec5008
commit 5b545780a4
Signed by: jmarya
GPG key ID: 901B2ADDF27C2263
7 changed files with 143 additions and 17 deletions

View file

@ -8,6 +8,7 @@ use crate::{
mod document;
mod domain;
use based::get_pg;
pub use document::Document;
pub use domain::*;
@ -88,6 +89,8 @@ pub fn extract_domains(input: &str) -> Vec<String> {
domains
}
// TODO : impl archive index to db
/// Represents a directory containing archived websites
#[derive(Debug, Clone)]
pub struct WebsiteArchive {
@ -231,3 +234,61 @@ fn run_command(cmd: &[&str]) {
let status = child.wait_with_output().unwrap();
assert!(status.status.success());
}
/// Indexes every domain of the archive into the database.
///
/// Each domain listed by `arc.domains()` is resolved through
/// `arc.get_domain` and then walked from its root path (`"/"`) with
/// [`index_path`]. An info-level log line is emitted before and after the
/// whole run.
pub async fn index_archive_db(arc: &WebsiteArchive) {
    log::info!("Indexing archive");

    for domain_name in arc.domains() {
        // Resolve the domain handle first, then walk it starting at the root.
        let domain = arc.get_domain(&domain_name);
        index_path(&domain, "/").await;
    }

    log::info!("Done indexing archive");
}
/// Indexes every document reachable from `path` inside `dom`.
///
/// `dom.paths(p)` yields the entries below `p` together with a flag telling
/// whether `p` itself is a document. Starting at `path`, the tree is walked
/// iteratively with a FIFO queue, and every path flagged as a document is
/// passed to [`index_document`].
///
/// The starting path is indexed under `path` itself. Previously it was always
/// indexed as `"/"`, which was only correct when the walk started at the root.
pub async fn index_path(dom: &Domain, path: &str) {
    let (paths, is_doc) = dom.paths(path);

    // The starting path may itself be a document; index it under its own path.
    if is_doc {
        let doc = dom.path(path);
        index_document(&doc).await;
    }

    // Walk the remaining tree iteratively instead of recursing, seeded with
    // the direct children of the starting path.
    let mut queue = std::collections::VecDeque::new();
    queue.extend(paths);

    while let Some(next_path) = queue.pop_front() {
        let (next_paths, is_doc) = dom.paths(next_path.path());

        if is_doc {
            let doc = dom.path(next_path.path());
            index_document(&doc).await;
        }

        // Children are appended at the back and visited after the entries
        // already queued.
        queue.extend(next_paths);
    }
}
/// Records the dated versions of `doc` in the `document_index` table.
///
/// Every entry of `doc.versions()` that parses as a `%Y-%m-%d` date is
/// inserted as a `(domain, path, version)` row. Entries that fail to parse
/// are skipped, and rows that already exist are left untouched
/// (`ON CONFLICT ... DO NOTHING`).
///
/// # Panics
///
/// Panics if executing the insert statement returns an error.
pub async fn index_document(doc: &Document) {
    // Statement text kept verbatim; only the bound values vary per version.
    const INSERT_VERSION: &str = r#"
INSERT INTO document_index (domain, path, version)
VALUES ($1, $2, $3)
ON CONFLICT (domain, path, version) DO NOTHING
"#;

    let versions = doc.versions();
    for raw_version in &versions {
        // Only versions named after a calendar date are indexed.
        let version = match chrono::NaiveDate::parse_from_str(raw_version, "%Y-%m-%d") {
            Ok(date) => date,
            Err(_) => continue,
        };

        let outcome = sqlx::query(INSERT_VERSION)
            .bind(&doc.domain)
            .bind(&doc.path)
            .bind(version)
            .execute(get_pg!())
            .await;
        outcome.unwrap();
    }
}