✨ archive index
This commit is contained in:
parent
3e77ec5008
commit
5b545780a4
7 changed files with 143 additions and 17 deletions
|
@ -8,6 +8,7 @@ use crate::{
|
|||
|
||||
mod document;
|
||||
mod domain;
|
||||
use based::get_pg;
|
||||
pub use document::Document;
|
||||
pub use domain::*;
|
||||
|
||||
|
@ -88,6 +89,8 @@ pub fn extract_domains(input: &str) -> Vec<String> {
|
|||
domains
|
||||
}
|
||||
|
||||
// TODO : impl archive index to db
|
||||
|
||||
/// Represents a directory containing archived websites
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct WebsiteArchive {
|
||||
|
@ -231,3 +234,61 @@ fn run_command(cmd: &[&str]) {
|
|||
let status = child.wait_with_output().unwrap();
|
||||
assert!(status.status.success());
|
||||
}
|
||||
|
||||
pub async fn index_archive_db(arc: &WebsiteArchive) {
|
||||
log::info!("Indexing archive");
|
||||
|
||||
for dom in arc.domains() {
|
||||
let dom = arc.get_domain(&dom);
|
||||
index_path(&dom, "/").await;
|
||||
}
|
||||
|
||||
log::info!("Done indexing archive");
|
||||
}
|
||||
|
||||
pub async fn index_path(dom: &Domain, path: &str) {
|
||||
let (paths, is_doc) = dom.paths(path);
|
||||
|
||||
// If the path is a document, process the root path.
|
||||
if is_doc {
|
||||
let doc = dom.path("/");
|
||||
index_document(&doc).await;
|
||||
}
|
||||
|
||||
// Create a queue to process paths iteratively
|
||||
let mut queue = std::collections::VecDeque::new();
|
||||
|
||||
// Add the initial paths to the queue
|
||||
queue.extend(paths);
|
||||
|
||||
while let Some(next_path) = queue.pop_front() {
|
||||
let (next_paths, is_doc) = dom.paths(next_path.path());
|
||||
|
||||
if is_doc {
|
||||
let doc = dom.path(next_path.path());
|
||||
index_document(&doc).await;
|
||||
}
|
||||
|
||||
queue.extend(next_paths);
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn index_document(doc: &Document) {
|
||||
for version_str in &doc.versions() {
|
||||
if let Ok(version) = chrono::NaiveDate::parse_from_str(version_str, "%Y-%m-%d") {
|
||||
sqlx::query(
|
||||
r#"
|
||||
INSERT INTO document_index (domain, path, version)
|
||||
VALUES ($1, $2, $3)
|
||||
ON CONFLICT (domain, path, version) DO NOTHING
|
||||
"#,
|
||||
)
|
||||
.bind(&doc.domain)
|
||||
.bind(&doc.path)
|
||||
.bind(version)
|
||||
.execute(get_pg!())
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue