fragments
Some checks are pending
ci/woodpecker/push/build Pipeline is pending

This commit is contained in:
JMARyA 2025-02-25 12:39:05 +01:00
parent aba031a047
commit dc4fb4079e
Signed by: jmarya
GPG key ID: 901B2ADDF27C2263
9 changed files with 433 additions and 14 deletions

37
src/archive/fragment.rs Normal file
View file

@@ -0,0 +1,37 @@
use based::get_pg;
/// Look up a stored fragment by its content hash (the `fragments.id` column).
///
/// Returns the raw blob together with its MIME type, or `None` when no
/// fragment with that id exists.
///
/// # Panics
/// Panics if the database query itself fails.
pub async fn get_fragment(hash: &str) -> Option<(Vec<u8>, String)> {
    let query = sqlx::query_as("SELECT blob, mime FROM fragments WHERE id = $1").bind(hash);
    query.fetch_optional(get_pg!()).await.unwrap()
}
/// Pick the id of one fragment uniformly at random.
///
/// # Panics
/// Panics if the `fragments` table is empty or the query fails.
pub async fn get_random_fragment_id() -> String {
    // Destructure the single-column row directly instead of indexing `.0`.
    let (id,): (String,) = sqlx::query_as("SELECT id FROM fragments ORDER BY RANDOM() LIMIT 1")
        .fetch_one(get_pg!())
        .await
        .unwrap();
    id
}
/// Check whether any row in `document_fragments` is recorded for `domain`.
///
/// # Panics
/// Panics if the database query fails.
pub async fn domain_has_fragments(domain: &str) -> bool {
    // `SELECT EXISTS(...)` always yields exactly one boolean row.
    let (found,): (bool,) =
        sqlx::query_as("SELECT EXISTS(SELECT 1 FROM document_fragments WHERE domain = $1)")
            .bind(domain)
            .fetch_one(get_pg!())
            .await
            .unwrap();
    found
}
/// List all fragments referenced by documents of `domain`.
///
/// Returns `(fragment_id, mime)` pairs joined from `document_fragments`
/// and `fragments`.
///
/// # Panics
/// Panics if the database query fails.
pub async fn get_fragments_of_domain(domain: &str) -> Vec<(String, String)> {
    // The query already yields `(String, String)` rows, so the previous
    // `.into_iter().map(|x| (x.0, x.1)).collect()` was an identity
    // re-collection — return the fetched Vec directly.
    sqlx::query_as("SELECT df.fragment, f.mime FROM document_fragments df JOIN fragments f ON df.fragment = f.id WHERE df.domain = $1")
        .bind(domain)
        .fetch_all(get_pg!())
        .await
        .unwrap()
}

View file

@@ -6,12 +6,15 @@ use std::{
use crate::{
blacklist::{check_blacklist, check_blacklist_path},
conf::get_config,
extract_data_urls,
favicon::download_fav_for,
get_mime_type,
get_mime_type, sha256_hash,
};
mod document;
mod domain;
mod fragment;
use based::{
get_pg,
ui::{components::prelude::Shell, prelude::Nothing},
@@ -19,6 +22,8 @@ use based::{
use chrono::NaiveDate;
pub use document::Document;
pub use domain::*;
pub use fragment::*;
use sqlx::prelude::FromRow;
/// Read directory entries into `Vec<String>`
pub fn read_dir(dir: &PathBuf) -> Vec<String> {
@@ -251,8 +256,6 @@ fn run_command(cmd: &[&str]) {
}
pub async fn index_archive_db(arc: &WebsiteArchive) {
// TODO : more index attrs size,mime
log::info!("Indexing archive");
for dom in arc.domains() {
@@ -304,7 +307,27 @@ pub async fn index_document(doc: &Document) {
if mime.as_str() == "text/html" {
// TODO : domain links index
// TODO : data fragments
let mut hashes = Vec::new();
for (mime, data) in extract_data_urls(&String::from_utf8_lossy(&content)) {
let hash = sha256_hash(&data);
hashes.push(hash.clone());
sqlx::query("INSERT INTO fragments (id, mime, blob) VALUES ($1, $2, $3) ON CONFLICT DO NOTHING")
.bind(&hash)
.bind(&mime)
.bind(data)
.execute(get_pg!()).await.unwrap();
}
for hash in hashes {
sqlx::query("INSERT INTO document_fragments (domain, path, version, fragment) VALUES ($1, $2, $3, $4) ON CONFLICT DO NOTHING")
.bind(&doc.domain)
.bind(&doc.path)
.bind(chrono::NaiveDate::parse_from_str(version_str, "%Y-%m-%d").unwrap())
.bind(&hash)
.execute(get_pg!()).await.unwrap();
}
}
if let Ok(version) = chrono::NaiveDate::parse_from_str(version_str, "%Y-%m-%d") {
@@ -328,9 +351,25 @@ pub async fn index_document(doc: &Document) {
}
}
pub struct DocumentIndex {}
/// One row of the `document_index` table: a single archived document
/// version plus basic metadata about the stored content.
#[derive(Debug, Clone, FromRow)]
pub struct DocumentIndex {
    /// Domain the document belongs to.
    pub domain: String,
    /// Path of the document within the domain.
    pub path: String,
    /// Snapshot date of this archived version.
    pub version: chrono::NaiveDate,
    /// Size of the stored content (Postgres BIGINT, hence `i64`).
    pub size: i64,
    /// MIME type of the stored content (e.g. `text/html`).
    pub mime: String,
}
impl DocumentIndex {
pub fn url(&self) -> String {
format!(
"/s/{}/{}?time={}",
self.domain,
self.path,
self.version.to_string()
)
}
pub async fn get_documents_of_day(
day: NaiveDate,
domain: Option<&str>,
@@ -361,4 +400,12 @@ impl DocumentIndex {
ret
}
/// Fetch every indexed document of `domain` whose MIME type is not
/// `text/html` (images, PDFs, downloads, ...).
///
/// # Panics
/// Panics if the database query fails.
pub async fn get_documents_of_other_mime(domain: &str) -> Vec<DocumentIndex> {
    let query =
        sqlx::query_as("SELECT * FROM document_index WHERE mime != 'text/html' AND domain = $1");
    query.bind(domain).fetch_all(get_pg!()).await.unwrap()
}
}