This commit is contained in:
parent
aba031a047
commit
dc4fb4079e
9 changed files with 433 additions and 14 deletions
37
src/archive/fragment.rs
Normal file
37
src/archive/fragment.rs
Normal file
|
@ -0,0 +1,37 @@
|
|||
use based::get_pg;
|
||||
|
||||
pub async fn get_fragment(hash: &str) -> Option<(Vec<u8>, String)> {
|
||||
sqlx::query_as("SELECT blob, mime FROM fragments WHERE id = $1")
|
||||
.bind(hash)
|
||||
.fetch_optional(get_pg!())
|
||||
.await
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
pub async fn get_random_fragment_id() -> String {
|
||||
let random_id: (String,) = sqlx::query_as("SELECT id FROM fragments ORDER BY RANDOM() LIMIT 1")
|
||||
.fetch_one(get_pg!())
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
random_id.0
|
||||
}
|
||||
|
||||
pub async fn domain_has_fragments(domain: &str) -> bool {
|
||||
let exists: (bool,) =
|
||||
sqlx::query_as("SELECT EXISTS(SELECT 1 FROM document_fragments WHERE domain = $1)")
|
||||
.bind(domain)
|
||||
.fetch_one(get_pg!())
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
exists.0
|
||||
}
|
||||
|
||||
pub async fn get_fragments_of_domain(domain: &str) -> Vec<(String, String)> {
|
||||
let res: Vec<(String, String)> = sqlx::query_as("SELECT df.fragment, f.mime FROM document_fragments df JOIN fragments f ON df.fragment = f.id WHERE df.domain = $1")
|
||||
.bind(domain)
|
||||
.fetch_all(get_pg!()).await.unwrap();
|
||||
|
||||
res.into_iter().map(|x| (x.0, x.1)).collect()
|
||||
}
|
|
@ -6,12 +6,15 @@ use std::{
|
|||
use crate::{
|
||||
blacklist::{check_blacklist, check_blacklist_path},
|
||||
conf::get_config,
|
||||
extract_data_urls,
|
||||
favicon::download_fav_for,
|
||||
get_mime_type,
|
||||
get_mime_type, sha256_hash,
|
||||
};
|
||||
|
||||
mod document;
|
||||
mod domain;
|
||||
mod fragment;
|
||||
|
||||
use based::{
|
||||
get_pg,
|
||||
ui::{components::prelude::Shell, prelude::Nothing},
|
||||
|
@ -19,6 +22,8 @@ use based::{
|
|||
use chrono::NaiveDate;
|
||||
pub use document::Document;
|
||||
pub use domain::*;
|
||||
pub use fragment::*;
|
||||
use sqlx::prelude::FromRow;
|
||||
|
||||
/// Read directory entries into `Vec<String>`
|
||||
pub fn read_dir(dir: &PathBuf) -> Vec<String> {
|
||||
|
@ -251,8 +256,6 @@ fn run_command(cmd: &[&str]) {
|
|||
}
|
||||
|
||||
pub async fn index_archive_db(arc: &WebsiteArchive) {
|
||||
// TODO : more index attrs size,mime
|
||||
|
||||
log::info!("Indexing archive");
|
||||
|
||||
for dom in arc.domains() {
|
||||
|
@ -304,7 +307,27 @@ pub async fn index_document(doc: &Document) {
|
|||
|
||||
if mime.as_str() == "text/html" {
|
||||
// TODO : domain links index
|
||||
// TODO : data fragments
|
||||
|
||||
let mut hashes = Vec::new();
|
||||
|
||||
for (mime, data) in extract_data_urls(&String::from_utf8_lossy(&content)) {
|
||||
let hash = sha256_hash(&data);
|
||||
hashes.push(hash.clone());
|
||||
sqlx::query("INSERT INTO fragments (id, mime, blob) VALUES ($1, $2, $3) ON CONFLICT DO NOTHING")
|
||||
.bind(&hash)
|
||||
.bind(&mime)
|
||||
.bind(data)
|
||||
.execute(get_pg!()).await.unwrap();
|
||||
}
|
||||
|
||||
for hash in hashes {
|
||||
sqlx::query("INSERT INTO document_fragments (domain, path, version, fragment) VALUES ($1, $2, $3, $4) ON CONFLICT DO NOTHING")
|
||||
.bind(&doc.domain)
|
||||
.bind(&doc.path)
|
||||
.bind(chrono::NaiveDate::parse_from_str(version_str, "%Y-%m-%d").unwrap())
|
||||
.bind(&hash)
|
||||
.execute(get_pg!()).await.unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
if let Ok(version) = chrono::NaiveDate::parse_from_str(version_str, "%Y-%m-%d") {
|
||||
|
@ -328,9 +351,25 @@ pub async fn index_document(doc: &Document) {
|
|||
}
|
||||
}
|
||||
|
||||
pub struct DocumentIndex {}
|
||||
#[derive(Debug, Clone, FromRow)]
|
||||
pub struct DocumentIndex {
|
||||
pub domain: String,
|
||||
pub path: String,
|
||||
pub version: chrono::NaiveDate,
|
||||
pub size: i64,
|
||||
pub mime: String,
|
||||
}
|
||||
|
||||
impl DocumentIndex {
|
||||
pub fn url(&self) -> String {
|
||||
format!(
|
||||
"/s/{}/{}?time={}",
|
||||
self.domain,
|
||||
self.path,
|
||||
self.version.to_string()
|
||||
)
|
||||
}
|
||||
|
||||
pub async fn get_documents_of_day(
|
||||
day: NaiveDate,
|
||||
domain: Option<&str>,
|
||||
|
@ -361,4 +400,12 @@ impl DocumentIndex {
|
|||
|
||||
ret
|
||||
}
|
||||
|
||||
pub async fn get_documents_of_other_mime(domain: &str) -> Vec<DocumentIndex> {
|
||||
sqlx::query_as("SELECT * FROM document_index WHERE mime != 'text/html' AND domain = $1")
|
||||
.bind(domain)
|
||||
.fetch_all(get_pg!())
|
||||
.await
|
||||
.unwrap()
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue