webarc/src/archive/mod.rs
use std::{
collections::{HashMap, HashSet},
path::PathBuf,
};
use crate::{
blacklist::{check_blacklist, check_blacklist_path},
conf::get_config,
extract_data_urls,
favicon::download_fav_for,
get_mime_type, sha256_hash,
};
mod document;
mod domain;
mod fragment;
use based::{
get_pg,
ui::{components::prelude::Shell, prelude::Nothing},
};
use chrono::NaiveDate;
pub use document::Document;
pub use domain::*;
pub use fragment::*;
use sqlx::prelude::FromRow;
/// Read directory entries into `Vec<String>`
pub fn read_dir(dir: &PathBuf) -> Vec<String> {
let mut list = Vec::new();
if let Ok(entries) = std::fs::read_dir(dir) {
for entry in entries.flatten() {
if let Some(file_name) = entry.file_name().to_str() {
list.push(file_name.to_string());
}
}
}
list
}
/// Rewrite all URLs in `input` to the format `/s/<domain>/<path..>`
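///
/// # Example
/// A minimal sketch (not run as a doc-test), assuming `example.com` is not blacklisted;
/// `fallback.org` is an arbitrary base domain, only used for relative URLs:
/// ```ignore
/// let html = r#"<a href="https://example.com/page">link</a>"#;
/// let rewritten = internalize_urls(html, "fallback.org");
/// assert_eq!(rewritten, r#"<a href="/s/example.com/page">link</a>"#);
/// ```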
pub fn internalize_urls(input: &str, base: &str) -> String {
// TODO: fix regex, domains without a path are not captured
let url_pattern = r#"(\ |"|')(?:(<?)(https?:\/\/([a-zA-Z0-9.-]+))?(\/[\w./-]*))"#;
let re = regex::Regex::new(url_pattern).unwrap();
re.replace_all(input, |caps: &regex::Captures| {
if caps.get(2).map(|x| x.as_str()).unwrap_or_default() == "<" {
return caps.get(0).unwrap().as_str().to_string();
}
if caps.get(0).unwrap().as_str() == " //" {
return " //".to_string();
}
let wrap = caps.get(1).map(|x| x.as_str()).unwrap_or_default();
if let Some(domain) = caps.get(3) {
let domain = domain.as_str();
let (protocol, domain) = if domain.starts_with("https://") {
("https", domain.trim_start_matches("https://"))
} else {
("http", domain.trim_start_matches("http://"))
};
let domain = domain.trim_start_matches("www.");
let path = caps.get(5).map_or("", |m| m.as_str());
// Skip transformation if the domain is in the blacklist
if check_blacklist(domain) {
format!("{wrap}{protocol}://{domain}{path}")
} else {
format!("{wrap}/s/{domain}{path}")
}
} else if let Some(path) = caps.get(5) {
// Handle relative paths
format!("{wrap}/s/{base}{}", path.as_str())
} else {
// Default fallback
caps[0].to_string()
}
})
.to_string()
}
/// Extract all domains
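///
/// # Example
/// A minimal sketch (not run as a doc-test); domains are deduplicated, stripped of `www.` and sorted:
/// ```ignore
/// let domains = extract_domains("see https://www.example.com/a and http://foo.org/b");
/// assert_eq!(domains, vec!["example.com".to_string(), "foo.org".to_string()]);
/// ```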
pub fn extract_domains(input: &str) -> Vec<String> {
let url_pattern = r"https?://([a-zA-Z0-9.-]+)(/[\w./-]*)?";
let re = regex::Regex::new(url_pattern).unwrap();
let mut domains = HashSet::new();
for caps in re.captures_iter(input) {
let domain = caps[1].trim_start_matches("www.");
domains.insert(domain.to_string());
}
let mut domains: Vec<_> = domains.into_iter().collect();
domains.sort();
domains
}
// TODO: implement archive index in the database
/// Represents a directory containing archived websites
#[derive(Debug, Clone)]
pub struct WebsiteArchive {
pub dir: PathBuf,
}
impl WebsiteArchive {
/// Creates a new `WebsiteArchive` instance.
///
/// # Parameters
/// - `dir`: The directory path where the archive will be stored.
///
/// # Returns
/// A new `WebsiteArchive` instance.
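///
/// # Example
/// A minimal sketch (not run as a doc-test); `./archive` is an arbitrary example directory:
/// ```ignore
/// let archive = WebsiteArchive::new("./archive");
/// for domain in archive.domains() {
///     println!("archived domain: {domain}");
/// }
/// ```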
pub fn new(dir: &str) -> Self {
Self {
dir: PathBuf::from(dir),
}
}
/// Retrieves the list of domain names stored in the archive.
///
/// # Returns
/// A vector of domain names as strings.
pub fn domains(&self) -> Vec<String> {
read_dir(&self.dir)
}
/// Retrieves a `Domain` instance for a specified domain name.
///
/// # Parameters
/// - `domain`: The name of the domain to retrieve.
///
/// # Returns
/// A `Domain` instance corresponding to the specified domain.
pub fn get_domain(&self, domain: &str) -> Domain {
Domain::new(domain, self.dir.join(domain))
}
/// Archives a URL by downloading and storing its content.
///
/// If the URL does not pass the blacklist check, it will not be archived.
///
/// # Parameters
/// - `url`: The URL to archive.
///
/// This function downloads the content of the URL, processes it, and saves it to the archive.
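///
/// # Example
/// A minimal sketch (not run as a doc-test); requires the `monolith` binary on `PATH`
/// and a configured database, and the URL is an arbitrary example:
/// ```ignore
/// let archive = WebsiteArchive::new("./archive");
/// archive.archive_url("https://example.com/some/page").await;
/// ```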
pub async fn archive_url(&self, url: &str) {
let parsed_url = url::Url::parse(url).unwrap();
let domain = parsed_url.domain().unwrap().trim_start_matches("www.");
// Deny blacklist
if check_blacklist(domain) {
return;
}
let path = parsed_url.path();
if check_blacklist_path(domain, path) {
return;
}
let mut folder_name = self.dir.join(domain);
download_fav_for(domain).await;
for segment in path.split('/') {
let segment = url_escape::decode(segment).to_string();
if !segment.is_empty() {
folder_name = folder_name.join(segment);
}
}
std::fs::create_dir_all(&folder_name).unwrap();
let timestamp = chrono::Utc::now().format("%Y-%m-%d").to_string();
let filename = folder_name.join(format!("index_{timestamp}"));
log::info!("Archiving {url} to {}", filename.to_str().unwrap());
let conf = get_config()
.get_domain_config(domain)
.cloned()
.unwrap_or_default();
let mut cmd = vec!["monolith", "--isolate", "-o", filename.to_str().unwrap()];
if conf.no_audio.unwrap_or_default() {
cmd.push("--no-audio");
}
if conf.no_css.unwrap_or_default() {
cmd.push("--no-css");
}
if conf.no_frames.unwrap_or_default() {
cmd.push("--no-frames");
}
if conf.no_fonts.unwrap_or_default() {
cmd.push("--no-fonts");
}
if conf.no_image.unwrap_or_default() {
cmd.push("--no-images");
}
if conf.no_javascript.unwrap_or_default() {
cmd.push("--no-js");
cmd.push("--unwrap-noscript");
}
if conf.no_video.unwrap_or_default() {
cmd.push("--no-video");
}
if let Some(ua) = &conf.user_agent {
cmd.push("--user-agent");
cmd.push(ua.as_str());
}
let mut url = url::Url::parse(&format!("https://{domain}")).unwrap();
url = url.join(path).unwrap();
let url = url.to_string();
cmd.push(&url);
run_command(&cmd);
index_path(&self.get_domain(domain), path).await;
}
}
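/// Run an external command with inherited stdout/stderr, logging a warning if it exits with a non-zero status.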
fn run_command(cmd: &[&str]) {
let mut cmd_setup = std::process::Command::new(cmd[0]);
let cmd_setup = cmd_setup
.args(&cmd[1..])
.stdout(std::process::Stdio::inherit())
.stderr(std::process::Stdio::inherit());
let child = cmd_setup.spawn().unwrap();
let status = child.wait_with_output().unwrap();
if !status.status.success() {
log::warn!(
"Command {cmd:?} exited with code {}",
status.status.code().unwrap_or_default()
)
}
}
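/// Walk every domain in the archive and index all of its documents into the database.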
pub async fn index_archive_db(arc: &WebsiteArchive) {
log::info!("Indexing archive");
for dom in arc.domains() {
let dom = arc.get_domain(&dom);
index_path(&dom, "/").await;
}
log::info!("Done indexing archive");
}
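/// Index all documents under `path` for the given domain, traversing sub-paths iteratively with a breadth-first queue.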
pub async fn index_path(dom: &Domain, path: &str) {
let (paths, is_doc) = dom.paths(path);
// If the path is a document, process the root path.
if is_doc {
let doc = dom.path("/");
index_document(&doc).await;
}
// Create a queue to process paths iteratively
let mut queue = std::collections::VecDeque::new();
// Add the initial paths to the queue
queue.extend(paths);
while let Some(next_path) = queue.pop_front() {
let (next_paths, is_doc) = dom.paths(next_path.path());
if is_doc {
let doc = dom.path(next_path.path());
log::info!(
"Indexing {} / {} [{} queued]",
doc.domain,
doc.path,
queue.len()
);
index_document(&doc).await;
}
queue.extend(next_paths);
}
}
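/// Index every archived version of a document: for HTML documents, extract and store embedded
/// data-URL fragments, then record size and MIME type in `document_index`.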
pub async fn index_document(doc: &Document) {
for version_str in &doc.versions() {
let domain = &doc.domain;
let path = &doc.path;
let version =
if let Ok(version) = chrono::NaiveDate::parse_from_str(version_str, "%Y-%m-%d") {
version
} else {
log::error!(
"Could not parse version {version_str} as valid date for {} / {}",
domain,
path
);
continue;
};
if DocumentIndex::exists(domain, path, &version).await {
log::info!(
"Document {} / {} @ {} already indexed",
domain,
path,
version
);
continue;
}
if let Ok(content) = doc
.render_local(
Some(version_str.to_string()),
&Shell::new(Nothing(), Nothing(), Nothing()),
)
.await
{
let size = content.len();
let mime = get_mime_type(&content).unwrap_or("text/html".to_string());
if mime.as_str() == "text/html" {
// TODO: domain links index
let mut hashes = Vec::new();
for (mime, data) in extract_data_urls(&String::from_utf8_lossy(&content)) {
let hash = sha256_hash(&data);
log::info!("{} / {}: Indexing fragment {hash}", doc.domain, doc.path);
hashes.push(hash.clone());
sqlx::query("INSERT INTO fragments (id, mime, blob) VALUES ($1, $2, $3) ON CONFLICT DO NOTHING")
.bind(&hash)
.bind(&mime)
.bind(data)
.execute(get_pg!()).await.unwrap();
}
for hash in hashes {
sqlx::query("INSERT INTO document_fragments (domain, path, version, fragment) VALUES ($1, $2, $3, $4) ON CONFLICT DO NOTHING")
.bind(&doc.domain)
.bind(&doc.path)
.bind(version)
.bind(&hash)
.execute(get_pg!()).await.unwrap();
}
}
// `version` was already parsed and validated at the top of the loop.
sqlx::query(
r#"
INSERT INTO document_index (domain, path, version, size, mime)
VALUES ($1, $2, $3, $4, $5)
ON CONFLICT (domain, path, version) DO NOTHING
"#,
)
.bind(&doc.domain)
.bind(&doc.path)
.bind(version)
.bind(size as i64)
.bind(mime)
.execute(get_pg!())
.await
.unwrap();
}
}
}
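/// A row of the `document_index` table describing one archived document version.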
#[derive(Debug, Clone, FromRow)]
pub struct DocumentIndex {
pub domain: String,
pub path: String,
pub version: chrono::NaiveDate,
pub size: i64,
pub mime: String,
}
impl DocumentIndex {
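/// Check whether a document version is already present in the index.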
pub async fn exists(domain: &str, path: &str, version: &chrono::NaiveDate) -> bool {
let res: Option<Self> = sqlx::query_as(
"SELECT * FROM document_index WHERE domain = $1 AND path = $2 AND version = $3",
)
.bind(domain)
.bind(path)
.bind(version)
.fetch_optional(get_pg!())
.await
.unwrap();
res.is_some()
}
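/// Build the internal URL under which this document version is served.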
pub fn url(&self) -> String {
format!("/s/{}/{}?time={}", self.domain, self.path, self.version)
}
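/// Return all documents archived on `day`, optionally filtered by domain, grouped as domain -> paths.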
pub async fn get_documents_of_day(
day: NaiveDate,
domain: Option<&str>,
) -> HashMap<String, Vec<String>> {
let res: Vec<(String, String)> = if let Some(domain) = domain {
sqlx::query_as(
"SELECT domain, path FROM document_index WHERE version = $1 WHERE domain = $2",
)
.bind(day)
.bind(domain)
.fetch_all(get_pg!())
.await
.unwrap()
} else {
sqlx::query_as("SELECT domain, path FROM document_index WHERE version = $1")
.bind(day)
.fetch_all(get_pg!())
.await
.unwrap()
};
let mut ret = HashMap::new();
for (domain, path) in res {
let d: &mut Vec<String> = ret.entry(domain).or_default();
d.push(path);
}
ret
}
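/// Return all indexed documents of a domain whose MIME type is not `text/html`.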
pub async fn get_documents_of_other_mime(domain: &str) -> Vec<DocumentIndex> {
sqlx::query_as("SELECT * FROM document_index WHERE mime != 'text/html' AND domain = $1")
.bind(domain)
.fetch_all(get_pg!())
.await
.unwrap()
}
}