use std::{
    collections::{HashMap, HashSet},
    path::PathBuf,
};

use crate::{
    blacklist::{check_blacklist, check_blacklist_path},
    conf::get_config,
    extract_data_urls,
    favicon::download_fav_for,
    get_mime_type, sha256_hash,
};

mod document;
mod domain;
mod fragment;

use based::{
    get_pg,
    ui::{components::prelude::Shell, prelude::Nothing},
};
use chrono::NaiveDate;
pub use document::Document;
pub use domain::*;
pub use fragment::*;
use sqlx::prelude::FromRow;

/// Read directory entries into `Vec<String>`
pub fn read_dir(dir: &PathBuf) -> Vec<String> {
    let mut list = Vec::new();

    if let Ok(entries) = std::fs::read_dir(dir) {
        for entry in entries.flatten() {
            if let Some(file_name) = entry.file_name().to_str() {
                list.push(file_name.to_string());
            }
        }
    }

    list
}

/// Rewrite all URLs in `input` to the format `/s/<domain>/<path..>`
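///
/// # Example
///
/// Illustrative sketch of the rewriting (assuming `example.com` is not on the
/// blacklist; the exact output depends on the runtime config):
///
/// ```ignore
/// let html = r#"<a href="https://www.example.com/page">link</a>"#;
/// assert_eq!(
///     internalize_urls(html, "fallback.org"),
///     r#"<a href="/s/example.com/page">link</a>"#
/// );
/// ```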
pub fn internalize_urls(input: &str, base: &str) -> String {
    // todo : fix regex, domains without path are not captured
    let url_pattern = r#"(\ |"|')(?:(<?)(https?:\/\/([a-zA-Z0-9.-]+))?(\/[\w./-]*))"#;
    let re = regex::Regex::new(url_pattern).unwrap();

    re.replace_all(input, |caps: &regex::Captures| {
        if caps.get(2).map(|x| x.as_str()).unwrap_or_default() == "<" {
            return caps.get(0).unwrap().as_str().to_string();
        }

        if caps.get(0).unwrap().as_str() == " //" {
            return " //".to_string();
        }

        let wrap = caps.get(1).map(|x| x.as_str()).unwrap_or_default();

        if let Some(domain) = caps.get(3) {
            let domain = domain.as_str();
            let (protocol, domain) = if domain.starts_with("https://") {
                ("https", domain.trim_start_matches("https://"))
            } else {
                ("http", domain.trim_start_matches("http://"))
            };

            let domain = domain.trim_start_matches("www.");
            let path = caps.get(5).map_or("", |m| m.as_str());

            // Skip transformation if the domain is in the blacklist
            if check_blacklist(domain) {
                format!("{wrap}{protocol}://{domain}{path}")
            } else {
                format!("{wrap}/s/{domain}{path}")
            }
        } else if let Some(path) = caps.get(5) {
            // Handle relative paths
            format!("{wrap}/s/{base}{}", path.as_str())
        } else {
            // Default fallback
            caps[0].to_string()
        }
    })
    .to_string()
}

/// Extract all domains referenced by absolute URLs in `input` (deduplicated, sorted, `www.` prefix stripped)
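///
/// # Example
///
/// A small sketch of the expected behaviour on hypothetical input:
///
/// ```ignore
/// let domains = extract_domains("see https://www.example.com/a and http://foo.org");
/// assert_eq!(domains, vec!["example.com".to_string(), "foo.org".to_string()]);
/// ```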
pub fn extract_domains(input: &str) -> Vec<String> {
    let url_pattern = r"https?://([a-zA-Z0-9.-]+)(/[\w./-]*)?";
    let re = regex::Regex::new(url_pattern).unwrap();

    let mut domains = HashSet::new();
    for caps in re.captures_iter(input) {
        let domain = caps[1].trim_start_matches("www.");
        domains.insert(domain.to_string());
    }

    let mut domains: Vec<_> = domains.into_iter().collect();
    domains.sort();

    domains
}

// TODO : impl archive index to db

/// Represents a directory containing archived websites
#[derive(Debug, Clone)]
pub struct WebsiteArchive {
    pub dir: PathBuf,
}

impl WebsiteArchive {
    /// Creates a new `WebsiteArchive` instance.
    ///
    /// # Parameters
    /// - `dir`: The directory path where the archive will be stored.
    ///
    /// # Returns
    /// A new `WebsiteArchive` instance.
    pub fn new(dir: &str) -> Self {
        Self {
            dir: PathBuf::from(dir),
        }
    }

    /// Retrieves the list of domain names stored in the archive.
    ///
    /// # Returns
    /// A vector of domain names as strings.
    pub fn domains(&self) -> Vec<String> {
        read_dir(&self.dir)
    }

    /// Retrieves a `Domain` instance for a specified domain name.
    ///
    /// # Parameters
    /// - `domain`: The name of the domain to retrieve.
    ///
    /// # Returns
    /// A `Domain` instance corresponding to the specified domain.
    pub fn get_domain(&self, domain: &str) -> Domain {
        Domain::new(domain, self.dir.join(domain))
    }

    /// Archives a URL by downloading and storing its content.
    ///
    /// If the URL does not pass the blacklist check, it will not be archived.
    ///
    /// # Parameters
    /// - `url`: The URL to archive.
    ///
    /// This function downloads the content of the URL, processes it, and saves it to the archive.
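    ///
    /// # Example
    ///
    /// A minimal usage sketch (hypothetical archive directory and URL):
    ///
    /// ```ignore
    /// let archive = WebsiteArchive::new("./websites");
    /// archive.archive_url("https://example.com/blog/post").await;
    /// ```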
    pub async fn archive_url(&self, url: &str) {
        let parsed_url = url::Url::parse(url).unwrap();

        let domain = parsed_url.domain().unwrap().trim_start_matches("www.");

        // Deny blacklist
        if check_blacklist(domain) {
            return;
        }

        let path = parsed_url.path();

        if check_blacklist_path(domain, path) {
            return;
        }

        let mut folder_name = self.dir.join(domain);

        download_fav_for(domain).await;

        for paths in path.split('/') {
            let paths = url_escape::decode(paths).to_string();
            if !paths.is_empty() {
                folder_name = folder_name.join(paths);
            }
        }

        std::fs::create_dir_all(&folder_name).unwrap();

        let timestamp = chrono::Utc::now().format("%Y-%m-%d").to_string();
        let filename = folder_name.join(format!("index_{timestamp}"));

        log::info!("Archiving {url} to {}", filename.to_str().unwrap());

        let conf = get_config()
            .get_domain_config(domain)
            .cloned()
            .unwrap_or_default();
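
        // The snapshot is produced by the external `monolith` CLI; the per-domain
        // config below toggles its flags (e.g. a JS-free archive would roughly be
        // `monolith --isolate -o <file> --no-js --unwrap-noscript <url>`).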
        let mut cmd = vec!["monolith", "--isolate", "-o", filename.to_str().unwrap()];

        if conf.no_audio.unwrap_or_default() {
            cmd.push("--no-audio");
        }

        if conf.no_css.unwrap_or_default() {
            cmd.push("--no-css");
        }

        if conf.no_frames.unwrap_or_default() {
            cmd.push("--no-frames");
        }

        if conf.no_fonts.unwrap_or_default() {
            cmd.push("--no-fonts");
        }

        if conf.no_image.unwrap_or_default() {
            cmd.push("--no-images");
        }

        if conf.no_javascript.unwrap_or_default() {
            cmd.push("--no-js");
            cmd.push("--unwrap-noscript");
        }

        if conf.no_video.unwrap_or_default() {
            cmd.push("--no-video");
        }

        if let Some(ua) = &conf.user_agent {
            cmd.push("--user-agent");
            cmd.push(ua.as_str());
        }

        let mut url = url::Url::parse(&format!("https://{domain}")).unwrap();
        url = url.join(path).unwrap();
        let url = url.to_string();
        cmd.push(&url);

        run_command(&cmd);

        index_path(&self.get_domain(domain), path).await;
    }
}

/// Run an external command with inherited stdio and log a warning if it exits with a non-zero status
fn run_command(cmd: &[&str]) {
    let mut cmd_setup = std::process::Command::new(cmd[0]);
    let cmd_setup = cmd_setup
        .args(cmd.iter().skip(1).collect::<Vec<_>>())
        .stdout(std::process::Stdio::inherit())
        .stderr(std::process::Stdio::inherit());

    let child = cmd_setup.spawn().unwrap();

    let status = child.wait_with_output().unwrap();
    if !status.status.success() {
        log::warn!(
            "Command {cmd:?} exited with code {}",
            status.status.code().unwrap_or_default()
        )
    }
}

/// Index every domain of the archive into the database
pub async fn index_archive_db(arc: &WebsiteArchive) {
    log::info!("Indexing archive");

    for dom in arc.domains() {
        let dom = arc.get_domain(&dom);
        index_path(&dom, "/").await;
    }

    log::info!("Done indexing archive");
}

/// Index all documents below `path` of `dom`, traversing the path tree breadth-first
pub async fn index_path(dom: &Domain, path: &str) {
    let (paths, is_doc) = dom.paths(path);

    // If the path is a document, process the root path.
    if is_doc {
        let doc = dom.path("/");
        index_document(&doc).await;
    }

    // Create a queue to process paths iteratively
    let mut queue = std::collections::VecDeque::new();

    // Add the initial paths to the queue
    queue.extend(paths);

    while let Some(next_path) = queue.pop_front() {
        let (next_paths, is_doc) = dom.paths(next_path.path());

        if is_doc {
            let doc = dom.path(next_path.path());
            log::info!(
                "Indexing {} / {} [{} queued]",
                doc.domain,
                doc.path,
                queue.len()
            );
            index_document(&doc).await;
        }

        queue.extend(next_paths);
    }
}

/// Index a single document: store its data-URL fragments and add an entry
/// to `document_index` for every archived version
pub async fn index_document(doc: &Document) {
    for version_str in &doc.versions() {
        let domain = &doc.domain;
        let path = &doc.path;
        let version =
            if let Ok(version) = chrono::NaiveDate::parse_from_str(&version_str, "%Y-%m-%d") {
                version
            } else {
                log::error!(
                    "Could not parse version {version_str} as valid date for {} / {}",
                    domain,
                    path
                );
                continue;
            };

        if DocumentIndex::exists(domain, path, &version).await {
            log::info!(
                "Document {} / {} @ {} already indexed",
                domain,
                path,
                version
            );
            continue;
        }

        if let Ok(content) = doc
            .render_local(
                Some(version_str.to_string()),
                &Shell::new(Nothing(), Nothing(), Nothing()),
            )
            .await
        {
            let size = content.len();
            let mime = get_mime_type(&content).unwrap_or("text/html".to_string());

            if mime.as_str() == "text/html" {
                // TODO : domain links index

                let mut hashes = Vec::new();

                for (mime, data) in extract_data_urls(&String::from_utf8_lossy(&content)) {
                    let hash = sha256_hash(&data);

                    log::info!("{} / {}: Indexing fragment {hash}", doc.domain, doc.path);

                    hashes.push(hash.clone());
                    sqlx::query("INSERT INTO fragments (id, mime, blob) VALUES ($1, $2, $3) ON CONFLICT DO NOTHING")
                        .bind(&hash)
                        .bind(&mime)
                        .bind(data)
                        .execute(get_pg!()).await.unwrap();
                }

                for hash in hashes {
                    sqlx::query("INSERT INTO document_fragments (domain, path, version, fragment) VALUES ($1, $2, $3, $4) ON CONFLICT DO NOTHING")
                        .bind(&doc.domain)
                        .bind(&doc.path)
                        .bind(chrono::NaiveDate::parse_from_str(version_str, "%Y-%m-%d").unwrap())
                        .bind(&hash)
                        .execute(get_pg!()).await.unwrap();
                }
            }

            if let Ok(version) = chrono::NaiveDate::parse_from_str(version_str, "%Y-%m-%d") {
                sqlx::query(
                    r#"
                    INSERT INTO document_index (domain, path, version, size, mime)
                    VALUES ($1, $2, $3, $4, $5)
                    ON CONFLICT (domain, path, version) DO NOTHING
                    "#,
                )
                .bind(&doc.domain)
                .bind(&doc.path)
                .bind(version)
                .bind(size as i64)
                .bind(mime)
                .execute(get_pg!())
                .await
                .unwrap();
            }
        }
    }
}

/// A row of the `document_index` table describing one archived document version
#[derive(Debug, Clone, FromRow)]
pub struct DocumentIndex {
    pub domain: String,
    pub path: String,
    pub version: chrono::NaiveDate,
    pub size: i64,
    pub mime: String,
}

impl DocumentIndex {
    /// Check whether a document version is already present in the index
    pub async fn exists(domain: &str, path: &str, version: &chrono::NaiveDate) -> bool {
        let res: Option<Self> = sqlx::query_as(
            "SELECT * FROM document_index WHERE domain = $1 AND path = $2 AND version = $3",
        )
        .bind(domain)
        .bind(path)
        .bind(version)
        .fetch_optional(get_pg!())
        .await
        .unwrap();
        res.is_some()
    }
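
    /// Build the in-archive URL for this entry, in the form `/s/<domain>/<path>?time=<version>`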
    pub fn url(&self) -> String {
        format!(
            "/s/{}/{}?time={}",
            self.domain,
            self.path,
            self.version
        )
    }
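
    /// List the paths archived on `day`, grouped by domain and optionally
    /// restricted to a single domain.
    ///
    /// The result maps `domain -> [path, ...]`, e.g. a hypothetical
    /// `{"example.com": ["/", "/blog/post"]}`.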
    pub async fn get_documents_of_day(
        day: NaiveDate,
        domain: Option<&str>,
    ) -> HashMap<String, Vec<String>> {
        let res: Vec<(String, String)> = if let Some(domain) = domain {
            sqlx::query_as(
                "SELECT domain, path FROM document_index WHERE version = $1 AND domain = $2",
            )
            .bind(day)
            .bind(domain)
            .fetch_all(get_pg!())
            .await
            .unwrap()
        } else {
            sqlx::query_as("SELECT domain, path FROM document_index WHERE version = $1")
                .bind(day)
                .fetch_all(get_pg!())
                .await
                .unwrap()
        };

        let mut ret = HashMap::new();

        for (domain, path) in res {
            let d: &mut Vec<String> = ret.entry(domain).or_default();
            d.push(path);
        }

        ret
    }

    /// List all archived documents for `domain` whose MIME type is not `text/html`
    pub async fn get_documents_of_other_mime(domain: &str) -> Vec<DocumentIndex> {
        sqlx::query_as("SELECT * FROM document_index WHERE mime != 'text/html' AND domain = $1")
            .bind(domain)
            .fetch_all(get_pg!())
            .await
            .unwrap()
    }
}