This commit is contained in:
parent
56f13c6524
commit
3696f61b02
8 changed files with 524 additions and 446 deletions
207
src/archive/mod.rs
Normal file
207
src/archive/mod.rs
Normal file
|
@ -0,0 +1,207 @@
|
|||
use std::{collections::HashSet, path::PathBuf};
|
||||
|
||||
use crate::{
|
||||
blacklist::{check_blacklist, check_blacklist_path},
|
||||
conf::get_config,
|
||||
favicon::download_fav_for
|
||||
};
|
||||
|
||||
mod document;
|
||||
mod domain;
|
||||
pub use document::Document;
|
||||
pub use domain::*;
|
||||
|
||||
/// Lists the names of all entries found directly inside `dir`.
///
/// Entries that cannot be read, and entries whose file name is not valid
/// UTF-8, are skipped. If `dir` itself cannot be read, the result is empty.
pub fn read_dir(dir: &PathBuf) -> Vec<String> {
    std::fs::read_dir(dir)
        .map(|entries| {
            entries
                .flatten()
                .filter_map(|entry| entry.file_name().to_str().map(str::to_owned))
                .collect()
        })
        .unwrap_or_default()
}
|
||||
|
||||
/// Rewrite all URLs in `input` to the format `/s/<domain>/<path..>`
|
||||
fn internalize_urls(input: &str) -> String {
|
||||
let url_pattern = r"https?://([a-zA-Z0-9.-]+)(/[\w./-]*)";
|
||||
let re = regex::Regex::new(url_pattern).unwrap();
|
||||
|
||||
re.replace_all(input, |caps: ®ex::Captures| {
|
||||
let domain = caps[1].trim_start_matches("www.");
|
||||
let path = &caps[2];
|
||||
|
||||
// Dont transform if in blacklist
|
||||
if check_blacklist(domain) {
|
||||
return format!("https://{domain}/{path}");
|
||||
}
|
||||
|
||||
format!("/s/{domain}/{path}")
|
||||
})
|
||||
.to_string()
|
||||
}
|
||||
|
||||
/// Extract all domains
|
||||
pub fn extract_domains(input: &str) -> Vec<String> {
|
||||
let url_pattern = r"https?://([a-zA-Z0-9.-]+)(/[\w./-]*)?";
|
||||
let re = regex::Regex::new(url_pattern).unwrap();
|
||||
|
||||
let mut domains = HashSet::new();
|
||||
for caps in re.captures_iter(input) {
|
||||
let domain = caps[1].trim_start_matches("www.");
|
||||
domains.insert(domain.to_string());
|
||||
}
|
||||
|
||||
let mut domains: Vec<_> = domains.into_iter().collect();
|
||||
domains.sort();
|
||||
|
||||
domains
|
||||
}
|
||||
|
||||
/// Represents a directory containing archived websites
#[derive(Clone, Debug)]
pub struct WebsiteArchive {
    /// Root directory of the archive on disk.
    pub dir: PathBuf,
}
|
||||
|
||||
impl WebsiteArchive {
|
||||
/// Creates a new `WebsiteArchive` instance.
|
||||
///
|
||||
/// # Parameters
|
||||
/// - `dir`: The directory path where the archive will be stored.
|
||||
///
|
||||
/// # Returns
|
||||
/// A new `WebsiteArchive` instance.
|
||||
pub fn new(dir: &str) -> Self {
|
||||
Self {
|
||||
dir: PathBuf::from(dir),
|
||||
}
|
||||
}
|
||||
|
||||
/// Retrieves the list of domain names stored in the archive.
|
||||
///
|
||||
/// # Returns
|
||||
/// A vector of domain names as strings.
|
||||
pub fn domains(&self) -> Vec<String> {
|
||||
read_dir(&self.dir)
|
||||
}
|
||||
|
||||
/// Retrieves a `Domain` instance for a specified domain name.
|
||||
///
|
||||
/// # Parameters
|
||||
/// - `domain`: The name of the domain to retrieve.
|
||||
///
|
||||
/// # Returns
|
||||
/// A `Domain` instance corresponding to the specified domain.
|
||||
pub fn get_domain(&self, domain: &str) -> Domain {
|
||||
Domain::new(domain, self.dir.join(domain))
|
||||
}
|
||||
|
||||
/// Archives a URL by downloading and storing its content.
|
||||
///
|
||||
/// If the URL does not pass the blacklist check, it will not be archived.
|
||||
///
|
||||
/// # Parameters
|
||||
/// - `url`: The URL to archive.
|
||||
///
|
||||
/// This function downloads the content of the URL, processes it, and saves it to the archive.
|
||||
pub async fn archive_url(&self, url: &str) {
|
||||
let parsed_url = url::Url::parse(url).unwrap();
|
||||
|
||||
let domain = parsed_url.domain().unwrap().trim_start_matches("www.");
|
||||
|
||||
// Deny blacklist
|
||||
if check_blacklist(domain) {
|
||||
return;
|
||||
}
|
||||
|
||||
let path = parsed_url.path();
|
||||
|
||||
if check_blacklist_path(domain, path) {
|
||||
return;
|
||||
}
|
||||
|
||||
let mut folder_name = self.dir.join(domain);
|
||||
|
||||
download_fav_for(domain).await;
|
||||
|
||||
for paths in path.split('/') {
|
||||
let paths = url_escape::decode(paths).to_string();
|
||||
if !paths.is_empty() {
|
||||
folder_name = folder_name.join(paths);
|
||||
}
|
||||
}
|
||||
|
||||
std::fs::create_dir_all(&folder_name).unwrap();
|
||||
|
||||
let timestamp = chrono::Utc::now().format("%Y-%m-%d").to_string();
|
||||
let filename = folder_name.join(format!("index_{timestamp}.html"));
|
||||
|
||||
log::info!("Archiving {url} to {}", filename.to_str().unwrap());
|
||||
|
||||
let conf = get_config()
|
||||
.get_domain_config(domain)
|
||||
.cloned()
|
||||
.unwrap_or_default();
|
||||
|
||||
let mut cmd = vec!["monolith", "--isolate", "-o", filename.to_str().unwrap()];
|
||||
|
||||
if conf.no_audio.unwrap_or_default() {
|
||||
cmd.push("--no-audio");
|
||||
}
|
||||
|
||||
if conf.no_css.unwrap_or_default() {
|
||||
cmd.push("--no-css");
|
||||
}
|
||||
|
||||
if conf.no_frames.unwrap_or_default() {
|
||||
cmd.push("--no-frames");
|
||||
}
|
||||
|
||||
if conf.no_fonts.unwrap_or_default() {
|
||||
cmd.push("--no-frames");
|
||||
}
|
||||
|
||||
if conf.no_image.unwrap_or_default() {
|
||||
cmd.push("--no-images");
|
||||
}
|
||||
|
||||
if conf.no_javascript.unwrap_or_default() {
|
||||
cmd.push("--no-js");
|
||||
cmd.push("--unwrap-noscript");
|
||||
}
|
||||
|
||||
if conf.no_video.unwrap_or_default() {
|
||||
cmd.push("--no-video");
|
||||
}
|
||||
|
||||
if let Some(ua) = &conf.user_agent {
|
||||
cmd.push("--user-agent");
|
||||
cmd.push(ua.as_str());
|
||||
}
|
||||
|
||||
let mut url = url::Url::parse(&format!("https://{domain}")).unwrap();
|
||||
url = url.join(path).unwrap();
|
||||
let url = url.to_string();
|
||||
cmd.push(&url);
|
||||
|
||||
run_command(&cmd);
|
||||
}
|
||||
}
|
||||
|
||||
/// Runs an external command and blocks until it has finished.
///
/// `cmd[0]` is the program to execute; the remaining elements are passed as
/// its arguments. The child's stdout and stderr are inherited from this
/// process.
///
/// # Panics
/// Panics if `cmd` is empty, if the process cannot be spawned or waited on,
/// or if it exits with a non-success status.
fn run_command(cmd: &[&str]) {
    let program = cmd[0];
    let arguments = &cmd[1..];

    let finished = std::process::Command::new(program)
        .args(arguments)
        .stdout(std::process::Stdio::inherit())
        .stderr(std::process::Stdio::inherit())
        .spawn()
        .unwrap()
        .wait_with_output()
        .unwrap();

    assert!(finished.status.success());
}
|
Loading…
Add table
Add a link
Reference in a new issue