fix + refactor

JMARyA 2025-01-11 16:21:15 +01:00
parent 56f13c6524
commit 3696f61b02
Signed by: jmarya
GPG key ID: 901B2ADDF27C2263
8 changed files with 524 additions and 446 deletions

src/archive/mod.rs (new file, +207 lines)

@@ -0,0 +1,207 @@
use std::{collections::HashSet, path::PathBuf};

use crate::{
    blacklist::{check_blacklist, check_blacklist_path},
    conf::get_config,
    favicon::download_fav_for,
};

mod document;
mod domain;

pub use document::Document;
pub use domain::*;

/// Read directory entries into `Vec<String>`
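///
/// # Example
/// A minimal sketch (illustrative; the result depends on the directory contents):
/// ```ignore
/// let entries = read_dir(&PathBuf::from("./websites"));
/// println!("{entries:?}");
/// ```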
pub fn read_dir(dir: &PathBuf) -> Vec<String> {
    let mut list = Vec::new();

    if let Ok(entries) = std::fs::read_dir(dir) {
        for entry in entries.flatten() {
            if let Some(file_name) = entry.file_name().to_str() {
                list.push(file_name.to_string());
            }
        }
    }

    list
}

/// Rewrite all URLs in `input` to the format `/s/<domain>/<path..>`
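///
/// # Example
/// Assuming `example.com` is not blacklisted:
/// ```ignore
/// assert_eq!(
///     internalize_urls("see https://www.example.com/page"),
///     "see /s/example.com/page"
/// );
/// ```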
fn internalize_urls(input: &str) -> String {
    let url_pattern = r"https?://([a-zA-Z0-9.-]+)(/[\w./-]*)";
    let re = regex::Regex::new(url_pattern).unwrap();

    re.replace_all(input, |caps: &regex::Captures| {
        let domain = caps[1].trim_start_matches("www.");
        let path = &caps[2];

        // Don't transform blacklisted URLs; keep the original match unchanged.
        if check_blacklist(domain) {
            return caps[0].to_string();
        }

        // `path` already starts with '/', so no extra separator is needed.
        format!("/s/{domain}{path}")
    })
    .to_string()
}

/// Extract all unique domains from `input`, sorted alphabetically
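///
/// # Example
/// ```ignore
/// let domains = extract_domains("via https://www.example.com/a and http://foo.org");
/// assert_eq!(domains, vec!["example.com", "foo.org"]);
/// ```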
pub fn extract_domains(input: &str) -> Vec<String> {
    let url_pattern = r"https?://([a-zA-Z0-9.-]+)(/[\w./-]*)?";
    let re = regex::Regex::new(url_pattern).unwrap();

    let mut domains = HashSet::new();
    for caps in re.captures_iter(input) {
        let domain = caps[1].trim_start_matches("www.");
        domains.insert(domain.to_string());
    }

    let mut domains: Vec<_> = domains.into_iter().collect();
    domains.sort();
    domains
}

/// Represents a directory containing archived websites
#[derive(Debug, Clone)]
pub struct WebsiteArchive {
    pub dir: PathBuf,
}

impl WebsiteArchive {
    /// Creates a new `WebsiteArchive` instance.
    ///
    /// # Parameters
    /// - `dir`: The directory path where the archive will be stored.
    ///
    /// # Returns
    /// A new `WebsiteArchive` instance.
    pub fn new(dir: &str) -> Self {
        Self {
            dir: PathBuf::from(dir),
        }
    }

    /// Retrieves the list of domain names stored in the archive.
    ///
    /// # Returns
    /// A vector of domain names as strings.
    pub fn domains(&self) -> Vec<String> {
        read_dir(&self.dir)
    }

    /// Retrieves a `Domain` instance for a specified domain name.
    ///
    /// # Parameters
    /// - `domain`: The name of the domain to retrieve.
    ///
    /// # Returns
    /// A `Domain` instance corresponding to the specified domain.
    pub fn get_domain(&self, domain: &str) -> Domain {
        Domain::new(domain, self.dir.join(domain))
    }

    /// Archives a URL by downloading and storing its content.
    ///
    /// If the URL does not pass the blacklist check, it will not be archived.
    ///
    /// # Parameters
    /// - `url`: The URL to archive.
    ///
    /// This function downloads the content of the URL, processes it, and saves it to the archive.
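    ///
    /// # Example
    /// A usage sketch (assumes a writable archive directory and the `monolith` binary on `PATH`):
    /// ```ignore
    /// let archive = WebsiteArchive::new("./websites");
    /// archive.archive_url("https://example.com/page").await;
    /// ```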
    pub async fn archive_url(&self, url: &str) {
        let parsed_url = url::Url::parse(url).unwrap();
        let domain = parsed_url.domain().unwrap().trim_start_matches("www.");

        // Deny blacklisted domains and paths
        if check_blacklist(domain) {
            return;
        }

        let path = parsed_url.path();

        if check_blacklist_path(domain, path) {
            return;
        }

        let mut folder_name = self.dir.join(domain);

        download_fav_for(domain).await;

        // Mirror the URL path as a directory hierarchy under the domain folder.
        for segment in path.split('/') {
            let segment = url_escape::decode(segment).to_string();
            if !segment.is_empty() {
                folder_name = folder_name.join(segment);
            }
        }

        std::fs::create_dir_all(&folder_name).unwrap();

        let timestamp = chrono::Utc::now().format("%Y-%m-%d").to_string();
        let filename = folder_name.join(format!("index_{timestamp}.html"));

        log::info!("Archiving {url} to {}", filename.to_str().unwrap());

        let conf = get_config()
            .get_domain_config(domain)
            .cloned()
            .unwrap_or_default();

        let mut cmd = vec!["monolith", "--isolate", "-o", filename.to_str().unwrap()];

        if conf.no_audio.unwrap_or_default() {
            cmd.push("--no-audio");
        }

        if conf.no_css.unwrap_or_default() {
            cmd.push("--no-css");
        }

        if conf.no_frames.unwrap_or_default() {
            cmd.push("--no-frames");
        }

        if conf.no_fonts.unwrap_or_default() {
            cmd.push("--no-fonts");
        }

        if conf.no_image.unwrap_or_default() {
            cmd.push("--no-images");
        }

        if conf.no_javascript.unwrap_or_default() {
            cmd.push("--no-js");
            cmd.push("--unwrap-noscript");
        }

        if conf.no_video.unwrap_or_default() {
            cmd.push("--no-video");
        }

        if let Some(ua) = &conf.user_agent {
            cmd.push("--user-agent");
            cmd.push(ua.as_str());
        }

        // Rebuild a canonical URL (without the `www.` prefix) to hand to monolith.
        let mut url = url::Url::parse(&format!("https://{domain}")).unwrap();
        url = url.join(path).unwrap();
        let url = url.to_string();
        cmd.push(&url);

        run_command(&cmd);
    }
}

fn run_command(cmd: &[&str]) {
    let mut cmd_setup = std::process::Command::new(cmd[0]);
    let cmd_setup = cmd_setup
        .args(&cmd[1..])
        .stdout(std::process::Stdio::inherit())
        .stderr(std::process::Stdio::inherit());

    let child = cmd_setup.spawn().unwrap();
    let status = child.wait_with_output().unwrap();

    assert!(status.status.success());
}
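
// A minimal test sketch for the pure helpers above; `extract_domains`
// trims `www.` prefixes, deduplicates, and sorts the result.
#[cfg(test)]
mod tests {
    use super::extract_domains;

    #[test]
    fn extracts_unique_sorted_domains() {
        let input = "see https://www.example.com/a and http://foo.org and https://example.com/b";
        assert_eq!(extract_domains(input), vec!["example.com", "foo.org"]);
    }
}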