webarc/src/archive/mod.rs
use std::{
collections::{HashMap, HashSet},
path::PathBuf,
};
use crate::{
blacklist::{check_blacklist, check_blacklist_path},
conf::get_config,
extract_data_urls,
favicon::download_fav_for,
get_mime_type, sha256_hash,
};
mod document;
mod domain;
mod fragment;
use based::{
get_pg,
ui::{components::prelude::Shell, prelude::Nothing},
};
use chrono::NaiveDate;
pub use document::Document;
pub use domain::*;
pub use fragment::*;
use sqlx::prelude::FromRow;
/// Read directory entries into `Vec<String>`
pub fn read_dir(dir: &PathBuf) -> Vec<String> {
let mut list = Vec::new();
if let Ok(entries) = std::fs::read_dir(dir) {
for entry in entries.flatten() {
if let Some(file_name) = entry.file_name().to_str() {
list.push(file_name.to_string());
}
}
}
list
}
/// Rewrite all URLs in `input` to the format `/s/<domain>/<path..>`
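///
/// # Example
/// A minimal sketch (not run as a doc-test), assuming `example.com` is not blacklisted;
/// `fallback.org` is an arbitrary base domain, only used for relative URLs:
/// ```ignore
/// let html = r#"<a href="https://example.com/page">link</a>"#;
/// let rewritten = internalize_urls(html, "fallback.org");
/// assert_eq!(rewritten, r#"<a href="/s/example.com/page">link</a>"#);
/// ```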
pub fn internalize_urls(input: &str, base: &str) -> String {
// TODO: fix regex, domains without a path are not captured
let url_pattern = r#"(\ |"|')(?:(<?)(https?:\/\/([a-zA-Z0-9.-]+))?(\/[\w./-]*))"#;
let re = regex::Regex::new(url_pattern).unwrap();
re.replace_all(input, |caps: &regex::Captures| {
if caps.get(2).map(|x| x.as_str()).unwrap_or_default() == "<" {
return caps.get(0).unwrap().as_str().to_string();
}
if caps.get(0).unwrap().as_str() == " //" {
return " //".to_string();
}
let wrap = caps.get(1).map(|x| x.as_str()).unwrap_or_default();
if let Some(domain) = caps.get(3) {
let domain = domain.as_str();
let (protocol, domain) = if domain.starts_with("https://") {
("https", domain.trim_start_matches("https://"))
} else {
("http", domain.trim_start_matches("http://"))
};
let domain = domain.trim_start_matches("www.");
let path = caps.get(5).map_or("", |m| m.as_str());
// Skip transformation if the domain is in the blacklist
if check_blacklist(domain) {
format!("{wrap}{protocol}://{domain}{path}")
} else {
format!("{wrap}/s/{domain}{path}")
}
} else if let Some(path) = caps.get(5) {
// Handle relative paths
format!("{wrap}/s/{base}{}", path.as_str())
} else {
// Default fallback
caps[0].to_string()
}
})
.to_string()
}
/// Extract all domains
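///
/// # Example
/// A minimal sketch (not run as a doc-test); domains are deduplicated, stripped of `www.` and sorted:
/// ```ignore
/// let domains = extract_domains("see https://www.example.com/a and http://foo.org/b");
/// assert_eq!(domains, vec!["example.com".to_string(), "foo.org".to_string()]);
/// ```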
pub fn extract_domains(input: &str) -> Vec<String> {
let url_pattern = r"https?://([a-zA-Z0-9.-]+)(/[\w./-]*)?";
let re = regex::Regex::new(url_pattern).unwrap();
let mut domains = HashSet::new();
for caps in re.captures_iter(input) {
let domain = caps[1].trim_start_matches("www.");
domains.insert(domain.to_string());
}
let mut domains: Vec<_> = domains.into_iter().collect();
domains.sort();
domains
}
// TODO: implement archive index in the database
/// Represents a directory containing archived websites
#[derive(Debug, Clone)]
pub struct WebsiteArchive {
pub dir: PathBuf,
}
impl WebsiteArchive {
/// Creates a new `WebsiteArchive` instance.
///
/// # Parameters
/// - `dir`: The directory path where the archive will be stored.
///
/// # Returns
/// A new `WebsiteArchive` instance.
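///
/// # Example
/// A minimal sketch (not run as a doc-test); `./archive` is an arbitrary example directory:
/// ```ignore
/// let archive = WebsiteArchive::new("./archive");
/// for domain in archive.domains() {
///     println!("archived domain: {domain}");
/// }
/// ```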
pub fn new(dir: &str) -> Self {
Self {
dir: PathBuf::from(dir),
}
}
/// Retrieves the list of domain names stored in the archive.
///
/// # Returns
/// A vector of domain names as strings.
pub fn domains(&self) -> Vec<String> {
read_dir(&self.dir)
}
/// Retrieves a `Domain` instance for a specified domain name.
///
/// # Parameters
/// - `domain`: The name of the domain to retrieve.
///
/// # Returns
/// A `Domain` instance corresponding to the specified domain.
pub fn get_domain(&self, domain: &str) -> Domain {
Domain::new(domain, self.dir.join(domain))
}
/// Archives a URL by downloading and storing its content.
///
/// If the URL does not pass the blacklist check, it will not be archived.
///
/// # Parameters
/// - `url`: The URL to archive.
///
/// This function downloads the content of the URL, processes it, and saves it to the archive.
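///
/// # Example
/// A minimal sketch (not run as a doc-test); requires the `monolith` binary on `PATH`
/// and a configured database, and the URL is an arbitrary example:
/// ```ignore
/// let archive = WebsiteArchive::new("./archive");
/// archive.archive_url("https://example.com/some/page").await;
/// ```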
pub async fn archive_url(&self, url: &str) {
let parsed_url = url::Url::parse(url).unwrap();
let domain = parsed_url.domain().unwrap().trim_start_matches("www.");
// Deny blacklist
if check_blacklist(domain) {
return;
}
let path = parsed_url.path();
if check_blacklist_path(domain, path) {
return;
}
let mut folder_name = self.dir.join(domain);
download_fav_for(domain).await;
for segment in path.split('/') {
let segment = url_escape::decode(segment).to_string();
if !segment.is_empty() {
folder_name = folder_name.join(segment);
}
}
std::fs::create_dir_all(&folder_name).unwrap();
let timestamp = chrono::Utc::now().format("%Y-%m-%d").to_string();
let filename = folder_name.join(format!("index_{timestamp}"));
log::info!("Archiving {url} to {}", filename.to_str().unwrap());
let conf = get_config()
.get_domain_config(domain)
.cloned()
.unwrap_or_default();
let mut cmd = vec!["monolith", "--isolate", "-o", filename.to_str().unwrap()];
if conf.no_audio.unwrap_or_default() {
cmd.push("--no-audio");
}
if conf.no_css.unwrap_or_default() {
cmd.push("--no-css");
}
if conf.no_frames.unwrap_or_default() {
cmd.push("--no-frames");
}
if conf.no_fonts.unwrap_or_default() {
cmd.push("--no-fonts");
}
if conf.no_image.unwrap_or_default() {
cmd.push("--no-images");
}
if conf.no_javascript.unwrap_or_default() {
cmd.push("--no-js");
cmd.push("--unwrap-noscript");
}
if conf.no_video.unwrap_or_default() {
cmd.push("--no-video");
}
if let Some(ua) = &conf.user_agent {
cmd.push("--user-agent");
cmd.push(ua.as_str());
}
let mut url = url::Url::parse(&format!("https://{domain}")).unwrap();
url = url.join(path).unwrap();
let url = url.to_string();
cmd.push(&url);
run_command(&cmd);
index_path(&self.get_domain(domain), path).await;
}
}
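/// Run an external command with inherited stdout/stderr, logging a warning if it exits with a non-zero status.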
fn run_command(cmd: &[&str]) {
let mut cmd_setup = std::process::Command::new(cmd[0]);
let cmd_setup = cmd_setup
.args(&cmd[1..])
.stdout(std::process::Stdio::inherit())
.stderr(std::process::Stdio::inherit());
let child = cmd_setup.spawn().unwrap();
let status = child.wait_with_output().unwrap();
if !status.status.success() {
log::warn!(
"Command {cmd:?} exited with code {}",
status.status.code().unwrap_or_default()
)
}
}
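/// Walk every domain in the archive and index all of its documents into the database.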
pub async fn index_archive_db(arc: &WebsiteArchive) {
log::info!("Indexing archive");
for dom in arc.domains() {
let dom = arc.get_domain(&dom);
index_path(&dom, "/").await;
}
log::info!("Done indexing archive");
}
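/// Index all documents under `path` for the given domain, traversing sub-paths iteratively with a breadth-first queue.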
pub async fn index_path(dom: &Domain, path: &str) {
let (paths, is_doc) = dom.paths(path);
// If the path is a document, process the root path.
if is_doc {
let doc = dom.path("/");
index_document(&doc).await;
}
// Create a queue to process paths iteratively
let mut queue = std::collections::VecDeque::new();
// Add the initial paths to the queue
queue.extend(paths);
while let Some(next_path) = queue.pop_front() {
let (next_paths, is_doc) = dom.paths(next_path.path());
if is_doc {
let doc = dom.path(next_path.path());
log::info!(
"Indexing {} / {} [{} queued]",
doc.domain,
doc.path,
queue.len()
);
index_document(&doc).await;
}
queue.extend(next_paths);
}
}
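/// Index every archived version of a document: for HTML documents, extract and store embedded
/// data-URL fragments, then record size and MIME type in `document_index`.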
pub async fn index_document(doc: &Document) {
for version_str in &doc.versions() {
let domain = &doc.domain;
let path = &doc.path;
let version =
if let Ok(version) = chrono::NaiveDate::parse_from_str(version_str, "%Y-%m-%d") {
version
} else {
log::error!(
"Could not parse version {version_str} as valid date for {} / {}",
domain,
path
);
continue;
};
if DocumentIndex::exists(domain, path, &version).await {
log::info!(
"Document {} / {} @ {} already indexed",
domain,
path,
version
);
continue;
}
if let Ok(content) = doc
.render_local(
Some(version_str.to_string()),
&Shell::new(Nothing(), Nothing(), Nothing()),
)
.await
{
let size = content.len();
let mime = get_mime_type(&content).unwrap_or("text/html".to_string());
if mime.as_str() == "text/html" {
// TODO: domain links index
let mut hashes = Vec::new();
for (mime, data) in extract_data_urls(&String::from_utf8_lossy(&content)) {
let hash = sha256_hash(&data);
log::info!("{} / {}: Indexing fragment {hash}", doc.domain, doc.path);
hashes.push(hash.clone());
sqlx::query("INSERT INTO fragments (id, mime, blob) VALUES ($1, $2, $3) ON CONFLICT DO NOTHING")
.bind(&hash)
.bind(&mime)
.bind(data)
.execute(get_pg!()).await.unwrap();
}
for hash in hashes {
sqlx::query("INSERT INTO document_fragments (domain, path, version, fragment) VALUES ($1, $2, $3, $4) ON CONFLICT DO NOTHING")
.bind(&doc.domain)
.bind(&doc.path)
.bind(version)
.bind(&hash)
.execute(get_pg!()).await.unwrap();
}
}
// `version` was already parsed and validated at the top of the loop.
sqlx::query(
r#"
INSERT INTO document_index (domain, path, version, size, mime)
VALUES ($1, $2, $3, $4, $5)
ON CONFLICT (domain, path, version) DO NOTHING
"#,
)
.bind(&doc.domain)
.bind(&doc.path)
.bind(version)
.bind(size as i64)
.bind(mime)
.execute(get_pg!())
.await
.unwrap();
}
}
}
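/// A row of the `document_index` table describing one archived document version.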
#[derive(Debug, Clone, FromRow)]
pub struct DocumentIndex {
pub domain: String,
pub path: String,
pub version: chrono::NaiveDate,
pub size: i64,
pub mime: String,
}
impl DocumentIndex {
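/// Check whether a document version is already present in the index.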
pub async fn exists(domain: &str, path: &str, version: &chrono::NaiveDate) -> bool {
let res: Option<Self> = sqlx::query_as(
"SELECT * FROM document_index WHERE domain = $1 AND path = $2 AND version = $3",
)
.bind(domain)
.bind(path)
.bind(version)
.fetch_optional(get_pg!())
.await
.unwrap();
res.is_some()
}
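/// Build the internal URL under which this document version is served.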
pub fn url(&self) -> String {
format!("/s/{}/{}?time={}", self.domain, self.path, self.version)
}
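/// Return all documents archived on `day`, optionally filtered by domain, grouped as domain -> paths.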
pub async fn get_documents_of_day(
day: NaiveDate,
domain: Option<&str>,
) -> HashMap<String, Vec<String>> {
let res: Vec<(String, String)> = if let Some(domain) = domain {
sqlx::query_as(
"SELECT domain, path FROM document_index WHERE version = $1 WHERE domain = $2",
)
.bind(day)
.bind(domain)
.fetch_all(get_pg!())
.await
.unwrap()
} else {
sqlx::query_as("SELECT domain, path FROM document_index WHERE version = $1")
.bind(day)
.fetch_all(get_pg!())
.await
.unwrap()
};
let mut ret = HashMap::new();
for (domain, path) in res {
let d: &mut Vec<String> = ret.entry(domain).or_default();
d.push(path);
}
ret
}
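/// Return all indexed documents of a domain whose MIME type is not `text/html`.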
pub async fn get_documents_of_other_mime(domain: &str) -> Vec<DocumentIndex> {
sqlx::query_as("SELECT * FROM document_index WHERE mime != 'text/html' AND domain = $1")
.bind(domain)
.fetch_all(get_pg!())
.await
.unwrap()
}
}