This commit is contained in:
parent
56f13c6524
commit
3696f61b02
8 changed files with 524 additions and 446 deletions

Cargo.lock (generated, 10 changes)

@@ -3569,6 +3569,15 @@ dependencies = [
  "percent-encoding",
 ]
+
+[[package]]
+name = "url-escape"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "44e0ce4d1246d075ca5abec4b41d33e87a6054d08e2366b63205665e950db218"
+dependencies = [
+ "percent-encoding",
+]

 [[package]]
 name = "utf-8"
 version = "0.7.6"

@@ -3752,6 +3761,7 @@ dependencies = [
  "tokio",
  "toml",
  "url",
+ "url-escape",
  "uuid",
 ]

Cargo.toml (1 addition)

@@ -24,3 +24,4 @@ pgvector = { version = "0.4", features = ["sqlx"] }
 html2md = "0.2.14"
 clap = { version = "4.5.23", features = ["cargo", "derive"] }
 toml = "0.8.19"
+url-escape = "0.1.1"

config.toml (17 changes)

@@ -8,7 +8,8 @@ DOWNLOAD_ON_DEMAND=true
 [websites]
 # You can blacklist sites which won't work well
 BLACKLIST_DOMAINS = [
-"^gitlab" # All domains starting with gitlab
+"^gitlab", # All domains starting with gitlab
+"youtube" # YouTube
 ]

 # Domain configuration (Example)

@@ -56,3 +57,17 @@ no_javascript = true
 [[websites.domains]]
 domain = "github.com"
 no_javascript = true
+
+[[websites.domains]]
+domain = "en.wikipedia.org"
+no_javascript = true
+
+[[websites.domains]]
+domain = "api.flutter.dev"
+no_javascript = true
+no_video = true
+
+[[websites.domains]]
+domain = "docs.flutter.dev"
+no_javascript = true
+no_video = true
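
The per-domain keys above line up with the options that `archive_url` reads through `get_config().get_domain_config(domain)`. The config types themselves live in `conf.rs`, which this commit does not touch; the following is only a sketch of a compatible shape, with serde derives and field names inferred from how the values are used in `src/archive/mod.rs` below:

use serde::Deserialize;

// Hypothetical mirror of the per-domain options consumed by archive_url();
// the real definition in conf.rs is not part of this diff.
#[derive(Debug, Default, Clone, Deserialize)]
pub struct DomainConfig {
    pub domain: String,
    pub no_audio: Option<bool>,
    pub no_css: Option<bool>,
    pub no_frames: Option<bool>,
    pub no_fonts: Option<bool>,
    pub no_image: Option<bool>,
    pub no_javascript: Option<bool>,
    pub no_video: Option<bool>,
    pub user_agent: Option<String>,
}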

src/archive.rs (441 deletions)

@@ -1,441 +0,0 @@
-use std::{collections::HashSet, io::Read, path::PathBuf};
-
-use based::{request::RequestContext, result::LogAndIgnore};
-use maud::html;
-
-use crate::{
-    blacklist::{check_blacklist, check_blacklist_path},
-    conf::get_config,
-    favicon::download_fav_for,
-    render_page,
-};
-
-/// Read directory entries into `Vec<String>`
-pub fn read_dir(dir: &PathBuf) -> Vec<String> {
-    let mut list = Vec::new();
-
-    if let Ok(entries) = std::fs::read_dir(dir) {
-        for entry in entries.flatten() {
-            if let Some(file_name) = entry.file_name().to_str() {
-                list.push(file_name.to_string());
-            }
-        }
-    }
-
-    list
-}
-
-/// Rewrite all URLs in `input` to the format `/s/<domain>/<path..>`
-fn internalize_urls(input: &str) -> String {
-    let url_pattern = r"https?://([a-zA-Z0-9.-]+)(/[\w./-]*)";
-    let re = regex::Regex::new(url_pattern).unwrap();
-
-    re.replace_all(input, |caps: &regex::Captures| {
-        let domain = caps[1].trim_start_matches("www.");
-        let path = &caps[2];
-
-        // Don't transform if in blacklist
-        if check_blacklist(domain) {
-            return format!("https://{domain}/{path}");
-        }
-
-        format!("/s/{domain}/{path}")
-    })
-    .to_string()
-}
-
-/// Extract all domains
-pub fn extract_domains(input: &str) -> Vec<String> {
-    let url_pattern = r"https?://([a-zA-Z0-9.-]+)(/[\w./-]*)?";
-    let re = regex::Regex::new(url_pattern).unwrap();
-
-    let mut domains = HashSet::new();
-    for caps in re.captures_iter(input) {
-        let domain = caps[1].trim_start_matches("www.");
-        domains.insert(domain.to_string());
-    }
-
-    let mut domains: Vec<_> = domains.into_iter().collect();
-    domains.sort();
-
-    domains
-}
-
-/// Represents a directory containing archived websites
-#[derive(Debug, Clone)]
-pub struct WebsiteArchive {
-    pub dir: PathBuf,
-}
-
-/// Represents a domain within the website archive
-pub struct Domain {
-    /// Domain name
-    pub name: String,
-    dir: PathBuf,
-}
-
-impl Domain {
-    /// Creates a new `Domain` instance.
-    ///
-    /// If the domain name is not blacklisted, a directory is created.
-    ///
-    /// # Parameters
-    /// - `name`: The name of the domain.
-    /// - `dir`: The directory path for the domain.
-    ///
-    /// # Returns
-    /// A new `Domain` instance.
-    pub fn new(name: &str, dir: PathBuf) -> Self {
-        if !check_blacklist(name) {
-            std::fs::create_dir_all(&dir)
-                .log_err_and_ignore(&format!("Could not create domain dir {name}"));
-        }
-        Self {
-            name: name.to_string(),
-            dir,
-        }
-    }
-
-    /// Resolves a specific path within the domain and returns a `Document` representing it.
-    ///
-    /// # Parameters
-    /// - `path`: The path to resolve within the domain.
-    ///
-    /// # Returns
-    /// A `Document` instance corresponding to the given path.
-    pub fn path(&self, path: &str) -> Document {
-        Document::new(&self.name, path, self.dir.parent().unwrap().to_path_buf())
-    }
-
-    /// Get all paths associated with the domain
-    pub fn all_paths(&self) -> Vec<PathEntry> {
-        let mut queue = self.paths("/").0;
-
-        let mut ret = Vec::new();
-
-        ret.push(PathEntry(self.name.clone(), "/".to_string()));
-
-        while let Some(el) = queue.pop() {
-            ret.push(el.clone());
-            let paths = self.paths(&el.1).0;
-            queue.extend(paths);
-        }
-
-        ret
-    }
-
-    /// Retrieves entries and metadata for a given path within the domain.
-    ///
-    /// # Parameters
-    /// - `path`: The path to inspect.
-    ///
-    /// # Returns
-    /// A tuple containing:
-    /// - A vector of `PathEntry` instances representing the contents of the path.
-    /// - A boolean indicating whether the path is itself a `Document`
-    pub fn paths(&self, path: &str) -> (Vec<PathEntry>, bool) {
-        let mut base_path = self.dir.clone();
-
-        for p in path.split('/') {
-            base_path = base_path.join(p);
-        }
-
-        let path = path
-            .split("/")
-            .filter(|x| !x.is_empty())
-            .collect::<Vec<&str>>()
-            .join("/");
-
-        let dir_content = read_dir(&base_path);
-
-        let mut ret = Vec::new();
-
-        let mut is_doc = false;
-
-        for entry in dir_content {
-            let url_path = format!("{path}/{entry}");
-            let url_path = url_path
-                .split("/")
-                .filter(|x| !x.is_empty())
-                .collect::<Vec<&str>>()
-                .join("/");
-            if entry.starts_with("index_") && entry.ends_with(".html") {
-                is_doc = true;
-                continue;
-            }
-
-            ret.push(PathEntry(self.name.clone(), url_path));
-        }
-
-        (ret, is_doc)
-    }
-}
-
-/// Represents an entry within a domain's path, containing its name and URL path.
-#[derive(Debug, Clone)]
-pub struct PathEntry(String, String);
-
-impl PathEntry {
-    pub fn url(&self) -> String {
-        format!("/d/{}/{}", self.0, self.1)
-    }
-
-    pub fn path(&self) -> &String {
-        &self.1
-    }
-}
-
-/// Represents a document within a domain
-pub struct Document {
-    /// The domain associated with the document.
-    pub domain: String,
-    /// The path of the document within the domain.
-    pub path: String,
-    base_dir: PathBuf,
-}
-
-impl Document {
-    /// Creates a new `Document` instance.
-    ///
-    /// # Parameters
-    /// - `domain`: The domain to which the document belongs.
-    /// - `path`: The path of the document within the domain.
-    /// - `base_dir`: The base directory of the archive storage.
-    ///
-    /// # Returns
-    /// A new `Document` instance.
-    pub fn new(domain: &str, path: &str, base_dir: PathBuf) -> Self {
-        let split = path
-            .split('/')
-            .filter(|x| !x.is_empty())
-            .collect::<Vec<&str>>();
-
-        Self {
-            domain: domain.to_string(),
-            path: if split.is_empty() {
-                "/".to_string()
-            } else {
-                split.join("/")
-            },
-            base_dir,
-        }
-    }
-
-    /// Renders the document, returning its content as a string.
-    ///
-    /// If the environment variable `$ROUTE_INTERNAL` is set to `true`, all links will be rewritten to point to internal archived routes.
-    ///
-    /// # Parameters
-    /// - `version`: An optional version of the document to render in the format `YYYY-MM-DD`.
-    ///
-    /// # Returns
-    /// An `Option` containing the rendered content as a string, or `None` if nothing could be rendered.
-    pub async fn render_local(&self, version: Option<String>) -> Option<String> {
-        if check_blacklist(&self.domain) {
-            let content = html! {
-                h3 { "This site is blacklisted" };
-            };
-            return Some(render_page(content, RequestContext::default()).await.1 .1);
-        }
-
-        let mut file_path = self.doc_dir();
-
-        let latest_version = if let Some(version) = version {
-            format!("index_{version}.html")
-        } else {
-            let versions = self.versions();
-            let version = versions.first().cloned()?;
-            format!("index_{version}.html")
-        };
-
-        file_path = file_path.join(latest_version);
-
-        let mut buf = Vec::new();
-        std::fs::File::open(file_path)
-            .ok()?
-            .read_to_end(&mut buf)
-            .unwrap();
-        let content = String::from_utf8_lossy(&buf);
-
-        if get_config().ROUTE_INTERNAL {
-            Some(internalize_urls(&content))
-        } else {
-            Some(content.to_string())
-        }
-    }
-
-    /// Determines the directory where the document is stored.
-    ///
-    /// # Returns
-    /// A `PathBuf` representing the document directory.
-    pub fn doc_dir(&self) -> PathBuf {
-        let mut file_path = self.base_dir.join(&self.domain);
-
-        for p in self.path.split('/').filter(|x| !x.is_empty()) {
-            file_path = file_path.join(p);
-        }
-
-        file_path
-    }
-
-    /// Retrieves available versions of the document.
-    ///
-    /// # Returns
-    /// A vector of strings representing the available versions of the document, sorted in descending order.
-    pub fn versions(&self) -> Vec<String> {
-        let mut res: Vec<String> = read_dir(&self.doc_dir())
-            .into_iter()
-            .filter_map(|x| {
-                if x.starts_with("index_") && x.ends_with(".html") {
-                    return Some(
-                        x.trim_start_matches("index_")
-                            .trim_end_matches(".html")
-                            .to_string(),
-                    );
-                }
-
-                None
-            })
-            .collect();
-        res.sort();
-        res.reverse();
-        res
-    }
-}
-
-impl WebsiteArchive {
-    /// Creates a new `WebsiteArchive` instance.
-    ///
-    /// # Parameters
-    /// - `dir`: The directory path where the archive will be stored.
-    ///
-    /// # Returns
-    /// A new `WebsiteArchive` instance.
-    pub fn new(dir: &str) -> Self {
-        Self {
-            dir: PathBuf::from(dir),
-        }
-    }
-
-    /// Retrieves the list of domain names stored in the archive.
-    ///
-    /// # Returns
-    /// A vector of domain names as strings.
-    pub fn domains(&self) -> Vec<String> {
-        read_dir(&self.dir)
-    }
-
-    /// Retrieves a `Domain` instance for a specified domain name.
-    ///
-    /// # Parameters
-    /// - `domain`: The name of the domain to retrieve.
-    ///
-    /// # Returns
-    /// A `Domain` instance corresponding to the specified domain.
-    pub fn get_domain(&self, domain: &str) -> Domain {
-        Domain::new(domain, self.dir.join(domain))
-    }
-
-    /// Archives a URL by downloading and storing its content.
-    ///
-    /// If the URL does not pass the blacklist check, it will not be archived.
-    ///
-    /// # Parameters
-    /// - `url`: The URL to archive.
-    ///
-    /// This function downloads the content of the URL, processes it, and saves it to the archive.
-    pub async fn archive_url(&self, url: &str) {
-        let parsed_url = url::Url::parse(url).unwrap();
-
-        let domain = parsed_url.domain().unwrap().trim_start_matches("www.");
-
-        // Deny blacklist
-        if check_blacklist(domain) {
-            return;
-        }
-
-        let path = parsed_url.path();
-
-        if check_blacklist_path(domain, path) {
-            return;
-        }
-
-        let mut folder_name = self.dir.join(domain);
-
-        download_fav_for(domain).await;
-
-        for paths in path.split('/') {
-            if !paths.is_empty() {
-                folder_name = folder_name.join(paths);
-            }
-        }
-
-        std::fs::create_dir_all(&folder_name).unwrap();
-
-        let timestamp = chrono::Utc::now().format("%Y-%m-%d").to_string();
-        let filename = folder_name.join(format!("index_{timestamp}.html"));
-
-        log::info!("Archiving {url} to {}", filename.to_str().unwrap());
-
-        let conf = get_config()
-            .get_domain_config(domain)
-            .cloned()
-            .unwrap_or_default();
-
-        let mut cmd = vec!["monolith", "--isolate", "-o", filename.to_str().unwrap()];
-
-        if conf.no_audio.unwrap_or_default() {
-            cmd.push("--no-audio");
-        }
-
-        if conf.no_css.unwrap_or_default() {
-            cmd.push("--no-css");
-        }
-
-        if conf.no_frames.unwrap_or_default() {
-            cmd.push("--no-frames");
-        }
-
-        if conf.no_fonts.unwrap_or_default() {
-            cmd.push("--no-fonts");
-        }
-
-        if conf.no_image.unwrap_or_default() {
-            cmd.push("--no-images");
-        }
-
-        if conf.no_javascript.unwrap_or_default() {
-            cmd.push("--no-js");
-            cmd.push("--unwrap-noscript");
-        }
-
-        if conf.no_video.unwrap_or_default() {
-            cmd.push("--no-video");
-        }
-
-        if let Some(ua) = &conf.user_agent {
-            cmd.push("--user-agent");
-            cmd.push(ua.as_str());
-        }
-
-        let mut url = url::Url::parse(&format!("https://{domain}")).unwrap();
-        url = url.join(path).unwrap();
-        let url = url.to_string();
-        cmd.push(&url);
-
-        run_command(&cmd);
-    }
-}
-
-fn run_command(cmd: &[&str]) {
-    let mut cmd_setup = std::process::Command::new(cmd[0]);
-    let cmd_setup = cmd_setup
-        .args(cmd.iter().skip(1).collect::<Vec<_>>())
-        .stdout(std::process::Stdio::inherit())
-        .stderr(std::process::Stdio::inherit());
-
-    let child = cmd_setup.spawn().unwrap();
-
-    let status = child.wait_with_output().unwrap();
-    assert!(status.status.success());
-}

src/archive/document.rs (new file, 126 lines)

@@ -0,0 +1,126 @@
+use std::{io::Read, path::PathBuf};
+
+use based::request::RequestContext;
+use maud::html;
+
+use crate::{blacklist::check_blacklist, conf::get_config, render_page};
+
+use super::{internalize_urls, read_dir};
+
+/// Represents a document within a domain
+pub struct Document {
+    /// The domain associated with the document.
+    pub domain: String,
+    /// The path of the document within the domain.
+    pub path: String,
+    base_dir: PathBuf,
+}
+
+impl Document {
+    /// Creates a new `Document` instance.
+    ///
+    /// # Parameters
+    /// - `domain`: The domain to which the document belongs.
+    /// - `path`: The path of the document within the domain.
+    /// - `base_dir`: The base directory of the archive storage.
+    ///
+    /// # Returns
+    /// A new `Document` instance.
+    pub fn new(domain: &str, path: &str, base_dir: PathBuf) -> Self {
+        let split = path
+            .split('/')
+            .filter(|x| !x.is_empty())
+            .collect::<Vec<&str>>();
+
+        Self {
+            domain: domain.to_string(),
+            path: if split.is_empty() {
+                "/".to_string()
+            } else {
+                split.join("/")
+            },
+            base_dir,
+        }
+    }
+
+    /// Renders the document, returning its content as a string.
+    ///
+    /// If the environment variable `$ROUTE_INTERNAL` is set to `true`, all links will be rewritten to point to internal archived routes.
+    ///
+    /// # Parameters
+    /// - `version`: An optional version of the document to render in the format `YYYY-MM-DD`.
+    ///
+    /// # Returns
+    /// An `Option` containing the rendered content as a string, or `None` if nothing could be rendered.
+    pub async fn render_local(&self, version: Option<String>) -> Option<String> {
+        if check_blacklist(&self.domain) {
+            let content = html! {
+                h3 { "This site is blacklisted" };
+            };
+            return Some(render_page(content, RequestContext::default()).await.1 .1);
+        }
+
+        let mut file_path = self.doc_dir();
+
+        let latest_version = if let Some(version) = version {
+            format!("index_{version}.html")
+        } else {
+            let versions = self.versions();
+            let version = versions.first().cloned()?;
+            format!("index_{version}.html")
+        };
+
+        file_path = file_path.join(latest_version);
+
+        let mut buf = Vec::new();
+        std::fs::File::open(file_path)
+            .ok()?
+            .read_to_end(&mut buf)
+            .unwrap();
+        let content = String::from_utf8_lossy(&buf);
+
+        if get_config().ROUTE_INTERNAL {
+            Some(internalize_urls(&content))
+        } else {
+            Some(content.to_string())
+        }
+    }
+
+    /// Determines the directory where the document is stored.
+    ///
+    /// # Returns
+    /// A `PathBuf` representing the document directory.
+    pub fn doc_dir(&self) -> PathBuf {
+        let mut file_path = self.base_dir.join(&self.domain);
+
+        for p in self.path.split('/').filter(|x| !x.is_empty()) {
+            file_path = file_path.join(p);
+        }
+
+        file_path
+    }
+
+    /// Retrieves available versions of the document.
+    ///
+    /// # Returns
+    /// A vector of strings representing the available versions of the document, sorted in descending order.
+    pub fn versions(&self) -> Vec<String> {
+        let mut res: Vec<String> = read_dir(&self.doc_dir())
+            .into_iter()
+            .filter_map(|x| {
+                if x.starts_with("index_") && x.ends_with(".html") {
+                    return Some(
+                        x.trim_start_matches("index_")
+                            .trim_end_matches(".html")
+                            .to_string(),
+                    );
+                }
+
+                None
+            })
+            .collect();
+        res.sort();
+        res.reverse();
+        res
+    }
+}
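
Because `versions()` simply string-sorts the `index_*.html` stems and reverses them, the ISO `YYYY-MM-DD` timestamps come back newest-first. An illustrative sketch (the domain, path, and snapshot dates are invented):

// Given index_2024-03-01.html and index_2024-05-09.html inside doc_dir():
let doc = Document::new("example.org", "/wiki/page", PathBuf::from("./websites"));
assert_eq!(doc.versions(), vec!["2024-05-09", "2024-03-01"]);

// render_local(None) therefore serves the 2024-05-09 snapshot, while
// render_local(Some("2024-03-01".to_string())) picks the older one.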

src/archive/domain.rs (new file, 126 lines)

@@ -0,0 +1,126 @@
+use std::path::PathBuf;
+
+use based::result::LogAndIgnore;
+
+use crate::blacklist::check_blacklist;
+
+use super::{read_dir, Document};
+
+/// Represents a domain within the website archive
+pub struct Domain {
+    /// Domain name
+    pub name: String,
+    dir: PathBuf,
+}
+
+impl Domain {
+    /// Creates a new `Domain` instance.
+    ///
+    /// If the domain name is not blacklisted, a directory is created.
+    ///
+    /// # Parameters
+    /// - `name`: The name of the domain.
+    /// - `dir`: The directory path for the domain.
+    ///
+    /// # Returns
+    /// A new `Domain` instance.
+    pub fn new(name: &str, dir: PathBuf) -> Self {
+        if !check_blacklist(name) {
+            std::fs::create_dir_all(&dir)
+                .log_err_and_ignore(&format!("Could not create domain dir {name}"));
+        }
+        Self {
+            name: name.to_string(),
+            dir,
+        }
+    }
+
+    /// Resolves a specific path within the domain and returns a `Document` representing it.
+    ///
+    /// # Parameters
+    /// - `path`: The path to resolve within the domain.
+    ///
+    /// # Returns
+    /// A `Document` instance corresponding to the given path.
+    pub fn path(&self, path: &str) -> Document {
+        Document::new(&self.name, path, self.dir.parent().unwrap().to_path_buf())
+    }
+
+    /// Get all paths associated with the domain
+    pub fn all_paths(&self) -> Vec<PathEntry> {
+        let mut queue = self.paths("/").0;
+
+        let mut ret = Vec::new();
+
+        ret.push(PathEntry(self.name.clone(), "/".to_string()));
+
+        while let Some(el) = queue.pop() {
+            ret.push(el.clone());
+            let paths = self.paths(&el.1).0;
+            queue.extend(paths);
+        }
+
+        ret
+    }
+
+    /// Retrieves entries and metadata for a given path within the domain.
+    ///
+    /// # Parameters
+    /// - `path`: The path to inspect.
+    ///
+    /// # Returns
+    /// A tuple containing:
+    /// - A vector of `PathEntry` instances representing the contents of the path.
+    /// - A boolean indicating whether the path is itself a `Document`
+    pub fn paths(&self, path: &str) -> (Vec<PathEntry>, bool) {
+        let mut base_path = self.dir.clone();
+
+        for p in path.split('/') {
+            base_path = base_path.join(p);
+        }
+
+        let path = path
+            .split("/")
+            .filter(|x| !x.is_empty())
+            .collect::<Vec<&str>>()
+            .join("/");
+
+        let dir_content = read_dir(&base_path);
+
+        let mut ret = Vec::new();
+
+        let mut is_doc = false;
+
+        for entry in dir_content {
+            let url_path = format!("{path}/{entry}");
+            let url_path = url_path
+                .split("/")
+                .filter(|x| !x.is_empty())
+                .collect::<Vec<&str>>()
+                .join("/");
+            if entry.starts_with("index_") && entry.ends_with(".html") {
+                is_doc = true;
+                continue;
+            }
+
+            ret.push(PathEntry(self.name.clone(), url_path));
+        }
+
+        (ret, is_doc)
+    }
+}
+
+/// Represents an entry within a domain's path, containing its name and URL path.
+#[derive(Debug, Clone)]
+pub struct PathEntry(String, String);
+
+impl PathEntry {
+    pub fn url(&self) -> String {
+        format!("/d/{}/{}", self.0, self.1)
+    }
+
+    pub fn path(&self) -> &String {
+        &self.1
+    }
+}
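
`PathEntry` keeps its tuple fields private, so values are only constructed inside the `archive` module; `url()` prefixes the `/d/` browse route. A small sketch (domain and path invented, and only valid from within the module):

let entry = PathEntry("example.org".to_string(), "wiki/page".to_string());
assert_eq!(entry.url(), "/d/example.org/wiki/page");
assert_eq!(entry.path(), "wiki/page");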

src/archive/mod.rs (new file, 207 lines)

@@ -0,0 +1,207 @@
+use std::{collections::HashSet, path::PathBuf};
+
+use crate::{
+    blacklist::{check_blacklist, check_blacklist_path},
+    conf::get_config,
+    favicon::download_fav_for,
+};
+
+mod document;
+mod domain;
+pub use document::Document;
+pub use domain::*;
+
+/// Read directory entries into `Vec<String>`
+pub fn read_dir(dir: &PathBuf) -> Vec<String> {
+    let mut list = Vec::new();
+
+    if let Ok(entries) = std::fs::read_dir(dir) {
+        for entry in entries.flatten() {
+            if let Some(file_name) = entry.file_name().to_str() {
+                list.push(file_name.to_string());
+            }
+        }
+    }
+
+    list
+}
+
+/// Rewrite all URLs in `input` to the format `/s/<domain>/<path..>`
+fn internalize_urls(input: &str) -> String {
+    let url_pattern = r"https?://([a-zA-Z0-9.-]+)(/[\w./-]*)";
+    let re = regex::Regex::new(url_pattern).unwrap();
+
+    re.replace_all(input, |caps: &regex::Captures| {
+        let domain = caps[1].trim_start_matches("www.");
+        let path = &caps[2];
+
+        // Don't transform if in blacklist
+        if check_blacklist(domain) {
+            return format!("https://{domain}/{path}");
+        }
+
+        format!("/s/{domain}/{path}")
+    })
+    .to_string()
+}
+
+/// Extract all domains
+pub fn extract_domains(input: &str) -> Vec<String> {
+    let url_pattern = r"https?://([a-zA-Z0-9.-]+)(/[\w./-]*)?";
+    let re = regex::Regex::new(url_pattern).unwrap();
+
+    let mut domains = HashSet::new();
+    for caps in re.captures_iter(input) {
+        let domain = caps[1].trim_start_matches("www.");
+        domains.insert(domain.to_string());
+    }
+
+    let mut domains: Vec<_> = domains.into_iter().collect();
+    domains.sort();
+
+    domains
+}
+
+/// Represents a directory containing archived websites
+#[derive(Debug, Clone)]
+pub struct WebsiteArchive {
+    pub dir: PathBuf,
+}
+
+impl WebsiteArchive {
+    /// Creates a new `WebsiteArchive` instance.
+    ///
+    /// # Parameters
+    /// - `dir`: The directory path where the archive will be stored.
+    ///
+    /// # Returns
+    /// A new `WebsiteArchive` instance.
+    pub fn new(dir: &str) -> Self {
+        Self {
+            dir: PathBuf::from(dir),
+        }
+    }
+
+    /// Retrieves the list of domain names stored in the archive.
+    ///
+    /// # Returns
+    /// A vector of domain names as strings.
+    pub fn domains(&self) -> Vec<String> {
+        read_dir(&self.dir)
+    }
+
+    /// Retrieves a `Domain` instance for a specified domain name.
+    ///
+    /// # Parameters
+    /// - `domain`: The name of the domain to retrieve.
+    ///
+    /// # Returns
+    /// A `Domain` instance corresponding to the specified domain.
+    pub fn get_domain(&self, domain: &str) -> Domain {
+        Domain::new(domain, self.dir.join(domain))
+    }
+
+    /// Archives a URL by downloading and storing its content.
+    ///
+    /// If the URL does not pass the blacklist check, it will not be archived.
+    ///
+    /// # Parameters
+    /// - `url`: The URL to archive.
+    ///
+    /// This function downloads the content of the URL, processes it, and saves it to the archive.
+    pub async fn archive_url(&self, url: &str) {
+        let parsed_url = url::Url::parse(url).unwrap();
+
+        let domain = parsed_url.domain().unwrap().trim_start_matches("www.");
+
+        // Deny blacklist
+        if check_blacklist(domain) {
+            return;
+        }
+
+        let path = parsed_url.path();
+
+        if check_blacklist_path(domain, path) {
+            return;
+        }
+
+        let mut folder_name = self.dir.join(domain);
+
+        download_fav_for(domain).await;
+
+        for paths in path.split('/') {
+            let paths = url_escape::decode(paths).to_string();
+            if !paths.is_empty() {
+                folder_name = folder_name.join(paths);
+            }
+        }
+
+        std::fs::create_dir_all(&folder_name).unwrap();
+
+        let timestamp = chrono::Utc::now().format("%Y-%m-%d").to_string();
+        let filename = folder_name.join(format!("index_{timestamp}.html"));
+
+        log::info!("Archiving {url} to {}", filename.to_str().unwrap());
+
+        let conf = get_config()
+            .get_domain_config(domain)
+            .cloned()
+            .unwrap_or_default();
+
+        let mut cmd = vec!["monolith", "--isolate", "-o", filename.to_str().unwrap()];
+
+        if conf.no_audio.unwrap_or_default() {
+            cmd.push("--no-audio");
+        }
+
+        if conf.no_css.unwrap_or_default() {
+            cmd.push("--no-css");
+        }
+
+        if conf.no_frames.unwrap_or_default() {
+            cmd.push("--no-frames");
+        }
+
+        if conf.no_fonts.unwrap_or_default() {
+            cmd.push("--no-fonts");
+        }
+
+        if conf.no_image.unwrap_or_default() {
+            cmd.push("--no-images");
+        }
+
+        if conf.no_javascript.unwrap_or_default() {
+            cmd.push("--no-js");
+            cmd.push("--unwrap-noscript");
+        }
+
+        if conf.no_video.unwrap_or_default() {
+            cmd.push("--no-video");
+        }
+
+        if let Some(ua) = &conf.user_agent {
+            cmd.push("--user-agent");
+            cmd.push(ua.as_str());
+        }
+
+        let mut url = url::Url::parse(&format!("https://{domain}")).unwrap();
+        url = url.join(path).unwrap();
+        let url = url.to_string();
+        cmd.push(&url);
+
+        run_command(&cmd);
+    }
+}
+
+fn run_command(cmd: &[&str]) {
+    let mut cmd_setup = std::process::Command::new(cmd[0]);
+    let cmd_setup = cmd_setup
+        .args(cmd.iter().skip(1).collect::<Vec<_>>())
+        .stdout(std::process::Stdio::inherit())
+        .stderr(std::process::Stdio::inherit());
+
+    let child = cmd_setup.spawn().unwrap();
+
+    let status = child.wait_with_output().unwrap();
+    assert!(status.status.success());
+}
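
The one functional change in this move: path segments are now percent-decoded with the new `url-escape` dependency before they become folder names. A usage sketch, assuming an async context and an invented archive directory and URL:

let archive = WebsiteArchive::new("./websites");

// archive_url shells out to monolith with the per-domain flags from
// config.toml. "Caf%C3%A9" is decoded first, so the snapshot is stored
// under .../en.wikipedia.org/wiki/Café/index_<date>.html instead of a
// literal "Caf%C3%A9" directory.
archive.archive_url("https://en.wikipedia.org/wiki/Caf%C3%A9").await;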

@@ -7,7 +7,7 @@ use based::{
     },
 };
 use maud::{html, PreEscaped};
-use rocket::{get, State};
+use rocket::{get, request::FromSegments, State};

 pub mod component;
 use component::*;

@@ -174,15 +174,49 @@ pub async fn render_txt_website(
     Some(html2md::parse_html(&content))
 }

+pub struct PathSegment {
+    segments: Vec<String>,
+}
+
+impl PathSegment {
+    pub fn to_str(&self) -> String {
+        self.segments.join("/")
+    }
+}
+
+impl<'r> FromSegments<'r> for PathSegment {
+    type Error = ();
+
+    fn from_segments(segments: rocket::http::uri::Segments<'r, rocket::http::uri::fmt::Path>) -> Result<Self, Self::Error> {
+        let paths: Vec<_> = segments
+            .filter_map(|x| {
+                if x == "." {
+                    return None;
+                }
+
+                if x == ".." {
+                    return None;
+                }
+
+                Some(x.to_string())
+            })
+            .collect();
+
+        Ok(PathSegment {
+            segments: paths,
+        })
+    }
+}
+
 /// Return archived version of `domain` / `path` at `time`
 #[get("/s/<domain>/<path..>?<time>")]
 pub async fn render_website(
     domain: &str,
-    path: PathBuf,
+    path: PathSegment,
     time: Option<&str>,
     arc: &State<WebsiteArchive>,
 ) -> Option<DataResponse> {
-    let document = arc.get_domain(domain).path(path.to_str().unwrap());
+    let document = arc.get_domain(domain).path(&path.to_str());

     let content = document
         .render_local(time.map(|time| time.to_string()))

@@ -195,7 +229,7 @@ pub async fn render_website(
             Some(60 * 60 * 24),
         ));
     } else if get_config().DOWNLOAD_ON_DEMAND && time.is_none() {
-        arc.archive_url(&format!("https://{domain}/{}", path.to_str().unwrap()))
+        arc.archive_url(&format!("https://{domain}/{}", path.to_str()))
             .await;

         let content = document.render_local(None).await?;
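
`PathSegment` replaces Rocket's `PathBuf` guard so that raw, still percent-encoded segments survive the route match (they are decoded later in `archive_url`). Note that `.` and `..` are dropped outright rather than resolved, which neutralizes traversal attempts. The same filter in isolation, with a function name invented for the sketch:

fn filter_segments<'a>(raw: impl Iterator<Item = &'a str>) -> String {
    raw.filter_map(|x| {
        if x == "." || x == ".." {
            return None;
        }
        Some(x.to_string())
    })
    .collect::<Vec<_>>()
    .join("/")
}

fn main() {
    // ".." is removed, not resolved, so it cannot climb out of the archive:
    assert_eq!(
        filter_segments(["wiki", "..", "Caf%C3%A9"].into_iter()),
        "wiki/Caf%C3%A9"
    );
}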