use std::path::PathBuf;

use based::request::RequestContext;
use maud::html;

use crate::{blacklist::check_blacklist, favicon::download_fav_for, pages::render_page};

/// Read a directory and return the names of its entries,
/// silently skipping anything that cannot be read.
pub fn read_dir(dir: &PathBuf) -> Vec<String> {
    let mut list = Vec::new();

    if let Ok(entries) = std::fs::read_dir(dir) {
        for entry in entries.flatten() {
            if let Some(file_name) = entry.file_name().to_str() {
                list.push(file_name.to_string());
            }
        }
    }

    list
}

/// Rewrite absolute `http(s)` URLs in `input` to this archive's internal
/// `/s/<domain>/<path>` routes.
fn internalize_urls(input: &str) -> String {
    let url_pattern = r"https?://([a-zA-Z0-9.-]+)(/[\w./-]*)";
    let re = regex::Regex::new(url_pattern).unwrap();

    re.replace_all(input, |caps: &regex::Captures| {
        format!(
            "/s/{}/{}",
            &caps[1].trim_start_matches("www."), // Domain
            &caps[2]                             // Path (keeps its leading slash)
        )
    })
    .to_string()
}

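// A quick sanity check of the rewrite above (an added sketch, not part of
// the original source). Note the doubled slash: the path capture keeps its
// leading `/`, and URLs without any path component do not match at all.
#[cfg(test)]
mod internalize_urls_tests {
    use super::internalize_urls;

    #[test]
    fn rewrites_absolute_urls() {
        let input = "see https://www.example.com/wiki/Rust for details";
        assert_eq!(
            internalize_urls(input),
            "see /s/example.com//wiki/Rust for details"
        );
    }
}
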
/// The on-disk archive of all websites, rooted at `dir`.
#[derive(Debug, Clone)]
pub struct WebsiteArchive {
    pub dir: PathBuf,
}

/// A single archived domain within the archive.
pub struct Domain {
    pub name: String,
    dir: PathBuf,
}

impl Domain {
    /// Create a handle for `name`, creating its directory on disk unless the
    /// domain is blacklisted.
    pub fn new(name: &str, dir: PathBuf) -> Self {
        if !check_blacklist(name) {
            std::fs::create_dir_all(&dir).unwrap();
        }
        Self {
            name: name.to_string(),
            dir,
        }
    }

    /// The document stored at `path` under this domain.
    pub fn path(&self, path: &str) -> Document {
        Document::new(&self.name, path, self.dir.parent().unwrap().to_path_buf())
    }

    /// List the entries under `path`, distinguishing archived documents
    /// (directories that contain an `index_*.html` snapshot) from plain
    /// intermediate paths.
    pub fn paths(&self, path: &str) -> Vec<PathEntry> {
        let mut base_path = self.dir.clone();

        for p in path.split('/') {
            base_path = base_path.join(p);
        }

        let dir_content = read_dir(&base_path);

        let mut ret = Vec::new();

        for entry in dir_content {
            let url_path = format!("{path}/{entry}");
            let is_doc = read_dir(&base_path.join(entry))
                .into_iter()
                .any(|x| x.starts_with("index_") && x.ends_with(".html"));
            if is_doc {
                ret.push(PathEntry::Document(Document::new(
                    &self.name,
                    &url_path,
                    self.dir.parent().unwrap().to_path_buf(),
                )));
            } else {
                ret.push(PathEntry::Path(self.name.clone(), url_path));
            }
        }

        ret
    }
}

/// Either a plain sub-path within a domain or an archived document.
pub enum PathEntry {
    Path(String, String),
    Document(Document),
}

impl PathEntry {
    pub fn url(&self) -> String {
        match self {
            PathEntry::Path(domain, path) => format!("/d/{domain}/{path}"),
            PathEntry::Document(document) => document.url(),
        }
    }

    pub fn path(&self) -> String {
        match self {
            PathEntry::Path(_, path) => path.to_string(),
            PathEntry::Document(document) => document.path.clone(),
        }
    }
}

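// The two URL shapes produced above, spelled out (an added sketch, not part
// of the original source).
#[cfg(test)]
mod path_entry_tests {
    use super::*;

    #[test]
    fn entry_urls() {
        let plain = PathEntry::Path("example.com".to_string(), "wiki/Rust".to_string());
        assert_eq!(plain.url(), "/d/example.com/wiki/Rust");

        let doc = PathEntry::Document(Document::new(
            "example.com",
            "wiki/Rust",
            PathBuf::from("/tmp/archive"),
        ));
        assert_eq!(doc.url(), "/s/example.com/wiki/Rust");
    }
}
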
/// One archived page: a domain, a path below it, and the archive root.
pub struct Document {
    pub domain: String,
    pub path: String,
    base_dir: PathBuf,
}

impl Document {
    pub fn new(domain: &str, path: &str, base_dir: PathBuf) -> Self {
        Self {
            domain: domain.to_string(),
            path: path.to_string(),
            base_dir,
        }
    }

    /// The internal route this document is served under.
    pub fn url(&self) -> String {
        format!("/s/{}/{}", self.domain, self.path)
    }

    /// Render the archived HTML for `version` (or the latest snapshot),
    /// returning `None` if nothing matching exists on disk.
    pub async fn render_local(&self, version: Option<String>) -> Option<String> {
        if check_blacklist(&self.domain) {
            let content = html! {
                h3 { "This site is blacklisted" }
            };
            return Some(render_page(content, RequestContext::default()).await.1 .1);
        }

        let mut file_path = self.doc_dir();

        let latest_version = if let Some(version) = version {
            format!("index_{version}.html")
        } else {
            let versions = self.versions();
            versions.first().cloned()?
        };

        file_path = file_path.join(latest_version);

        let content = std::fs::read_to_string(file_path).ok()?;

        if std::env::var("ROUTE_INTERNAL").unwrap_or("false".to_string()) == "true" {
            Some(internalize_urls(&content))
        } else {
            Some(content)
        }
    }

    /// The on-disk directory backing this document:
    /// `<base_dir>/<domain>/<path segments...>`.
    pub fn doc_dir(&self) -> PathBuf {
        let mut file_path = self.base_dir.join(&self.domain);

        for p in self.path.split('/') {
            file_path = file_path.join(p);
        }

        file_path
    }

    /// List this document's archived versions, newest first.
    pub fn versions(&self) -> Vec<String> {
        let mut versions = read_dir(&self.doc_dir());
        // Keep only `index_*.html` snapshots; their date-stamped names sort
        // chronologically, so a descending sort puts the newest first (which
        // `render_local` relies on when picking the latest version).
        versions.retain(|entry| entry.starts_with("index_"));
        versions.sort_unstable_by(|a, b| b.cmp(a));
        versions
    }
}

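// A quick check of the path/URL mapping above (an added sketch, not part of
// the original source).
#[cfg(test)]
mod document_tests {
    use super::Document;

    #[test]
    fn doc_dir_and_url() {
        let doc = Document::new("example.com", "wiki/Rust", "/tmp/archive".into());
        assert_eq!(
            doc.doc_dir(),
            std::path::PathBuf::from("/tmp/archive/example.com/wiki/Rust")
        );
        assert_eq!(doc.url(), "/s/example.com/wiki/Rust");
    }
}
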
impl WebsiteArchive {
    pub fn new(dir: &str) -> Self {
        Self {
            dir: PathBuf::from(dir),
        }
    }

    /// List all archived domains.
    pub fn domains(&self) -> Vec<String> {
        read_dir(&self.dir)
    }

    pub fn get_domain(&self, domain: &str) -> Domain {
        Domain::new(domain, self.dir.join(domain))
    }

    /// Archive a URL by downloading it with `monolith` into the on-disk store.
    pub async fn archive_url(&self, url: &str) {
        let parsed_url = url::Url::parse(url).unwrap();

        let domain = parsed_url.domain().unwrap().trim_start_matches("www.");

        // Deny blacklisted domains
        if check_blacklist(domain) {
            return;
        }

        let path = parsed_url.path();

        let mut folder_name = self.dir.join(domain);

        // First time this domain is archived: also grab its favicon.
        if !std::fs::exists(&folder_name).unwrap() {
            download_fav_for(domain).await;
        }

        for segment in path.split('/') {
            if !segment.is_empty() {
                folder_name = folder_name.join(segment);
            }
        }

        std::fs::create_dir_all(&folder_name).unwrap();

        let timestamp = chrono::Utc::now().format("%Y-%m-%d").to_string();
        let filename = folder_name.join(format!("index_{timestamp}.html"));

        log::info!("Archiving {url} to {}", filename.to_str().unwrap());

        run_command(&[
            "monolith",
            "-I",
            "-o",
            filename.to_str().unwrap(),
            // `path` already starts with '/', so no extra separator is needed.
            &format!("https://{domain}{path}"),
        ]);
    }
}

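// Example wiring (an added sketch, not part of the original source; the
// directory and URL are placeholders, and `monolith` must be installed):
//
//     let archive = WebsiteArchive::new("./websites");
//     archive.archive_url("https://en.wikipedia.org/wiki/Rust").await;
//     let doc = archive.get_domain("en.wikipedia.org").path("wiki/Rust");
//     let html = doc.render_local(None).await;
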
// TODO:
// - full text search
// - add new sites?
// - transparent auto page downloading
// - redownload after threshold

/// Run `cmd` (program name followed by its arguments) with inherited stdio,
/// panicking if it cannot be spawned or exits unsuccessfully.
fn run_command(cmd: &[&str]) {
    let mut cmd_setup = std::process::Command::new(cmd[0]);
    let cmd_setup = cmd_setup
        .args(&cmd[1..])
        .stdout(std::process::Stdio::inherit())
        .stderr(std::process::Stdio::inherit());

    let child = cmd_setup.spawn().unwrap();

    let status = child.wait_with_output().unwrap();
    assert!(status.status.success());
}
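
// For example (added for illustration): `run_command(&["echo", "done"]);`
// runs `echo done` and panics if it fails to spawn or exits non-zero.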