diff --git a/Cargo.lock b/Cargo.lock
index c276ef2..6ee4987 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -164,7 +164,7 @@ checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"
 [[package]]
 name = "based"
 version = "0.1.0"
-source = "git+https://git.hydrar.de/jmarya/based#04852f2fbcc301d0c2b4098f613b9450b4474363"
+source = "git+https://git.hydrar.de/jmarya/based#38373021611149d2ebc6d33a269375ec240527cb"
 dependencies = [
  "bcrypt",
  "chrono",
diff --git a/docker-compose.yml b/docker-compose.yml
index 67564e0..e92401f 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -13,3 +13,5 @@ services:
       - "ROUTE_INTERNAL=true"
       # Download missing routes on demand
       - "DOWNLOAD_ON_DEMAND=true"
+      # Blacklisted domains (Comma-separated regex)
+      - "BLACKLIST_DOMAINS=google.com,.*.youtube.com"
diff --git a/src/archive.rs b/src/archive.rs
index 74cf768..aa29525 100644
--- a/src/archive.rs
+++ b/src/archive.rs
@@ -1,5 +1,10 @@
 use std::path::PathBuf;
 
+use based::request::RequestContext;
+use maud::html;
+
+use crate::{blacklist::check_blacklist, favicon::download_fav_for, pages::render_page};
+
 pub fn read_dir(dir: &PathBuf) -> Vec<String> {
     let mut list = Vec::new();
 
@@ -30,6 +35,7 @@ fn internalize_urls(input: &str) -> String {
         .to_string()
 }
 
+#[derive(Debug, Clone)]
 pub struct WebsiteArchive {
     pub dir: PathBuf,
 }
@@ -41,7 +47,9 @@ pub struct Domain {
 impl Domain {
     pub fn new(name: &str, dir: PathBuf) -> Self {
-        std::fs::create_dir_all(&dir).unwrap();
+        if !check_blacklist(name) {
+            std::fs::create_dir_all(&dir).unwrap();
+        }
         Self {
             name: name.to_string(),
             dir,
         }
@@ -123,7 +131,14 @@
         format!("/s/{}/{}", self.domain, self.path)
     }
 
-    pub fn render_local(&self, version: Option<String>) -> Option<String> {
+    pub async fn render_local(&self, version: Option<String>) -> Option<String> {
+        if check_blacklist(&self.domain) {
+            let content = html! {
+                h3 { "This site is blacklisted" };
+            };
+            return Some(render_page(content, RequestContext::default()).await.1 .1);
+        }
+
         let mut file_path = self.doc_dir();
 
         let latest_version = if let Some(version) = version {
@@ -175,14 +190,24 @@ impl WebsiteArchive {
     }
 
     /// Archive a URL
-    pub fn archive_url(&self, url: &str) {
+    pub async fn archive_url(&self, url: &str) {
         let parsed_url = url::Url::parse(url).unwrap();
 
         let domain = parsed_url.domain().unwrap().trim_start_matches("www.");
+
+        // Deny blacklist
+        if check_blacklist(domain) {
+            return;
+        }
+
         let path = parsed_url.path();
 
         let mut folder_name = self.dir.join(&domain);
+        if !std::fs::exists(&folder_name).unwrap() {
+            download_fav_for(domain).await;
+        }
+
         for paths in path.split('/') {
             if !paths.is_empty() {
                 folder_name = folder_name.join(paths);
diff --git a/src/blacklist.rs b/src/blacklist.rs
new file mode 100644
index 0000000..9a31b69
--- /dev/null
+++ b/src/blacklist.rs
@@ -0,0 +1,18 @@
+pub fn check_blacklist(domain: &str) -> bool {
+    let blacklist_raw = std::env::var("BLACKLIST_DOMAINS").unwrap_or_default();
+
+    if blacklist_raw.is_empty() {
+        return false;
+    }
+
+    let blacklist: Vec<&str> = blacklist_raw.split(',').collect();
+
+    for domain_regex in blacklist {
+        let rgx = regex::Regex::new(domain_regex).unwrap();
+        if rgx.is_match(domain) {
+            return true;
+        }
+    }
+
+    return false;
+}
diff --git a/src/favicon.rs b/src/favicon.rs
index 70f05e6..01d48ed 100644
--- a/src/favicon.rs
+++ b/src/favicon.rs
@@ -15,10 +15,15 @@ pub async fn download_favicon(domain: &str) -> Option<Vec<u8>> {
     Some(favicon_data)
 }
 
-pub async fn download_favicons_for_sites(sites: Vec<String>) {
-    for site in sites {
-        if let Some(fav) = download_favicon(&site).await {
-            std::fs::write(std::path::Path::new("./favicon").join(site), fav).unwrap();
-        }
+pub async fn download_fav_for(site: &str) {
+    if let Some(fav) = download_favicon(&site).await {
+        std::fs::write(std::path::Path::new("./favicon").join(site), fav).unwrap();
+        log::info!("Writting favicon for {site}");
+    }
+}
+
+pub async fn download_favicons_for_sites(sites: &[String]) {
+    for site in sites {
+        download_fav_for(site).await;
     }
 }
diff --git a/src/main.rs b/src/main.rs
index 9aaa595..df8ddb4 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -2,6 +2,7 @@
 use archive::WebsiteArchive;
 use rocket::routes;
 
 mod archive;
+mod blacklist;
 mod favicon;
 mod pages;
@@ -11,7 +12,11 @@
 async fn launch() -> _ {
     let arc = WebsiteArchive::new("./websites");
 
-    favicon::download_favicons_for_sites(arc.domains()).await;
+    let archive = arc.clone();
+
+    tokio::spawn(async move {
+        favicon::download_favicons_for_sites(&archive.domains()).await;
+    });
 
     rocket::build()
         .mount(
diff --git a/src/pages/mod.rs b/src/pages/mod.rs
index ad64349..e0ff696 100644
--- a/src/pages/mod.rs
+++ b/src/pages/mod.rs
@@ -113,11 +113,13 @@ pub async fn render_website(
 ) -> Option<StringResponse> {
     let document = arc.get_domain(domain).path(path.to_str().unwrap());
 
-    let content = document.render_local(if time.is_some() {
-        Some(time.unwrap().to_string())
-    } else {
-        None
-    });
+    let content = document
+        .render_local(if time.is_some() {
+            Some(time.unwrap().to_string())
+        } else {
+            None
+        })
+        .await;
 
     if let Some(content) = content {
         return Some(respond_html(&content));
@@ -127,13 +129,18 @@
         .as_str()
         == "true"
     {
-        arc.archive_url(&format!("https://{domain}/{}", path.to_str().unwrap()));
+        arc.archive_url(&format!("https://{domain}/{}", path.to_str().unwrap()))
+            .await;
 
-        return Some(respond_html(&document.render_local(if time.is_some() {
-            Some(time.unwrap().to_string())
-        } else {
-            None
-        })?));
+        return Some(respond_html(
+            &document
+                .render_local(if time.is_some() {
+                    Some(time.unwrap().to_string())
+                } else {
+                    None
+                })
+                .await?,
+        ));
     }
 }
