add per domain config
Some checks failed
ci/woodpecker/push/build Pipeline failed

This commit is contained in:
JMARyA 2025-01-03 13:34:59 +01:00
parent dc10052c16
commit 536f42a4e8
Signed by: jmarya
GPG key ID: 901B2ADDF27C2263
4 changed files with 164 additions and 18 deletions

View file

@ -11,6 +11,48 @@ BLACKLIST_DOMAINS = [
"^gitlab" # All domains starting with gitlab "^gitlab" # All domains starting with gitlab
] ]
# Domain configuration (Example)
[[websites.domains]]
# The domain the config applies to
domain = "example.com"
# Blacklisted paths (regexes matched against the URL path)
blacklist_paths = ["/.*"]
# Exclude <audio> tags
no_audio = false
# Exclude <video> tags
no_video = false
# Exclude <img> tags
no_image = false
# Exclude CSS
no_css = false
# Exclude JavaScript
no_javascript = false
# Exclude fonts
no_fonts = false
# Exclude iframes
no_frames = false
# User Agent
user_agent = "Safari"
[ai] [ai]
# Ollama URL (Enables vector search) # Ollama URL (Enables vector search)
OLLAMA_URL="127.0.0.1:11434" OLLAMA_URL="127.0.0.1:11434"
# --- Website Config
[[websites.domains]]
domain = "developer.mozilla.org"
no_javascript = true
[[websites.domains]]
domain = "github.com"
no_javascript = true

View file

@ -3,7 +3,12 @@ use std::{collections::HashSet, io::Read, path::PathBuf};
use based::{request::RequestContext, result::LogAndIgnore}; use based::{request::RequestContext, result::LogAndIgnore};
use maud::html; use maud::html;
use crate::{blacklist::check_blacklist, conf::get_config, favicon::download_fav_for, render_page}; use crate::{
blacklist::{check_blacklist, check_blacklist_path},
conf::get_config,
favicon::download_fav_for,
render_page,
};
/// Read directory entries into `Vec<String>` /// Read directory entries into `Vec<String>`
pub fn read_dir(dir: &PathBuf) -> Vec<String> { pub fn read_dir(dir: &PathBuf) -> Vec<String> {
@ -340,7 +345,6 @@ impl WebsiteArchive {
/// ///
/// This function downloads the content of the URL, processes it, and saves it to the archive. /// This function downloads the content of the URL, processes it, and saves it to the archive.
pub async fn archive_url(&self, url: &str) { pub async fn archive_url(&self, url: &str) {
// TODO : refactor
let parsed_url = url::Url::parse(url).unwrap(); let parsed_url = url::Url::parse(url).unwrap();
let domain = parsed_url.domain().unwrap().trim_start_matches("www."); let domain = parsed_url.domain().unwrap().trim_start_matches("www.");
@ -352,6 +356,10 @@ impl WebsiteArchive {
let path = parsed_url.path(); let path = parsed_url.path();
if check_blacklist_path(domain, path) {
return;
}
let mut folder_name = self.dir.join(domain); let mut folder_name = self.dir.join(domain);
download_fav_for(domain).await; download_fav_for(domain).await;
@ -369,21 +377,56 @@ impl WebsiteArchive {
log::info!("Archiving {url} to {}", filename.to_str().unwrap()); log::info!("Archiving {url} to {}", filename.to_str().unwrap());
run_command(&[ let conf = get_config()
"monolith", .get_domain_config(domain)
"-I", .cloned()
"-o", .unwrap_or_default();
filename.to_str().unwrap(),
&format!("https://{}/{}", domain, path), let mut cmd = vec!["monolith", "--isolate", "-o", filename.to_str().unwrap()];
]);
if conf.no_audio {
cmd.push("--no-audio");
}
if conf.no_css {
cmd.push("--no-css");
}
if conf.no_frames {
cmd.push("--no-frames");
}
// Fonts have their own monolith switch; this branch previously pushed
// `--no-frames` a second time, so `no_fonts` never excluded fonts.
if conf.no_fonts {
    cmd.push("--no-fonts");
}
if conf.no_image {
cmd.push("--no-images");
}
if conf.no_javascript {
cmd.push("--no-js");
cmd.push("--unwrap-noscript");
}
if conf.no_video {
cmd.push("--no-video");
}
if let Some(ua) = &conf.user_agent {
cmd.push("--user-agent");
cmd.push(ua.as_str());
}
let mut url = url::Url::parse(&format!("https://{domain}")).unwrap();
url = url.join(path).unwrap();
let url = url.to_string();
cmd.push(&url);
run_command(&cmd);
} }
} }
// full text search
// add new sites?
// transparent auto page downloading
// redownload after threshold
fn run_command(cmd: &[&str]) { fn run_command(cmd: &[&str]) {
let mut cmd_setup = std::process::Command::new(cmd[0]); let mut cmd_setup = std::process::Command::new(cmd[0]);
let cmd_setup = cmd_setup let cmd_setup = cmd_setup

View file

@ -7,13 +7,37 @@ pub fn check_blacklist(domain: &str) -> bool {
let conf = get_config(); let conf = get_config();
let conf = conf.websites.as_ref(); let conf = conf.websites.as_ref();
// TODO : Block IPs
// Test SSRF
let blacklisted_domains = conf let blacklisted_domains = conf
.map(|x| x.BLACKLIST_DOMAINS.as_ref()) .map(|x| x.BLACKLIST_DOMAINS.as_ref())
.unwrap_or_default(); .unwrap_or_default();
for domain_regex in blacklisted_domains.unwrap_or(&Vec::new()) { check_regex(domain, blacklisted_domains.unwrap_or(&Vec::new()))
let rgx = regex::Regex::new(domain_regex).unwrap(); }
if rgx.is_match(domain) {
/// Check whether `path` is blacklisted for `domain`.
///
/// Looks up the per-domain entry through `Config::get_domain_config` (exact match on the
/// entry's `domain` field) and tests `path` against that entry's `blacklist_paths` regexes
/// using `check_regex`.
///
/// Returns `false` when the domain has no entry or the entry defines no `blacklist_paths`.
pub fn check_blacklist_path(domain: &str, path: &str) -> bool {
    // Bind the config first so the borrowed domain entry does not point into a temporary.
    let conf = get_config();

    conf.get_domain_config(domain)
        .and_then(|domain_conf| domain_conf.blacklist_paths.as_ref())
        .map_or(false, |patterns| check_regex(path, patterns))
}
pub fn check_regex(input: &str, regexes: &Vec<String>) -> bool {
for regex in regexes {
let rgx = regex::Regex::new(regex).unwrap();
if rgx.is_match(input) {
return true; return true;
} }
} }

View file

@ -38,6 +38,19 @@ pub struct Config {
pub websites: Option<WebsiteConfig>, pub websites: Option<WebsiteConfig>,
} }
impl Config {
    /// Find the per-domain configuration whose `domain` field equals `domain`.
    ///
    /// Yields `None` when there is no `websites` section, no `domains` list,
    /// or no entry with a matching `domain`.
    pub fn get_domain_config(&self, domain: &str) -> Option<&DomainConfig> {
        self.websites
            .as_ref()
            .and_then(|site_conf| site_conf.domains.as_ref())
            .and_then(|entries| entries.iter().find(|entry| entry.domain == domain))
    }
}
#[allow(non_snake_case)] #[allow(non_snake_case)]
#[derive(Debug, Clone, Deserialize)] #[derive(Debug, Clone, Deserialize)]
pub struct AIConfig { pub struct AIConfig {
@ -53,9 +66,33 @@ pub struct WebsiteConfig {
#[derive(Debug, Clone, Deserialize)] #[derive(Debug, Clone, Deserialize)]
pub struct DomainConfig { pub struct DomainConfig {
// TODO : Domain specific config pub domain: String,
pub blacklist_paths: Option<Vec<String>>, pub blacklist_paths: Option<Vec<String>>,
pub no_audio: bool,
pub no_video: bool,
pub no_image: bool,
pub no_css: bool,
pub no_javascript: bool, pub no_javascript: bool,
pub no_fonts: bool,
pub no_frames: bool,
pub user_agent: Option<String>,
}
// Neutral per-domain settings: empty domain name, no path blacklist, every
// exclusion switch off and no user agent override.
// `archive_url` falls back to this value (via `unwrap_or_default`) when a
// domain has no entry in the configuration.
// NOTE(review): the struct's bool fields show no `#[serde(default)]` in the
// visible definition, so this impl does not appear to be used while
// deserializing; config entries that omit some flags may fail to parse — confirm.
impl Default for DomainConfig {
fn default() -> Self {
Self {
domain: String::new(),
blacklist_paths: None,
no_audio: false,
no_video: false,
no_image: false,
no_css: false,
no_javascript: false,
no_fonts: false,
no_frames: false,
user_agent: None,
}
}
} }
impl Default for Config { impl Default for Config {