This commit is contained in:
parent
dc10052c16
commit
536f42a4e8
4 changed files with 164 additions and 18 deletions
42
config.toml
42
config.toml
|
@ -11,6 +11,48 @@ BLACKLIST_DOMAINS = [
|
||||||
"^gitlab" # All domains starting with gitlab
|
"^gitlab" # All domains starting with gitlab
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Domain configuration (Example)
|
||||||
|
[[websites.domains]]
|
||||||
|
# The domain the config applies to
|
||||||
|
domain = "example.com"
|
||||||
|
|
||||||
|
# Blacklisted paths (regexes matched against the URL path)
|
||||||
|
blacklist_paths = ["/.*"]
|
||||||
|
|
||||||
|
# Exclude <audio> tags
|
||||||
|
no_audio = false
|
||||||
|
|
||||||
|
# Exclude <video> tags
|
||||||
|
no_video = false
|
||||||
|
|
||||||
|
# Exclude <img> tags
|
||||||
|
no_image = false
|
||||||
|
|
||||||
|
# Exclude CSS
|
||||||
|
no_css = false
|
||||||
|
|
||||||
|
# Exclude JavaScript
|
||||||
|
no_javascript = false
|
||||||
|
|
||||||
|
# Exclude fonts
|
||||||
|
no_fonts = false
|
||||||
|
|
||||||
|
# Exclude iframes
|
||||||
|
no_frames = false
|
||||||
|
|
||||||
|
# User Agent
|
||||||
|
user_agent = "Safari"
|
||||||
|
|
||||||
[ai]
|
[ai]
|
||||||
# Ollama URL (Enables vector search)
|
# Ollama URL (Enables vector search)
|
||||||
OLLAMA_URL="127.0.0.1:11434"
|
OLLAMA_URL="127.0.0.1:11434"
|
||||||
|
|
||||||
|
# --- Website Config
|
||||||
|
|
||||||
|
[[websites.domains]]
|
||||||
|
domain = "developer.mozilla.org"
|
||||||
|
no_javascript = true
|
||||||
|
|
||||||
|
[[websites.domains]]
|
||||||
|
domain = "github.com"
|
||||||
|
no_javascript = true
|
||||||
|
|
|
@ -3,7 +3,12 @@ use std::{collections::HashSet, io::Read, path::PathBuf};
|
||||||
use based::{request::RequestContext, result::LogAndIgnore};
|
use based::{request::RequestContext, result::LogAndIgnore};
|
||||||
use maud::html;
|
use maud::html;
|
||||||
|
|
||||||
use crate::{blacklist::check_blacklist, conf::get_config, favicon::download_fav_for, render_page};
|
use crate::{
|
||||||
|
blacklist::{check_blacklist, check_blacklist_path},
|
||||||
|
conf::get_config,
|
||||||
|
favicon::download_fav_for,
|
||||||
|
render_page,
|
||||||
|
};
|
||||||
|
|
||||||
/// Read directory entries into `Vec<String>`
|
/// Read directory entries into `Vec<String>`
|
||||||
pub fn read_dir(dir: &PathBuf) -> Vec<String> {
|
pub fn read_dir(dir: &PathBuf) -> Vec<String> {
|
||||||
|
@ -340,7 +345,6 @@ impl WebsiteArchive {
|
||||||
///
|
///
|
||||||
/// This function downloads the content of the URL, processes it, and saves it to the archive.
|
/// This function downloads the content of the URL, processes it, and saves it to the archive.
|
||||||
pub async fn archive_url(&self, url: &str) {
|
pub async fn archive_url(&self, url: &str) {
|
||||||
// TODO : refactor
|
|
||||||
let parsed_url = url::Url::parse(url).unwrap();
|
let parsed_url = url::Url::parse(url).unwrap();
|
||||||
|
|
||||||
let domain = parsed_url.domain().unwrap().trim_start_matches("www.");
|
let domain = parsed_url.domain().unwrap().trim_start_matches("www.");
|
||||||
|
@ -352,6 +356,10 @@ impl WebsiteArchive {
|
||||||
|
|
||||||
let path = parsed_url.path();
|
let path = parsed_url.path();
|
||||||
|
|
||||||
|
if check_blacklist_path(domain, path) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
let mut folder_name = self.dir.join(domain);
|
let mut folder_name = self.dir.join(domain);
|
||||||
|
|
||||||
download_fav_for(domain).await;
|
download_fav_for(domain).await;
|
||||||
|
@ -369,21 +377,56 @@ impl WebsiteArchive {
|
||||||
|
|
||||||
log::info!("Archiving {url} to {}", filename.to_str().unwrap());
|
log::info!("Archiving {url} to {}", filename.to_str().unwrap());
|
||||||
|
|
||||||
run_command(&[
|
let conf = get_config()
|
||||||
"monolith",
|
.get_domain_config(domain)
|
||||||
"-I",
|
.cloned()
|
||||||
"-o",
|
.unwrap_or_default();
|
||||||
filename.to_str().unwrap(),
|
|
||||||
&format!("https://{}/{}", domain, path),
|
let mut cmd = vec!["monolith", "--isolate", "-o", filename.to_str().unwrap()];
|
||||||
]);
|
|
||||||
|
if conf.no_audio {
|
||||||
|
cmd.push("--no-audio");
|
||||||
|
}
|
||||||
|
|
||||||
|
if conf.no_css {
|
||||||
|
cmd.push("--no-css");
|
||||||
|
}
|
||||||
|
|
||||||
|
if conf.no_frames {
|
||||||
|
cmd.push("--no-frames");
|
||||||
|
}
|
||||||
|
|
||||||
|
// `no_fonts` must map to monolith's font switch; pushing `--no-frames` here
// would strip iframes instead and leave fonts embedded.
if conf.no_fonts {
    cmd.push("--no-fonts");
}
|
||||||
|
|
||||||
|
if conf.no_image {
|
||||||
|
cmd.push("--no-images");
|
||||||
|
}
|
||||||
|
|
||||||
|
if conf.no_javascript {
|
||||||
|
cmd.push("--no-js");
|
||||||
|
cmd.push("--unwrap-noscript");
|
||||||
|
}
|
||||||
|
|
||||||
|
if conf.no_video {
|
||||||
|
cmd.push("--no-video");
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(ua) = &conf.user_agent {
|
||||||
|
cmd.push("--user-agent");
|
||||||
|
cmd.push(ua.as_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut url = url::Url::parse(&format!("https://{domain}")).unwrap();
|
||||||
|
url = url.join(path).unwrap();
|
||||||
|
let url = url.to_string();
|
||||||
|
cmd.push(&url);
|
||||||
|
|
||||||
|
run_command(&cmd);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// full text search
|
|
||||||
// add new sites?
|
|
||||||
// transparent auto page downloading
|
|
||||||
// redownload after threshold
|
|
||||||
|
|
||||||
fn run_command(cmd: &[&str]) {
|
fn run_command(cmd: &[&str]) {
|
||||||
let mut cmd_setup = std::process::Command::new(cmd[0]);
|
let mut cmd_setup = std::process::Command::new(cmd[0]);
|
||||||
let cmd_setup = cmd_setup
|
let cmd_setup = cmd_setup
|
||||||
|
|
|
@ -7,13 +7,37 @@ pub fn check_blacklist(domain: &str) -> bool {
|
||||||
let conf = get_config();
|
let conf = get_config();
|
||||||
let conf = conf.websites.as_ref();
|
let conf = conf.websites.as_ref();
|
||||||
|
|
||||||
|
// TODO : Block IPs
|
||||||
|
// Test SSRF
|
||||||
|
|
||||||
let blacklisted_domains = conf
|
let blacklisted_domains = conf
|
||||||
.map(|x| x.BLACKLIST_DOMAINS.as_ref())
|
.map(|x| x.BLACKLIST_DOMAINS.as_ref())
|
||||||
.unwrap_or_default();
|
.unwrap_or_default();
|
||||||
|
|
||||||
for domain_regex in blacklisted_domains.unwrap_or(&Vec::new()) {
|
check_regex(domain, blacklisted_domains.unwrap_or(&Vec::new()))
|
||||||
let rgx = regex::Regex::new(domain_regex).unwrap();
|
}
|
||||||
if rgx.is_match(domain) {
|
|
||||||
|
pub fn check_blacklist_path(domain: &str, path: &str) -> bool {
|
||||||
|
let conf = get_config();
|
||||||
|
let conf = conf.websites.as_ref();
|
||||||
|
|
||||||
|
if let Some(website) = conf {
|
||||||
|
let empty = Vec::new();
|
||||||
|
let domain_conf = website.domains.as_ref().unwrap_or(&empty);
|
||||||
|
if let Some(domain_conf) = domain_conf.iter().find(|x| x.domain == domain) {
|
||||||
|
let empty = Vec::new();
|
||||||
|
let blacklist = domain_conf.blacklist_paths.as_ref().unwrap_or(&empty);
|
||||||
|
return check_regex(path, blacklist);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
false
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn check_regex(input: &str, regexes: &Vec<String>) -> bool {
|
||||||
|
for regex in regexes {
|
||||||
|
let rgx = regex::Regex::new(regex).unwrap();
|
||||||
|
if rgx.is_match(input) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
39
src/conf.rs
39
src/conf.rs
|
@ -38,6 +38,19 @@ pub struct Config {
|
||||||
pub websites: Option<WebsiteConfig>,
|
pub websites: Option<WebsiteConfig>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl Config {
|
||||||
|
pub fn get_domain_config(&self, domain: &str) -> Option<&DomainConfig> {
|
||||||
|
if let Some(websites) = &self.websites {
|
||||||
|
if let Some(domains) = &websites.domains {
|
||||||
|
let domain = domains.iter().find(|x| x.domain == domain);
|
||||||
|
return domain;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[allow(non_snake_case)]
|
#[allow(non_snake_case)]
|
||||||
#[derive(Debug, Clone, Deserialize)]
|
#[derive(Debug, Clone, Deserialize)]
|
||||||
pub struct AIConfig {
|
pub struct AIConfig {
|
||||||
|
@ -53,9 +66,33 @@ pub struct WebsiteConfig {
|
||||||
|
|
||||||
#[derive(Debug, Clone, Deserialize)]
|
#[derive(Debug, Clone, Deserialize)]
|
||||||
pub struct DomainConfig {
|
pub struct DomainConfig {
|
||||||
// TODO : Domain specific config
|
pub domain: String,
|
||||||
pub blacklist_paths: Option<Vec<String>>,
|
pub blacklist_paths: Option<Vec<String>>,
|
||||||
|
pub no_audio: bool,
|
||||||
|
pub no_video: bool,
|
||||||
|
pub no_image: bool,
|
||||||
|
pub no_css: bool,
|
||||||
pub no_javascript: bool,
|
pub no_javascript: bool,
|
||||||
|
pub no_fonts: bool,
|
||||||
|
pub no_frames: bool,
|
||||||
|
pub user_agent: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for DomainConfig {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self {
|
||||||
|
domain: String::new(),
|
||||||
|
blacklist_paths: None,
|
||||||
|
no_audio: false,
|
||||||
|
no_video: false,
|
||||||
|
no_image: false,
|
||||||
|
no_css: false,
|
||||||
|
no_javascript: false,
|
||||||
|
no_fonts: false,
|
||||||
|
no_frames: false,
|
||||||
|
user_agent: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for Config {
|
impl Default for Config {
|
||||||
|
|
Loading…
Add table
Reference in a new issue