refactor
Some checks failed
ci/woodpecker/push/build Pipeline failed

This commit is contained in:
JMARyA 2024-12-30 09:57:42 +01:00
parent 9b96a21906
commit 654d4b9cba
Signed by: jmarya
GPG key ID: 901B2ADDF27C2263
7 changed files with 222 additions and 105 deletions

5
Cargo.lock generated
View file

@ -3384,22 +3384,17 @@ dependencies = [
"chrono",
"env_logger",
"futures",
"hex",
"log",
"maud",
"rayon",
"regex",
"reqwest 0.12.11",
"ring 0.16.20",
"rocket",
"rocket_cors",
"serde",
"serde_json",
"sqlx",
"tokio",
"url",
"uuid",
"walkdir",
]
[[package]]

View file

@ -5,16 +5,11 @@ edition = "2021"
[dependencies]
env_logger = "0.10.0"
hex = "0.4.3"
rayon = "1.7.0"
regex = "1.9.5"
ring = "0.16.20"
walkdir = "2.4.0"
chrono = { version = "0.4.38", features = ["serde"] }
futures = "0.3.30"
log = "0.4.20"
rocket = { version = "0.5.1", features = ["json"] }
rocket_cors = "0.6.0"
serde = { version = "1.0.195", features = ["derive"] }
serde_json = "1.0.111"
tokio = { version = "1.35.1", features = ["full"] }

View file

@ -3,17 +3,16 @@ use std::path::PathBuf;
use based::request::RequestContext;
use maud::html;
use crate::{blacklist::check_blacklist, favicon::download_fav_for, pages::render_page};
use crate::{blacklist::check_blacklist, favicon::download_fav_for, pages::component::render_page};
/// Read directory entries into `Vec<String>`
pub fn read_dir(dir: &PathBuf) -> Vec<String> {
let mut list = Vec::new();
if let Ok(entries) = std::fs::read_dir(dir) {
for entry in entries {
if let Ok(entry) = entry {
if let Some(file_name) = entry.file_name().to_str() {
list.push(file_name.to_string());
}
for entry in entries.flatten() {
if let Some(file_name) = entry.file_name().to_str() {
list.push(file_name.to_string());
}
}
}
@ -21,6 +20,7 @@ pub fn read_dir(dir: &PathBuf) -> Vec<String> {
list
}
/// Rewrite all URLs in `input` to the format `/s/<domain>/<path..>`
fn internalize_urls(input: &str) -> String {
let url_pattern = r"https?://([a-zA-Z0-9.-]+)(/[\w./-]*)";
let re = regex::Regex::new(url_pattern).unwrap();
@ -35,17 +35,30 @@ fn internalize_urls(input: &str) -> String {
.to_string()
}
/// Represents a directory containing archived websites.
#[derive(Debug, Clone)]
pub struct WebsiteArchive {
/// Root directory of the archive; a domain's data lives in the subdirectory named after it.
pub dir: PathBuf,
}
/// Represents a domain within the website archive
pub struct Domain {
/// Domain name
pub name: String,
/// Directory holding this domain's archived data.
///
/// When the domain is obtained through `WebsiteArchive::get_domain`, this is the
/// archive directory joined with the domain name.
dir: PathBuf,
}
impl Domain {
/// Creates a new `Domain` instance.
///
/// If the domain name is not blacklisted, a directory is created.
///
/// # Parameters
/// - `name`: The name of the domain.
/// - `dir`: The directory path for the domain.
///
/// # Returns
/// A new `Domain` instance.
pub fn new(name: &str, dir: PathBuf) -> Self {
if !check_blacklist(name) {
std::fs::create_dir_all(&dir).unwrap();
@ -56,10 +69,26 @@ impl Domain {
}
}
/// Builds the [`Document`] that corresponds to `path` inside this domain.
///
/// The document is created with this domain's name and the parent of the
/// domain directory as its base directory.
///
/// # Parameters
/// - `path`: The path to resolve within the domain.
///
/// # Returns
/// A `Document` instance for the given path.
///
/// # Panics
/// Panics if the domain directory has no parent directory.
pub fn path(&self, path: &str) -> Document {
    // The document expects the archive root, which is one level above the domain directory.
    let archive_root = self.dir.parent().unwrap().to_path_buf();
    Document::new(&self.name, path, archive_root)
}
/// Retrieves entries and metadata for a given path within the domain.
///
/// # Parameters
/// - `path`: The path to inspect.
///
/// # Returns
/// A tuple containing:
/// - A vector of `PathEntry` instances representing the contents of the path.
/// - A boolean indicating whether the path is itself a `Document`
pub fn paths(&self, path: &str) -> (Vec<PathEntry>, bool) {
let mut base_path = self.dir.clone();
@ -87,6 +116,7 @@ impl Domain {
}
}
/// Represents an entry within a domain's path, containing its name and URL path.
///
/// NOTE(review): presumably field `0` is the entry name and field `1` is the URL path;
/// confirm against the accessor methods in the `impl` block.
pub struct PathEntry(String, String);
impl PathEntry {
@ -99,13 +129,25 @@ impl PathEntry {
}
}
/// Represents a document within a domain
pub struct Document {
/// The domain associated with the document.
pub domain: String,
/// The path of the document within the domain.
pub path: String,
/// Base directory of the archive storage; the document directory is resolved
/// starting from `base_dir` joined with `domain`.
base_dir: PathBuf,
}
impl Document {
/// Creates a new `Document` instance.
///
/// # Parameters
/// - `domain`: The domain to which the document belongs.
/// - `path`: The path of the document within the domain.
/// - `base_dir`: The base directory of the archive storage.
///
/// # Returns
/// A new `Document` instance.
pub fn new(domain: &str, path: &str, base_dir: PathBuf) -> Self {
Self {
domain: domain.to_string(),
@ -114,10 +156,15 @@ impl Document {
}
}
pub fn url(&self) -> String {
format!("/s/{}/{}", self.domain, self.path)
}
/// Renders the document, returning its content as a string.
///
/// If the environment variable `$ROUTE_INTERNAL` is set to `true`, all links will be rewritten to point to internal archived routes.
///
/// # Parameters
/// - `version`: An optional version of the document to render in the format `YYYY-MM-DD`.
///
/// # Returns
/// An `Option` containing the rendered content as a string, or `None` if nothing could be rendered.
pub async fn render_local(&self, version: Option<String>) -> Option<String> {
if check_blacklist(&self.domain) {
let content = html! {
@ -146,6 +193,10 @@ impl Document {
}
}
/// Determines the directory where the document is stored.
///
/// # Returns
/// A `PathBuf` representing the document directory.
pub fn doc_dir(&self) -> PathBuf {
let mut file_path = self.base_dir.join(&self.domain);
@ -156,6 +207,10 @@ impl Document {
file_path
}
/// Retrieves available versions of the document.
///
/// # Returns
/// A vector of strings representing the available versions of the document, sorted in descending order.
pub fn versions(&self) -> Vec<String> {
let mut res: Vec<String> = read_dir(&self.doc_dir())
.into_iter()
@ -168,21 +223,46 @@ impl Document {
}
impl WebsiteArchive {
/// Constructs a `WebsiteArchive` rooted at `dir`.
///
/// Only the path is stored; nothing is read from or written to the file system here.
///
/// # Parameters
/// - `dir`: The directory path where the archive is stored.
///
/// # Returns
/// A new `WebsiteArchive` instance.
pub fn new(dir: &str) -> Self {
    let dir = PathBuf::from(dir);
    Self { dir }
}
/// Lists the domains stored in the archive.
///
/// The names are the entries of the archive directory as returned by `read_dir`.
///
/// # Returns
/// A vector of domain names as strings.
pub fn domains(&self) -> Vec<String> {
    let archive_root = &self.dir;
    read_dir(archive_root)
}
/// Returns the `Domain` for the given domain name.
///
/// The domain directory is the archive directory joined with `domain`. The value is
/// built with `Domain::new`, which creates that directory when the name is not
/// blacklisted.
///
/// # Parameters
/// - `domain`: The name of the domain to retrieve.
///
/// # Returns
/// A `Domain` instance corresponding to the specified domain.
pub fn get_domain(&self, domain: &str) -> Domain {
    let domain_dir = self.dir.join(domain);
    Domain::new(domain, domain_dir)
}
/// Archive a URL
/// Archives a URL by downloading and storing its content.
///
/// If the URL does not pass the blacklist check, it will not be archived.
///
/// # Parameters
/// - `url`: The URL to archive.
///
/// This function downloads the content of the URL, processes it, and saves it to the archive.
pub async fn archive_url(&self, url: &str) {
let parsed_url = url::Url::parse(url).unwrap();
@ -197,9 +277,7 @@ impl WebsiteArchive {
let mut folder_name = self.dir.join(domain);
if !std::fs::exists(&folder_name).unwrap() {
download_fav_for(domain).await;
}
download_fav_for(domain).await;
for paths in path.split('/') {
if !paths.is_empty() {

View file

@ -1,3 +1,7 @@
/// Checks if a domain is present in the blacklist of unwanted domains.
///
/// This function checks the `$BLACKLIST_DOMAINS` environment variable for a comma-separated list of regular expressions to match against.
/// If a match is found, it immediately returns `true`. Otherwise, it returns `false`.
pub fn check_blacklist(domain: &str) -> bool {
let blacklist_raw = std::env::var("BLACKLIST_DOMAINS").unwrap_or_default();

View file

@ -1,3 +1,12 @@
/// Downloads a favicon for the given domain.
///
/// # Parameters
///
/// * `domain`: The domain for which to download the favicon.
///
/// # Returns
///
/// A `Vec<u8>` containing the favicon data, or `None` if an error occurred.
pub async fn download_favicon(domain: &str) -> Option<Vec<u8>> {
let mut favicon_url = url::Url::parse(&format!("https://{}", domain)).ok()?;
favicon_url.set_path("/favicon.ico");
@ -15,6 +24,9 @@ pub async fn download_favicon(domain: &str) -> Option<Vec<u8>> {
Some(favicon_data)
}
/// Downloads a favicon for `site` and stores it.
///
/// This will not download a favicon if it is already present.
pub async fn download_fav_for(site: &str) {
if let Some(fav) = download_favicon(site).await {
let fav_path = std::path::Path::new("./favicon").join(site);

88
src/pages/component.rs Normal file
View file

@ -0,0 +1,88 @@
use based::{
page::Shell,
request::{RequestContext, StringResponse},
};
use maud::{html, PreEscaped};
/// Renders a small right-pointing arrow as inline SVG.
///
/// # Parameters
/// - `color`: Color name interpolated into the CSS class `text-{color}-500`.
///
/// # Returns
/// A `PreEscaped<String>` containing the SVG markup for the arrow icon.
pub fn arrow_icon(color: &str) -> PreEscaped<String> {
    // Build the class list up front so the markup below stays readable.
    let classes = format!("w-5 h-5 text-{color}-500");
    html! {
        svg class=(classes) xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke="currentColor" {
            path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M9 5l7 7-7 7" {};
        };
    }
}
/// Renders the ` / ` separator shown between path segments.
///
/// # Returns
/// A `PreEscaped<String>` containing a bold, gray paragraph element with the slash.
pub fn slash_seperator() -> PreEscaped<String> {
    let glyph = " / ";
    html! {
        p class="font-bold p-2 text-gray-400" { (glyph) };
    }
}
/// Renders a link to one segment of a path within a domain.
///
/// The link target is `/d/<domain>/<segments>`, where `<segments>` are the first
/// `index + 1` entries of `path_seperations` joined with `/`.
///
/// # Parameters
/// - `path`: The path segment used as the link text.
/// - `index`: The index of the current path segment in the hierarchy.
/// - `path_seperations`: All path segments in the hierarchy.
/// - `domain`: The domain to which the path belongs.
///
/// # Returns
/// A `PreEscaped<String>` containing the HTML markup for the hyperlink.
pub fn gen_path_link(
    path: &str,
    index: usize,
    path_seperations: &[&str],
    domain: &str,
) -> PreEscaped<String> {
    // Every segment up to and including the current one forms the link target.
    let prefix = path_seperations
        .iter()
        .take(index + 1)
        .copied()
        .collect::<Vec<&str>>()
        .join("/");
    let href = format!("/d/{domain}/{prefix}");

    html! {
        a href=(href) { (path) }
    }
}
/// Renders a breadcrumb-style header for a path within a domain.
///
/// Each segment becomes a link produced by `gen_path_link`; a slash separator is
/// placed after every segment except the last.
///
/// # Parameters
/// - `path_seperations`: The path segments, ordered from the top of the hierarchy down.
/// - `domain`: The domain to which the path belongs.
///
/// # Returns
/// A `PreEscaped<String>` containing the HTML markup for the path header.
pub fn gen_path_header(path_seperations: Vec<&str>, domain: &str) -> PreEscaped<String> {
    let segment_count = path_seperations.len();

    html! {
        @for (index, segment) in path_seperations.iter().enumerate() {
            (gen_path_link(segment, index, &path_seperations, domain))
            // No separator after the final segment.
            @if index + 1 < segment_count {
                (slash_seperator())
            };
        };
    }
}
/// Wraps `content` in the site's page shell and renders it as a response.
///
/// Delegates to `based::page::render_page` with the title `"Website Archive"` and a
/// `Shell` that loads the Tailwind CDN script and sets a viewport meta tag.
///
/// # Parameters
/// - `content`: The already rendered page content.
/// - `ctx`: The request context forwarded to `based::page::render_page`.
///
/// # Returns
/// The `StringResponse` produced by `based::page::render_page`.
pub async fn render_page(content: PreEscaped<String>, ctx: RequestContext) -> StringResponse {
    // NOTE(review): presumably the three `Shell::new` arguments are head markup, extra
    // markup, and the body class list; confirm against the `based` crate.
    let head_markup = html! {
        script src="https://cdn.tailwindcss.com" {};
        meta name="viewport" content="width=device-width, initial-scale=1.0" {};
    };
    let extra_markup = html! {};
    let shell_classes = "bg-zinc-950 text-white min-h-screen flex pt-8 justify-center".to_string();
    let shell = Shell::new(head_markup, extra_markup, Some(shell_classes));

    based::page::render_page(content, "Website Archive", ctx, &shell).await
}

View file

@ -1,31 +1,15 @@
use std::{io::Read, path::PathBuf};
use based::{
page::Shell,
request::{assets::DataResponse, RequestContext, StringResponse},
};
use maud::{html, PreEscaped};
use based::request::{assets::DataResponse, RequestContext, StringResponse};
use maud::html;
use rocket::{get, State};
pub mod component;
use component::*;
use crate::archive::WebsiteArchive;
pub async fn render_page(content: PreEscaped<String>, ctx: RequestContext) -> StringResponse {
based::page::render_page(
content,
"Website Archive",
ctx,
&Shell::new(
html! {
script src="https://cdn.tailwindcss.com" {};
meta name="viewport" content="width=device-width, initial-scale=1.0" {};
},
html! {},
Some("bg-zinc-950 text-white min-h-screen flex pt-8 justify-center".to_string()),
),
)
.await
}
/// Get the favicon of a domain
#[get("/favicon/<domain>")]
pub async fn favicon_route(domain: &str) -> Option<DataResponse> {
let mut buf = Vec::new();
@ -41,6 +25,7 @@ pub async fn favicon_route(domain: &str) -> Option<DataResponse> {
))
}
/// Websites Overview
#[get("/")]
pub async fn index(ctx: RequestContext, arc: &State<WebsiteArchive>) -> StringResponse {
let websites = arc.domains();
@ -48,7 +33,7 @@ pub async fn index(ctx: RequestContext, arc: &State<WebsiteArchive>) -> StringRe
let content = html! {
div class="container mx-auto p-4" {
h1 class="text-5xl font-bold text-center mb-10" { "Websites" };
div class="grid grid-cols-2 sm:grid-cols-3 lg:grid-cols-4 gap-6 w-screen" {
div class="grid grid-cols-2 sm:grid-cols-3 lg:grid-cols-5 xl:grid-cols-6 2xl:grid-cols-8 gap-6" {
@for site in websites {
a href=(format!("/d/{site}")) class="bg-neutral-900 shadow-md rounded-lg hover:bg-neutral-800 bg-gray-1 hover:cursor-pointer transition-all duration-300 flex flex-col items-center justify-center aspect-square max-w-60" {
@ -65,43 +50,7 @@ pub async fn index(ctx: RequestContext, arc: &State<WebsiteArchive>) -> StringRe
render_page(content, ctx).await
}
pub fn arrow_icon(color: &str) -> PreEscaped<String> {
html! {
svg class=(format!("w-5 h-5 text-{color}-500")) xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke="currentColor" {
path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M9 5l7 7-7 7" {};
};
}
}
pub fn slash_seperator() -> PreEscaped<String> {
html! {
p class="font-bold p-2 text-gray-400" { " / " };
}
}
pub fn gen_path_link(
path: &str,
index: usize,
path_seperations: &[&str],
domain: &str,
) -> PreEscaped<String> {
let upto: Vec<&str> = path_seperations.iter().take(index + 1).cloned().collect();
html! {
a href=(format!("/d/{}/{}", domain, upto.join("/"))) { (path)}
}
}
pub fn gen_path_header(path_seperations: Vec<&str>, domain: &str) -> PreEscaped<String> {
html! {
@for (index, path) in path_seperations.iter().enumerate() {
(gen_path_link(path, index, &path_seperations, domain))
@if index < path_seperations.len()-1 {
(slash_seperator())
};
};
}
}
/// Overview on <domain> / <path>
#[get("/d/<domain>/<paths..>")]
pub async fn domain_info_route(
ctx: RequestContext,
@ -123,6 +72,8 @@ pub async fn domain_info_route(
let (path_entries, is_doc) = domain.paths(paths.to_str().unwrap());
let path_seperations: Vec<&str> = paths.to_str().unwrap().split('/').collect();
// TODO: Show domains being linked on the page
let content = html! {
h2 class="text-xl font-bold mb-4 flex items-center" {
img class="p-2" src=(format!("/favicon/{}", &domain.name)) {};
@ -172,6 +123,7 @@ pub async fn domain_info_route(
render_page(content, ctx).await
}
/// Return archived version of `domain` / `path` at `time`
#[get("/s/<domain>/<path..>?<time>")]
pub async fn render_website(
domain: &str,
@ -191,29 +143,22 @@ pub async fn render_website(
"text/html".to_string(),
Some(60 * 60 * 24),
));
} else {
if std::env::var("DOWNLOAD_ON_DEMAND")
.unwrap_or("false".to_string())
.as_str()
== "true"
{
arc.archive_url(&format!("https://{domain}/{}", path.to_str().unwrap()))
.await;
} else if std::env::var("DOWNLOAD_ON_DEMAND")
.unwrap_or("false".to_string())
.as_str()
== "true"
&& time.is_none()
{
arc.archive_url(&format!("https://{domain}/{}", path.to_str().unwrap()))
.await;
let content = document
.render_local(if time.is_some() {
Some(time.unwrap().to_string())
} else {
None
})
.await?;
let content = document.render_local(None).await?;
return Some(DataResponse::new(
content.as_bytes().to_vec(),
"text/html".to_string(),
Some(60 * 60 * 24),
));
}
return Some(DataResponse::new(
content.as_bytes().to_vec(),
"text/html".to_string(),
Some(60 * 60 * 24),
));
}
None