init
This commit is contained in:
commit
319b663694
11 changed files with 3722 additions and 0 deletions
2
.dockerignore
Normal file
2
.dockerignore
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
websites
|
||||||
|
target
|
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
/target
|
||||||
|
/websites
|
15
.woodpecker/build.yml
Normal file
15
.woodpecker/build.yml
Normal file
|
@ -0,0 +1,15 @@
|
||||||
|
when:
|
||||||
|
- event: push
|
||||||
|
branch: main
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: build
|
||||||
|
image: woodpeckerci/plugin-docker-buildx
|
||||||
|
settings:
|
||||||
|
platforms: linux/amd64,linux/arm64
|
||||||
|
repo: git.hydrar.de/jmarya/webarc
|
||||||
|
registry: git.hydrar.de
|
||||||
|
tags: latest
|
||||||
|
username: jmarya
|
||||||
|
password:
|
||||||
|
from_secret: registry_token
|
3458
Cargo.lock
generated
Normal file
3458
Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load diff
25
Cargo.toml
Normal file
25
Cargo.toml
Normal file
|
@ -0,0 +1,25 @@
|
||||||
|
[package]
|
||||||
|
name = "webarc"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2021"
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
env_logger = "0.10.0"
|
||||||
|
hex = "0.4.3"
|
||||||
|
rayon = "1.7.0"
|
||||||
|
regex = "1.9.5"
|
||||||
|
ring = "0.16.20"
|
||||||
|
walkdir = "2.4.0"
|
||||||
|
chrono = { version = "0.4.38", features = ["serde"] }
|
||||||
|
futures = "0.3.30"
|
||||||
|
log = "0.4.20"
|
||||||
|
rocket = { version = "0.5.1", features = ["json"] }
|
||||||
|
rocket_cors = "0.6.0"
|
||||||
|
serde = { version = "1.0.195", features = ["derive"] }
|
||||||
|
serde_json = "1.0.111"
|
||||||
|
tokio = { version = "1.35.1", features = ["full"] }
|
||||||
|
uuid = { version = "1.8.0", features = ["v4", "serde"] }
|
||||||
|
sqlx = { version = "0.8", features = ["postgres", "runtime-tokio-native-tls", "derive", "uuid", "chrono", "json"] }
|
||||||
|
maud = "0.26.0"
|
||||||
|
based = { git = "https://git.hydrar.de/jmarya/based", features = [] }
|
||||||
|
url = "2.5.4"
|
18
Dockerfile
Normal file
18
Dockerfile
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
FROM rust:buster as builder
|
||||||
|
|
||||||
|
COPY . /app
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
RUN rustup default nightly
|
||||||
|
RUN cargo build --release
|
||||||
|
|
||||||
|
FROM debian:buster
|
||||||
|
|
||||||
|
RUN apt-get update && apt-get upgrade -y
|
||||||
|
# NOTE(review): `monolith` is not packaged in the Debian buster apt repos —
# this install will likely fail; consider `cargo install monolith` in the
# builder stage and copying the binary instead. TODO confirm.
RUN apt-get install -y ca-certificates openssl monolith
|
||||||
|
|
||||||
|
COPY --from=builder /app/target/release/webarc /webarc
|
||||||
|
|
||||||
|
WORKDIR /
|
||||||
|
|
||||||
|
CMD ["/webarc"]
|
6
README.md
Normal file
6
README.md
Normal file
|
@ -0,0 +1,6 @@
|
||||||
|
# WebArc
|
||||||
|
|
||||||
|
- Website Archive
|
||||||
|
- LLM AI Integration (VectorDB)?
|
||||||
|
- Regex Search
|
||||||
|
|
25
docker-compose.yml
Normal file
25
docker-compose.yml
Normal file
|
@ -0,0 +1,25 @@
|
||||||
|
services:
|
||||||
|
webarc:
|
||||||
|
build: .
|
||||||
|
ports:
|
||||||
|
- "8080:8000"
|
||||||
|
depends_on:
|
||||||
|
- postgres
|
||||||
|
volumes:
|
||||||
|
- ./websites:/websites
|
||||||
|
environment:
|
||||||
|
- "DATABASE_URL=postgres://user:pass@postgres/webarc"
|
||||||
|
- "RUST_LOG=info"
|
||||||
|
- "ROCKET_ADDRESS=0.0.0.0"
|
||||||
|
|
||||||
|
postgres:
|
||||||
|
image: timescale/timescaledb:latest-pg16
|
||||||
|
restart: always
|
||||||
|
ports:
|
||||||
|
- 5432:5432
|
||||||
|
volumes:
|
||||||
|
- ./db:/var/lib/postgresql/data/
|
||||||
|
environment:
|
||||||
|
- POSTGRES_USER=user
|
||||||
|
- POSTGRES_PASSWORD=pass
|
||||||
|
- POSTGRES_DB=webarc
|
123
src/archive.rs
Normal file
123
src/archive.rs
Normal file
|
@ -0,0 +1,123 @@
|
||||||
|
use std::{fmt::format, fs::read_to_string, path::{Path, PathBuf}};
|
||||||
|
|
||||||
|
/// A website archive rooted at a directory on disk.
///
/// Layout: one subdirectory per domain, then nested directories per URL path
/// segment, with snapshots stored as `index_YYYY-MM-DD.html` files.
pub struct WebsiteArchive {
    /// Root directory of the archive (e.g. `./websites`).
    pub dir: PathBuf
}
|
||||||
|
|
||||||
|
/// A single archived domain inside a [`WebsiteArchive`].
pub struct Domain {
    /// Domain name, e.g. `example.com` (leading `www` stripped by the archiver).
    pub name: String,
    /// Directory holding this domain's archived documents.
    dir: PathBuf
}
|
||||||
|
|
||||||
|
impl Domain {
|
||||||
|
pub fn new(name: &str, dir: PathBuf) -> Self {
|
||||||
|
std::fs::create_dir_all(&dir).unwrap();
|
||||||
|
Self { name: name.to_string(), dir }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn path(&self, path: &str) -> Document {
|
||||||
|
Document::new(&self.name, path, self.dir.parent().unwrap().to_path_buf())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A single archived page (one URL path) of a domain.
pub struct Document {
    /// Owning domain, e.g. `example.com`.
    pub domain: String,
    /// URL path of the document, `/`-separated.
    pub path: String,
    /// Archive root directory; the document's files live under
    /// `base_dir/<domain>/<path segments…>/index_<date>.html`.
    base_dir: PathBuf
}
|
||||||
|
|
||||||
|
impl Document {
|
||||||
|
pub fn new(domain: &str, path: &str, base_dir: PathBuf) -> Self {
|
||||||
|
Self { domain: domain.to_string(), path: path.to_string(), base_dir }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn render_local(&self, version: Option<String>) -> String {
|
||||||
|
let mut file_path = self.base_dir.join(&self.domain);
|
||||||
|
|
||||||
|
for p in self.path.split('/') {
|
||||||
|
file_path = file_path.join(p);
|
||||||
|
}
|
||||||
|
|
||||||
|
let latest_version = if let Some(version) = version {
|
||||||
|
format!("index_{version}.html")
|
||||||
|
} else {
|
||||||
|
let versions = Self::versions(&file_path);
|
||||||
|
versions.first().cloned().unwrap()
|
||||||
|
};
|
||||||
|
|
||||||
|
file_path = file_path.join(latest_version);
|
||||||
|
|
||||||
|
// TODO : Replace links with local ones
|
||||||
|
return std::fs::read_to_string(file_path).unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn versions(path: &PathBuf) -> Vec<String> {
|
||||||
|
let mut version_list = Vec::new();
|
||||||
|
|
||||||
|
if let Ok(entries) = std::fs::read_dir(path) {
|
||||||
|
for entry in entries {
|
||||||
|
if let Ok(entry) = entry {
|
||||||
|
if let Some(file_name) = entry.file_name().to_str() {
|
||||||
|
version_list.push(file_name.to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
version_list
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl WebsiteArchive {
|
||||||
|
pub fn new(dir: &str) -> Self {
|
||||||
|
Self { dir: PathBuf::from(dir) }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn get_domain(&self, domain: &str) -> Domain {
|
||||||
|
Domain::new(domain, self.dir.join(domain))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Archive a URL
|
||||||
|
pub fn archive_url(&self, url: &str) {
|
||||||
|
let parsed_url = url::Url::parse(url).unwrap();
|
||||||
|
|
||||||
|
let domain = parsed_url.domain().unwrap().trim_start_matches("www");
|
||||||
|
let path = parsed_url.path();
|
||||||
|
|
||||||
|
let mut folder_name = self.dir.join(&domain);
|
||||||
|
|
||||||
|
for paths in path.split('/') {
|
||||||
|
if !paths.is_empty() {
|
||||||
|
folder_name = folder_name.join(paths);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::fs::create_dir_all(&folder_name).unwrap();
|
||||||
|
|
||||||
|
let timestamp = chrono::Utc::now().format("%Y-%m-%d").to_string();
|
||||||
|
let filename = folder_name.join(&format!("index_{timestamp}.html"));
|
||||||
|
|
||||||
|
run_command(&vec![
|
||||||
|
"monolith",
|
||||||
|
"-I",
|
||||||
|
"-o",
|
||||||
|
filename.to_str().unwrap(),
|
||||||
|
&format!("https://{}/{}", domain, path)
|
||||||
|
]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// full text search
|
||||||
|
// add new sites?
|
||||||
|
// transparent auto page downloading
|
||||||
|
// redownload after threshold
|
||||||
|
|
||||||
|
|
||||||
|
/// Run an external command (`cmd[0]` is the program, the rest its arguments),
/// inheriting stdio, and wait for it to finish.
///
/// # Panics
/// Panics if `cmd` is empty, the command cannot be spawned, or it exits with
/// a non-zero status.
fn run_command(cmd: &[&str]) {
    assert!(!cmd.is_empty(), "run_command called with empty command");

    // `cmd[0]` is a `&str` (Copy); the old `cmd[0].clone()` just cloned the
    // reference. `&cmd[1..]` replaces the needless `into_iter().skip(1).collect()`.
    let child = std::process::Command::new(cmd[0])
        .args(&cmd[1..])
        .spawn()
        .unwrap();

    let output = child.wait_with_output().unwrap();
    assert!(
        output.status.success(),
        "command {cmd:?} failed with status {:?}",
        output.status
    );
}
|
26
src/main.rs
Normal file
26
src/main.rs
Normal file
|
@ -0,0 +1,26 @@
|
||||||
|
use archive::WebsiteArchive;
|
||||||
|
use based::get_pg;
|
||||||
|
use rocket::routes;
|
||||||
|
|
||||||
|
mod pages;
|
||||||
|
mod archive;
|
||||||
|
|
||||||
|
/// Rocket entry point: builds the server, mounts the page routes and hands the
/// website archive to Rocket as managed state.
#[rocket::launch]
async fn launch() -> _ {
    env_logger::init();

    // NOTE(review): database setup is currently disabled; re-enable together
    // with the migrations when persistence is needed.
    // let pg = get_pg!();
    // sqlx::migrate!("./migrations").run(pg).await.unwrap();

    // Archive root is hard-coded; matches the `./websites:/websites` volume in
    // docker-compose (container workdir is `/`). Consider an env var override.
    let arc = WebsiteArchive::new("./websites");

    rocket::build()
        .mount(
            "/",
            routes![
                pages::index,
                pages::render_website
            ],
        )
        // Makes `&State<WebsiteArchive>` available to route handlers.
        .manage(arc)
}
|
22
src/pages/mod.rs
Normal file
22
src/pages/mod.rs
Normal file
|
@ -0,0 +1,22 @@
|
||||||
|
use std::path::PathBuf;
|
||||||
|
|
||||||
|
use based::request::{respond_html, StringResponse};
|
||||||
|
use maud::html;
|
||||||
|
use rocket::{get, State};
|
||||||
|
|
||||||
|
use crate::archive::WebsiteArchive;
|
||||||
|
|
||||||
|
/// Landing page of the archive.
///
/// # Panics
/// Always — this handler is an unimplemented stub.
#[get("/")]
pub async fn index() -> StringResponse {
    // TODO : websites overview grid
    unimplemented!()
}
|
||||||
|
|
||||||
|
#[get("/s/<domain>/<path..>?<time>")]
|
||||||
|
pub async fn render_website(domain: &str, path: PathBuf, time: Option<&str>, arc: &State<WebsiteArchive>) -> StringResponse {
|
||||||
|
if let Some(time) = time {
|
||||||
|
respond_html(&arc.get_domain(domain).path(path.to_str().unwrap()).render_local(Some(time.to_string())))
|
||||||
|
} else {
|
||||||
|
respond_html(&arc.get_domain(domain).path(path.to_str().unwrap()).render_local(None))
|
||||||
|
}
|
||||||
|
}
|
Loading…
Add table
Reference in a new issue