This commit is contained in:
parent a9f758cd9b
commit 2e5b4fc3d2
9 changed files with 141 additions and 89 deletions
@@ -140,7 +140,8 @@ impl Embedding for Document {
             ver.as_ref().unwrap_or(&latest)
         );

-        let content_html = self.render_local(ver.clone(), shell).await?;
+        let content_html = self.render_local(ver.clone(), shell).await.ok()?;
+        let content_html = String::from_utf8_lossy(&content_html);
         let content = remove_data_urls(&html2md::parse_html(&content_html));

         let mut embeddings = Vec::new();
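Review note: several hunks below move render_local from Option<String> to raw bytes and decode at the edges with String::from_utf8_lossy, as above. A reminder of the std behavior this relies on: invalid UTF-8 sequences decode to U+FFFD instead of failing.

    // std behavior relied on throughout this commit:
    // invalid byte sequences become the replacement character.
    let bytes = b"ok \xFF bytes";
    assert_eq!(String::from_utf8_lossy(bytes), "ok \u{FFFD} bytes");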
@@ -3,9 +3,9 @@ use std::{io::Read, path::PathBuf};
 use based::{request::RequestContext, ui::components::prelude::Shell};
 use maud::html;

-use crate::{blacklist::check_blacklist, conf::get_config, render_page};
+use crate::{blacklist::check_blacklist, render_page};

-use super::{internalize_urls, read_dir};
+use super::read_dir;

 /// Represents a document within a domain
 pub struct Document {
@@ -52,43 +52,39 @@ impl Document {
     ///
     /// # Returns
     /// An `Option` containing the rendered content as a string, or `None` if nothing could be rendered.
-    pub async fn render_local(&self, version: Option<String>, shell: &Shell) -> Option<String> {
+    pub async fn render_local(
+        &self,
+        version: Option<String>,
+        shell: &Shell,
+    ) -> Result<Vec<u8>, String> {
         if check_blacklist(&self.domain) {
             let content = html! {
                 h3 { "This site is blacklisted" };
             };
-            return Some(
-                render_page(content, RequestContext::default(), shell)
-                    .await
-                    .1
-                    .1,
-            );
+            return Err(render_page(content, RequestContext::default(), shell)
+                .await
+                .1
+                .1);
         }

         let mut file_path = self.doc_dir();

         let latest_version = if let Some(version) = version {
-            format!("index_{version}.html")
+            format!("index_{version}")
         } else {
             let versions = self.versions();
-            let version = versions.first().cloned()?;
-            format!("index_{version}.html")
+            let version = versions.first().cloned().ok_or(String::new())?;
+            format!("index_{version}")
         };

         file_path = file_path.join(latest_version);

         let mut buf = Vec::new();
         std::fs::File::open(file_path)
-            .ok()?
+            .map_err(|_| String::new())?
             .read_to_end(&mut buf)
             .unwrap();
-        let content = String::from_utf8_lossy(&buf);

-        if get_config().ROUTE_INTERNAL {
-            Some(internalize_urls(&content, &self.domain))
-        } else {
-            Some(content.to_string())
-        }
+        Ok(buf)
     }

     /// Determines the directory where the document is stored.
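Review note: this signature change breaks every call site; Err now carries a rendered error page instead of the old None. A minimal sketch of the new calling convention (hypothetical caller; doc and shell assumed in scope):

    // Hypothetical call site migrating from `?` on Option<String>
    // to handling Result<Vec<u8>, String>.
    let bytes: Vec<u8> = match doc.render_local(None, &shell).await {
        Ok(bytes) => bytes,
        // Err holds pre-rendered HTML (e.g. the blacklist page above).
        Err(error_page) => error_page.into_bytes(),
    };
    // The bytes are no longer assumed to be UTF-8 HTML; decode lossily
    // when text is needed, as the CLI and web routes below do.
    let text = String::from_utf8_lossy(&bytes);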
@@ -127,12 +123,8 @@ impl Document {
         let mut res: Vec<String> = read_dir(&self.doc_dir())
             .into_iter()
             .filter_map(|x| {
-                if x.starts_with("index_") && x.ends_with(".html") {
-                    return Some(
-                        x.trim_start_matches("index_")
-                            .trim_end_matches(".html")
-                            .to_string(),
-                    );
+                if x.starts_with("index_") {
+                    return Some(x.trim_start_matches("index_").to_string());
                 }

                 None
@@ -98,7 +98,7 @@ impl Domain {
             .filter(|x| !x.is_empty())
             .collect::<Vec<&str>>()
             .join("/");
-        if entry.starts_with("index_") && entry.ends_with(".html") {
+        if entry.starts_with("index_") {
             is_doc = true;
             continue;
         }
@@ -32,7 +32,7 @@ pub fn read_dir(dir: &PathBuf) -> Vec<String> {
 }

 /// Rewrite all URLs in `input` to the format `/s/<domain>/<path..>`
-fn internalize_urls(input: &str, base: &str) -> String {
+pub fn internalize_urls(input: &str, base: &str) -> String {
     // todo : fix regex, domains without path are not captured
     let url_pattern = r#"(\ |"|')(?:(<?)(https?:\/\/([a-zA-Z0-9.-]+))?(\/[\w./-]*))"#;
     let re = regex::Regex::new(url_pattern).unwrap();
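Review note: internalize_urls is made public so the web routes can rewrite URLs after mime sniffing instead of inside render_local. A hedged usage sketch (output shape per the doc comment above; exact captures depend on the regex, and per the todo, bare domains without a path are not rewritten):

    // Illustrative only: rewrite absolute URLs to the archive's /s/ scheme.
    let html = r#"<a href="https://example.com/page.html">link</a>"#;
    let rewritten = internalize_urls(html, "example.com");
    // expected shape: <a href="/s/example.com/page.html">link</a>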
@@ -172,7 +172,7 @@ impl WebsiteArchive {
         std::fs::create_dir_all(&folder_name).unwrap();

         let timestamp = chrono::Utc::now().format("%Y-%m-%d").to_string();
-        let filename = folder_name.join(format!("index_{timestamp}.html"));
+        let filename = folder_name.join(format!("index_{timestamp}"));

         log::info!("Archiving {url} to {}", filename.to_str().unwrap());

@@ -238,10 +238,17 @@ fn run_command(cmd: &[&str]) {
     let child = cmd_setup.spawn().unwrap();

     let status = child.wait_with_output().unwrap();
-    assert!(status.status.success());
+    if !status.status.success() {
+        log::warn!(
+            "Command {cmd:?} exited with code {}",
+            status.status.code().unwrap_or_default()
+        )
+    }
 }

 pub async fn index_archive_db(arc: &WebsiteArchive) {
     // TODO : more index attrs size,mime

     log::info!("Indexing archive");

     for dom in arc.domains() {
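Review note: run_command no longer panics on a non-zero exit status; the failure is only logged, so archiving continues past a failed fetch. If fail-fast behavior is ever needed again, a wrapper along these lines would do (run_command_checked is hypothetical, not part of this commit):

    // Hypothetical fail-fast variant; NOT in the codebase.
    fn run_command_checked(cmd: &[&str]) -> Result<(), String> {
        let output = std::process::Command::new(cmd[0])
            .args(&cmd[1..])
            .output()
            .map_err(|e| e.to_string())?;
        if output.status.success() {
            Ok(())
        } else {
            Err(format!(
                "{cmd:?} exited with code {}",
                output.status.code().unwrap_or_default()
            ))
        }
    }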
@@ -158,16 +158,17 @@ async fn main() {

             let content = doc.render_local(Some(ver), &shell).await;

-            if content.is_none() {
+            if content.is_err() {
                 println!("No document found");
                 std::process::exit(1);
             }

             if md {
-                let markdown = html2md::parse_html(&content.unwrap());
+                let markdown =
+                    html2md::parse_html(&String::from_utf8_lossy(&content.unwrap()));
                 println!("{markdown}");
             } else {
-                println!("{}", content.unwrap());
+                println!("{}", String::from_utf8_lossy(&content.unwrap()));
             }
         }
         Some((&_, _)) => {}
@@ -18,7 +18,7 @@ pub mod component;
 use component::*;
 use serde_json::json;

-use webarc::archive::{Document, DocumentIndex};
+use webarc::archive::{internalize_urls, Document, DocumentIndex};
 use webarc::{
     ai::{generate_embedding, remove_data_urls, EmbedStore, SearchResult},
     archive::{extract_domains, WebsiteArchive},
@@ -26,6 +26,8 @@ use webarc::{
     render_page,
 };

+// TODO : PDF view
+
 const SEARCH_BAR_STYLE: &str = "w-full px-4 mb-4 py-2 text-white bg-black border-2 border-neon-blue placeholder-neon-blue focus:ring-2 focus:ring-neon-pink focus:outline-none font-mono text-lg";

 #[allow(non_snake_case)]
@@ -226,12 +228,12 @@ pub async fn domain_info_route(
     let (path_entries, is_doc) = domain.paths(paths.to_str().unwrap());
     let path_seperations: Vec<&str> = paths.to_str().unwrap().split('/').collect();

-    let domains = extract_domains(
+    let domains = extract_domains(&String::from_utf8_lossy(
         &document
             .render_local(None, &shell)
             .await
             .unwrap_or_default(),
-    );
+    ));

     let content = html! {
         h2 class="text-xl font-bold mb-4 flex items-center w-fit mx-auto" {
@@ -307,9 +309,12 @@ pub async fn render_txt_website(
 ) -> Option<String> {
     let document = arc.get_domain(domain).path(path.to_str().unwrap());

-    let mut content = document
+    let content = document
         .render_local(time.map(|time| time.to_string()), &shell)
-        .await?;
+        .await
+        .ok()?;
+
+    let mut content = String::from_utf8_lossy(&content).to_string();

     if no_data_urls.is_some() {
         content = remove_data_urls(&content);
@@ -376,11 +381,23 @@ pub async fn redownload(
         arc.archive_url(&format!("https://{domain}/{}", path.to_str()))
             .await;

-        let content = document.render_local(None, &shell).await?;
+        let mut content = document.render_local(None, &shell).await.ok()?;
+
+        let mime = infer::get(&content)
+            .map(|x| x.mime_type())
+            .unwrap_or("text/html");
+
+        if mime == "text/html" {
+            if get_config().ROUTE_INTERNAL {
+                content = internalize_urls(&String::from_utf8_lossy(&content), &domain)
+                    .as_bytes()
+                    .to_vec();
+            }
+        }

         return Some(DataResponse::new(
-            content.as_bytes().to_vec(),
-            "text/html".to_string(),
+            content.to_vec(),
+            mime.to_string(),
             Some(60 * 60 * 24),
         ));
     }
@@ -414,10 +431,22 @@ pub async fn render_website(

     // TODO : keep n versions

-    if let Some(content) = content {
+    if let Ok(mut content) = content {
+        let mime = infer::get(&content)
+            .map(|x| x.mime_type())
+            .unwrap_or("text/html");
+
+        if mime == "text/html" {
+            if get_config().ROUTE_INTERNAL {
+                content = internalize_urls(&String::from_utf8_lossy(&content), &domain)
+                    .as_bytes()
+                    .to_vec();
+            }
+        }
+
         return Some(DataResponse::new(
-            content.as_bytes().to_vec(),
-            "text/html".to_string(),
+            content,
+            mime.to_string(),
             Some(60 * 60 * 24),
         ));
     } else if get_config().DOWNLOAD_ON_DEMAND && time.is_none() {
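Review note: both response paths now sniff the mime type from the archived bytes instead of hard-coding text/html, which is what lets non-HTML archives (images, PDFs) round-trip. The pattern in isolation (infer::get reads magic bytes and returns None for plain text, hence the fallback):

    // Sketch of the sniffing used above; sniff_mime is an illustrative
    // helper name, not a function added by this commit.
    fn sniff_mime(bytes: &[u8]) -> &'static str {
        infer::get(bytes)
            .map(|t| t.mime_type())
            .unwrap_or("text/html")
    }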