This commit is contained in:
parent
a9f758cd9b
commit
2e5b4fc3d2
9 changed files with 141 additions and 89 deletions
|
@ -32,7 +32,7 @@ pub fn read_dir(dir: &PathBuf) -> Vec<String> {
|
|||
}
|
||||
|
||||
/// Rewrite all URLs in `input` to the format `/s/<domain>/<path..>`
|
||||
fn internalize_urls(input: &str, base: &str) -> String {
|
||||
pub fn internalize_urls(input: &str, base: &str) -> String {
|
||||
// TODO: fix regex; domains without a path are not captured
|
||||
let url_pattern = r#"(\ |"|')(?:(<?)(https?:\/\/([a-zA-Z0-9.-]+))?(\/[\w./-]*))"#;
|
||||
let re = regex::Regex::new(url_pattern).unwrap();
|
||||
|
@ -172,7 +172,7 @@ impl WebsiteArchive {
|
|||
std::fs::create_dir_all(&folder_name).unwrap();
|
||||
|
||||
let timestamp = chrono::Utc::now().format("%Y-%m-%d").to_string();
|
||||
let filename = folder_name.join(format!("index_{timestamp}.html"));
|
||||
let filename = folder_name.join(format!("index_{timestamp}"));
|
||||
|
||||
log::info!("Archiving {url} to {}", filename.to_str().unwrap());
|
||||
|
||||
|
@ -238,10 +238,17 @@ fn run_command(cmd: &[&str]) {
|
|||
let child = cmd_setup.spawn().unwrap();
|
||||
|
||||
let status = child.wait_with_output().unwrap();
|
||||
assert!(status.status.success());
|
||||
if !status.status.success() {
|
||||
log::warn!(
|
||||
"Command {cmd:?} exited with code {}",
|
||||
status.status.code().unwrap_or_default()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn index_archive_db(arc: &WebsiteArchive) {
|
||||
// TODO: index more attributes, e.g. size and MIME type
|
||||
|
||||
log::info!("Indexing archive");
|
||||
|
||||
for dom in arc.domains() {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue