fix(npm): make tarball extraction more reliable (#23759)

1. Extracts to a directory beside the destination.
2. Renames to the destination with retries.
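In outline, the two steps amount to this (a minimal sketch of the idea only, not code from this commit; the extraction itself is passed in as a closure):

use std::borrow::Cow;
use std::fs;
use std::io;
use std::path::Path;

// Sketch: extract into a hidden sibling directory, then rename into place.
// A sibling keeps the rename on the same filesystem, so it is atomic.
fn extract_atomically(
    dest: &Path,
    extract: impl FnOnce(&Path) -> io::Result<()>,
) -> io::Result<()> {
    let name = dest
        .file_name()
        .map(|f| f.to_string_lossy())
        .unwrap_or(Cow::Borrowed(""));
    let temp = dest.with_file_name(format!(".{name}_tmp"));
    extract(&temp)?;
    match fs::rename(&temp, dest) {
        Ok(()) => Ok(()),
        // another process finished first; discard our copy and use theirs
        Err(_) if dest.exists() => fs::remove_dir_all(&temp),
        Err(err) => Err(err),
    }
}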
David Sherret 2024-05-14 14:26:48 -04:00 committed by GitHub
parent c0e3b6ed9d
commit f16b4d4df8
4 changed files with 153 additions and 29 deletions

View file

@@ -25,6 +25,7 @@ use crate::util::fs::hard_link_dir_recursive;
use crate::util::progress_bar::ProgressBar;
use super::tarball::verify_and_extract_tarball;
use super::tarball::TarballExtractionMode;
/// Stores a single copy of npm packages in a cache.
#[derive(Debug)]
@@ -69,7 +70,7 @@ impl NpmCache {
/// to ensure a package is only downloaded once per run of the CLI. This
/// prevents downloads from re-occurring when someone uses `--reload` and
/// a dynamic import imports the same package again, for example.
fn should_use_global_cache_for_package(&self, package: &PackageNv) -> bool {
fn should_use_cache_for_package(&self, package: &PackageNv) -> bool {
self.cache_setting.should_use_for_npm_package(&package.name)
|| !self
.previously_reloaded_packages
@@ -91,26 +92,23 @@ impl NpmCache {
async fn ensure_package_inner(
&self,
package: &PackageNv,
package_nv: &PackageNv,
dist: &NpmPackageVersionDistInfo,
registry_url: &Url,
) -> Result<(), AnyError> {
let package_folder = self
.cache_dir
.package_folder_for_name_and_version(package, registry_url);
if self.should_use_global_cache_for_package(package)
&& self.fs.exists_sync(&package_folder)
// if this file exists, then the package didn't successfully extract
// the first time, or another process is currently extracting the zip file
&& !self.fs.exists_sync(&package_folder.join(NPM_PACKAGE_SYNC_LOCK_FILENAME))
{
.package_folder_for_name_and_version(package_nv, registry_url);
let should_use_cache = self.should_use_cache_for_package(package_nv);
let package_folder_exists = self.fs.exists_sync(&package_folder);
if should_use_cache && package_folder_exists {
return Ok(());
} else if self.cache_setting == CacheSetting::Only {
return Err(custom_error(
"NotCached",
format!(
"An npm specifier not found in cache: \"{}\", --cached-only is specified.",
&package.name
&package_nv.name
)
)
);
@@ -127,7 +125,31 @@ impl NpmCache {
.await?;
match maybe_bytes {
Some(bytes) => {
verify_and_extract_tarball(package, &bytes, dist, &package_folder)
let extraction_mode = if should_use_cache || !package_folder_exists {
TarballExtractionMode::SiblingTempDir
} else {
// The user ran with `--reload`, so overwrite the package instead of
// deleting it since the package might get corrupted if a user kills
// their deno process while it's deleting a package directory
//
// We can't rename this folder and delete it because the folder
// may be in use by another process or may now contain hardlinks,
// which will cause Windows to throw an "AccessDenied" error when
// renaming. So we settle for overwriting.
TarballExtractionMode::Overwrite
};
let dist = dist.clone();
let package_nv = package_nv.clone();
deno_core::unsync::spawn_blocking(move || {
verify_and_extract_tarball(
&package_nv,
&bytes,
&dist,
&package_folder,
extraction_mode,
)
})
.await?
}
None => {
bail!("Could not find npm package tarball at: {}", dist.tarball);
@@ -150,7 +172,7 @@ impl NpmCache {
.package_folder_for_id(folder_id, registry_url);
if package_folder.exists()
// if this file exists, then the package didn't successfully extract
// if this file exists, then the package didn't successfully initialize
// the first time, or another process is currently extracting the tarball
&& !package_folder.join(NPM_PACKAGE_SYNC_LOCK_FILENAME).exists()
&& self.cache_setting.should_use_for_npm_package(&folder_id.nv.name)
@@ -161,6 +183,9 @@ impl NpmCache {
let original_package_folder = self
.cache_dir
.package_folder_for_name_and_version(&folder_id.nv, registry_url);
// it seems Windows throws an "AccessDenied" error when moving a
// directory with hard links, which is why this approach is used
with_folder_sync_lock(&folder_id.nv, &package_folder, || {
hard_link_dir_recursive(&original_package_folder, &package_folder)
})?;
@@ -206,7 +231,7 @@ impl NpmCache {
const NPM_PACKAGE_SYNC_LOCK_FILENAME: &str = ".deno_sync_lock";
pub fn with_folder_sync_lock(
fn with_folder_sync_lock(
package: &PackageNv,
output_folder: &Path,
action: impl FnOnce() -> Result<(), AnyError>,
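For context, `with_folder_sync_lock` itself is unchanged and its body is not part of this diff. Judging by how `NPM_PACKAGE_SYNC_LOCK_FILENAME` is checked above, its shape is plausibly along these lines (an assumed, simplified sketch using `std::io::Result` in place of `AnyError`):

use std::path::Path;

// Assumed sketch: drop a marker file before running `action` and remove it
// only on success, so an interrupted extraction leaves the marker behind
// and the folder is later treated as incomplete.
fn with_folder_sync_lock(
    output_folder: &Path,
    action: impl FnOnce() -> std::io::Result<()>,
) -> std::io::Result<()> {
    std::fs::create_dir_all(output_folder)?;
    let lock_file = output_folder.join(".deno_sync_lock");
    std::fs::write(&lock_file, "")?;
    action()?;
    let _ = std::fs::remove_file(&lock_file);
    Ok(())
}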

View file

@@ -2,12 +2,14 @@
use std::collections::HashSet;
use std::fs;
use std::io::ErrorKind;
use std::path::Path;
use std::path::PathBuf;
use base64::prelude::BASE64_STANDARD;
use base64::Engine;
use deno_core::anyhow::bail;
use deno_core::anyhow::Context;
use deno_core::error::AnyError;
use deno_npm::registry::NpmPackageVersionDistInfo;
use deno_npm::registry::NpmPackageVersionDistInfoIntegrity;
@@ -16,19 +18,76 @@ use flate2::read::GzDecoder;
use tar::Archive;
use tar::EntryType;
use super::cache::with_folder_sync_lock;
use crate::util::path::get_atomic_dir_path;
#[derive(Debug, Copy, Clone)]
pub enum TarballExtractionMode {
/// Overwrites the destination directory without deleting any files.
Overwrite,
/// Creates and writes to a sibling temporary directory. When done, moves
/// it to the final destination.
///
/// This is more robust than `Overwrite` as it better handles multiple
/// processes writing to the directory at the same time.
SiblingTempDir,
}
pub fn verify_and_extract_tarball(
package: &PackageNv,
package_nv: &PackageNv,
data: &[u8],
dist_info: &NpmPackageVersionDistInfo,
output_folder: &Path,
extraction_mode: TarballExtractionMode,
) -> Result<(), AnyError> {
verify_tarball_integrity(package, data, &dist_info.integrity())?;
verify_tarball_integrity(package_nv, data, &dist_info.integrity())?;
with_folder_sync_lock(package, output_folder, || {
extract_tarball(data, output_folder)
})
match extraction_mode {
TarballExtractionMode::Overwrite => extract_tarball(data, output_folder),
TarballExtractionMode::SiblingTempDir => {
let temp_dir = get_atomic_dir_path(output_folder);
extract_tarball(data, &temp_dir)?;
rename_with_retries(&temp_dir, output_folder)
.map_err(AnyError::from)
.context("Failed moving extracted tarball to final destination.")
}
}
}
fn rename_with_retries(
temp_dir: &Path,
output_folder: &Path,
) -> Result<(), std::io::Error> {
fn already_exists(err: &std::io::Error, output_folder: &Path) -> bool {
// Windows will report an "Access is denied" error
err.kind() == ErrorKind::AlreadyExists || output_folder.exists()
}
let mut count = 0;
// renaming might be flaky if a lot of processes are trying
// to do this, so retry a few times
loop {
match fs::rename(temp_dir, output_folder) {
Ok(_) => return Ok(()),
Err(err) if already_exists(&err, output_folder) => {
// another process copied here, just cleanup
let _ = fs::remove_dir_all(temp_dir);
return Ok(());
}
Err(err) => {
count += 1;
if count > 5 {
// too many retries, cleanup and return the error
let _ = fs::remove_dir_all(temp_dir);
return Err(err);
}
// wait a bit before retrying... this should be very rare or only
// happen in error cases, so it's ok to sleep a bit
let sleep_ms = std::cmp::min(100, 20 * count);
std::thread::sleep(std::time::Duration::from_millis(sleep_ms));
}
}
}
}
fn verify_tarball_integrity(
@@ -150,6 +209,7 @@ fn extract_tarball(data: &[u8], output_folder: &Path) -> Result<(), AnyError> {
#[cfg(test)]
mod test {
use deno_semver::Version;
use test_util::TempDir;
use super::*;
@@ -240,4 +300,25 @@ mod test {
)
.is_ok());
}
#[test]
fn rename_with_retries_succeeds_exists() {
let temp_dir = TempDir::new();
let folder_1 = temp_dir.path().join("folder_1");
let folder_2 = temp_dir.path().join("folder_2");
folder_1.create_dir_all();
folder_1.join("a.txt").write("test");
folder_2.create_dir_all();
// this will not end up in the output because rename_with_retries assumes
// a folder already at the destination has the same contents
folder_2.join("b.txt").write("test2");
let dest_folder = temp_dir.path().join("dest_folder");
rename_with_retries(folder_1.as_path(), dest_folder.as_path()).unwrap();
rename_with_retries(folder_2.as_path(), dest_folder.as_path()).unwrap();
assert!(dest_folder.join("a.txt").exists());
assert!(!dest_folder.join("b.txt").exists());
}
}
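Aside from the parameter rename, `verify_tarball_integrity` is untouched by this commit. For reference, npm dist integrity strings use the SRI format (`sha512-` followed by a base64 digest), so the verification amounts to a check of this shape (a sketch written against the `sha2` crate; the actual implementation may differ):

use base64::prelude::BASE64_STANDARD;
use base64::Engine;
use sha2::{Digest, Sha512};

// Sketch: hash the tarball bytes and compare with the registry's SRI string.
fn check_sha512_integrity(data: &[u8], integrity: &str) -> Result<(), String> {
    let expected = integrity
        .strip_prefix("sha512-")
        .ok_or_else(|| format!("unsupported integrity string: {integrity}"))?;
    let actual = BASE64_STANDARD.encode(Sha512::digest(data));
    if actual == expected {
        Ok(())
    } else {
        Err(format!("integrity mismatch: expected {expected}, got {actual}"))
    }
}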

View file

@@ -2,7 +2,6 @@
use std::collections::HashSet;
use std::env::current_dir;
use std::fmt::Write as FmtWrite;
use std::fs::FileType;
use std::fs::OpenOptions;
use std::io::Error;
@@ -23,12 +22,12 @@ use deno_core::error::AnyError;
pub use deno_core::normalize_path;
use deno_core::unsync::spawn_blocking;
use deno_core::ModuleSpecifier;
use deno_runtime::deno_crypto::rand;
use deno_runtime::deno_fs::FileSystem;
use deno_runtime::deno_node::PathClean;
use crate::util::gitignore::DirGitIgnores;
use crate::util::gitignore::GitIgnoreTree;
use crate::util::path::get_atomic_file_path;
use crate::util::progress_bar::ProgressBar;
use crate::util::progress_bar::ProgressBarStyle;
use crate::util::progress_bar::ProgressMessagePrompt;
@@ -56,14 +55,7 @@ pub fn atomic_write_file<T: AsRef<[u8]>>(
}
fn inner(file_path: &Path, data: &[u8], mode: u32) -> std::io::Result<()> {
let temp_file_path = {
let rand: String = (0..4).fold(String::new(), |mut output, _| {
let _ = write!(output, "{:02x}", rand::random::<u8>());
output
});
let extension = format!("{rand}.tmp");
file_path.with_extension(extension)
};
let temp_file_path = get_atomic_file_path(file_path);
if let Err(write_err) =
atomic_write_file_raw(&temp_file_path, file_path, data, mode)
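The removed block above generated the temp file name inline; that logic now lives in the shared `get_atomic_file_path` helper. The write itself (in `atomic_write_file_raw`, not shown) keeps the usual write-then-rename pattern, roughly (a simplified sketch; mode handling and the real code's error fallbacks are omitted):

use std::fs;
use std::io;
use std::path::Path;

// Sketch: write to the randomized temp path, then rename over the target.
// On one filesystem the rename is atomic, so readers never see partial data.
fn atomic_write(
    file_path: &Path,
    temp_file_path: &Path,
    data: &[u8],
) -> io::Result<()> {
    fs::write(temp_file_path, data)?;
    fs::rename(temp_file_path, file_path)
}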

View file

@@ -42,6 +42,32 @@ pub fn get_extension(file_path: &Path) -> Option<String> {
.map(|e| e.to_lowercase());
}
pub fn get_atomic_dir_path(file_path: &Path) -> PathBuf {
let rand = gen_rand_path_component();
let new_file_name = format!(
".{}_{}",
file_path
.file_name()
.map(|f| f.to_string_lossy())
.unwrap_or(Cow::Borrowed("")),
rand
);
file_path.with_file_name(new_file_name)
}
pub fn get_atomic_file_path(file_path: &Path) -> PathBuf {
let rand = gen_rand_path_component();
let extension = format!("{rand}.tmp");
file_path.with_extension(extension)
}
fn gen_rand_path_component() -> String {
(0..4).fold(String::new(), |mut output, _| {
output.push_str(&format!("{:02x}", rand::random::<u8>()));
output
})
}
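// Example outputs (illustrative only; the hex component is random per call):
//
//   get_atomic_dir_path(Path::new("/cache/chalk"))
//     => "/cache/.chalk_a1b2c3d4"  (hidden sibling in the same parent dir)
//
//   get_atomic_file_path(Path::new("/cache/registry.json"))
//     => "/cache/registry.a1b2c3d4.tmp"  (with_extension replaces ".json")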
/// TypeScript figures out the type of file based on the extension, but we take
/// other factors into account like the file headers. The hack here is to map the
/// specifier passed to TypeScript to a new specifier with the file extension.