From 5fc5e7b54a9fba421dfc473016625a4f592403ed Mon Sep 17 00:00:00 2001 From: Maayan Hanin Date: Tue, 4 Aug 2020 00:39:48 +0300 Subject: [PATCH] fix(cli): add support for non-UTF8 source files (#6789) Fixes: #5542 --- .dprintrc.json | 1 + .gitattributes | 2 + Cargo.lock | 1 + cli/Cargo.toml | 1 + cli/file_fetcher.rs | 346 +++++++++++++++++++++++++------- cli/fmt.rs | 7 +- cli/global_state.rs | 2 +- cli/main.rs | 15 +- cli/module_graph.rs | 10 +- cli/tests/encoding/utf-16be.ts | Bin 0 -> 58 bytes cli/tests/encoding/utf-16le.ts | Bin 0 -> 58 bytes cli/tests/encoding/utf-8.ts | 1 + cli/tests/encoding/windows-1255 | 1 + cli/text_encoding.rs | 94 +++++++++ cli/tsc.rs | 26 ++- test_util/src/lib.rs | 13 ++ tools/lint.py | 1 + 17 files changed, 422 insertions(+), 99 deletions(-) create mode 100644 cli/tests/encoding/utf-16be.ts create mode 100644 cli/tests/encoding/utf-16le.ts create mode 100644 cli/tests/encoding/utf-8.ts create mode 100644 cli/tests/encoding/windows-1255 create mode 100644 cli/text_encoding.rs diff --git a/.dprintrc.json b/.dprintrc.json index dfa68783c2..c9d56cb423 100644 --- a/.dprintrc.json +++ b/.dprintrc.json @@ -14,6 +14,7 @@ "excludes": [ ".cargo_home", "cli/dts", + "cli/tests/encoding", "cli/tsc/*typescript.js", "gh-pages", "std/**/testdata", diff --git a/.gitattributes b/.gitattributes index b70bc09403..0419363b83 100644 --- a/.gitattributes +++ b/.gitattributes @@ -2,6 +2,8 @@ * text=auto eol=lf *.png -text +/cli/tests/encoding/* -text + # Tell git which symlinks point to files, and which ones point to directories. # This is relevant for Windows only, and requires git >= 2.19.2 to work. /core/libdeno/* symlink=dir diff --git a/Cargo.lock b/Cargo.lock index c1102cb906..a43eda71df 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -385,6 +385,7 @@ dependencies = [ "dissimilar", "dlopen", "dprint-plugin-typescript", + "encoding_rs", "futures", "fwdansi", "http", diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 0a9125355e..8a8958b63c 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -32,6 +32,7 @@ byteorder = "1.3.4" clap = "2.33.1" dissimilar = "1.0.2" dlopen = "0.1.8" +encoding_rs = "0.8.23" dprint-plugin-typescript = "0.25.0" futures = "0.3.5" http = "0.2.1" diff --git a/cli/file_fetcher.rs b/cli/file_fetcher.rs index 7d72d5cd4d..63743f5abe 100644 --- a/cli/file_fetcher.rs +++ b/cli/file_fetcher.rs @@ -7,10 +7,12 @@ use crate::http_util::FetchOnceResult; use crate::msg; use crate::op_error::OpError; use crate::permissions::Permissions; +use crate::text_encoding; use deno_core::ErrBox; use deno_core::ModuleSpecifier; use futures::future::FutureExt; use log::info; +use std::borrow::Cow; use std::collections::HashMap; use std::fs; use std::future::Future; @@ -24,6 +26,47 @@ use std::sync::Arc; use std::sync::Mutex; use url::Url; +/// Structure representing a text document. +#[derive(Debug, Clone)] +pub struct TextDocument { + bytes: Vec, + charset: Cow<'static, str>, +} + +impl TextDocument { + pub fn new( + bytes: Vec, + charset: Option>>, + ) -> TextDocument { + let charset = charset + .map(|cs| cs.into()) + .unwrap_or_else(|| text_encoding::detect_charset(&bytes).into()); + TextDocument { bytes, charset } + } + + pub fn as_bytes(&self) -> &Vec { + &self.bytes + } + + pub fn into_bytes(self) -> Vec { + self.bytes + } + + pub fn to_str(&self) -> Result, std::io::Error> { + text_encoding::convert_to_utf8(&self.bytes, &self.charset) + } + + pub fn to_string(&self) -> Result { + self.to_str().map(String::from) + } +} + +impl From> for TextDocument { + fn from(bytes: Vec) -> Self { + TextDocument::new(bytes, Option::<&str>::None) + } +} + /// Structure representing local or remote file. /// /// In case of remote file `url` might be different than originally requested URL, if so @@ -34,7 +77,7 @@ pub struct SourceFile { pub filename: PathBuf, pub types_header: Option, pub media_type: msg::MediaType, - pub source_code: Vec, + pub source_code: TextDocument, } /// Simple struct implementing in-process caching to prevent multiple @@ -180,8 +223,9 @@ impl SourceFileFetcher { match result { Ok(mut file) => { // TODO: move somewhere? - if file.source_code.starts_with(b"#!") { - file.source_code = filter_shebang(file.source_code); + if file.source_code.bytes.starts_with(b"#!") { + file.source_code = + filter_shebang(&file.source_code.to_str().unwrap()[..]).into(); } // Cache in-process for subsequent access. @@ -313,12 +357,12 @@ impl SourceFileFetcher { Err(e) => return Err(e.into()), }; - let media_type = map_content_type(&filepath, None); + let (media_type, charset) = map_content_type(&filepath, None); Ok(SourceFile { url: module_url.clone(), filename: filepath, media_type, - source_code, + source_code: TextDocument::new(source_code, charset), types_header: None, }) } @@ -380,7 +424,7 @@ impl SourceFileFetcher { let cache_filename = self.http_cache.get_cache_filename(module_url); let fake_filepath = PathBuf::from(module_url.path()); - let media_type = map_content_type( + let (media_type, charset) = map_content_type( &fake_filepath, headers.get("content-type").map(|e| e.as_str()), ); @@ -389,7 +433,7 @@ impl SourceFileFetcher { url: module_url.clone(), filename: cache_filename, media_type, - source_code, + source_code: TextDocument::new(source_code, charset), types_header, })) } @@ -490,7 +534,7 @@ impl SourceFileFetcher { let cache_filepath = dir.http_cache.get_cache_filename(&module_url); // Used to sniff out content type from file extension - probably to be removed let fake_filepath = PathBuf::from(module_url.path()); - let media_type = map_content_type( + let (media_type, charset) = map_content_type( &fake_filepath, headers.get("content-type").map(String::as_str), ); @@ -502,7 +546,7 @@ impl SourceFileFetcher { url: module_url.clone(), filename: cache_filepath, media_type, - source_code: source, + source_code: TextDocument::new(source, charset), types_header, }; @@ -532,16 +576,19 @@ pub fn map_file_extension(path: &Path) -> msg::MediaType { } } -// convert a ContentType string into a enumerated MediaType -fn map_content_type(path: &Path, content_type: Option<&str>) -> msg::MediaType { +// convert a ContentType string into a enumerated MediaType + optional charset +fn map_content_type( + path: &Path, + content_type: Option<&str>, +) -> (msg::MediaType, Option) { match content_type { Some(content_type) => { - // sometimes there is additional data after the media type in + // Sometimes there is additional data after the media type in // Content-Type so we have to do a bit of manipulation so we are only - // dealing with the actual media type - let ct_vector: Vec<&str> = content_type.split(';').collect(); - let ct: &str = ct_vector.first().unwrap(); - match ct.to_lowercase().as_ref() { + // dealing with the actual media type. + let mut ct_iter = content_type.split(';'); + let ct = ct_iter.next().unwrap(); + let media_type = match ct.to_lowercase().as_ref() { "application/typescript" | "text/typescript" | "video/vnd.dlna.mpeg-tts" @@ -565,9 +612,16 @@ fn map_content_type(path: &Path, content_type: Option<&str>) -> msg::MediaType { debug!("unknown content type: {}", content_type); msg::MediaType::Unknown } - } + }; + + let charset = ct_iter + .map(str::trim) + .find_map(|s| s.strip_prefix("charset=")) + .map(String::from); + + (media_type, charset) } - None => map_file_extension(path), + None => (map_file_extension(path), None), } } @@ -586,8 +640,7 @@ fn map_js_like_extension( } } -fn filter_shebang(bytes: Vec) -> Vec { - let string = str::from_utf8(&bytes).unwrap(); +fn filter_shebang(string: &str) -> Vec { if let Some(i) = string.find('\n') { let (_, rest) = string.split_at(i); rest.as_bytes().to_owned() @@ -767,7 +820,7 @@ mod tests { assert!(result.is_ok()); let r = result.unwrap(); assert_eq!( - r.source_code, + r.source_code.bytes, &b"export { printHello } from \"./print_hello.ts\";\n"[..] ); assert_eq!(&(r.media_type), &msg::MediaType::TypeScript); @@ -794,7 +847,7 @@ mod tests { assert!(result2.is_ok()); let r2 = result2.unwrap(); assert_eq!( - r2.source_code, + r2.source_code.bytes, &b"export { printHello } from \"./print_hello.ts\";\n"[..] ); // If get_source_file does not call remote, this should be JavaScript @@ -823,7 +876,7 @@ mod tests { assert!(result3.is_ok()); let r3 = result3.unwrap(); assert_eq!( - r3.source_code, + r3.source_code.bytes, &b"export { printHello } from \"./print_hello.ts\";\n"[..] ); // If get_source_file does not call remote, this should be JavaScript @@ -850,7 +903,7 @@ mod tests { assert!(result4.is_ok()); let r4 = result4.unwrap(); let expected4 = &b"export { printHello } from \"./print_hello.ts\";\n"[..]; - assert_eq!(r4.source_code, expected4); + assert_eq!(r4.source_code.bytes, expected4); // Resolved back to TypeScript assert_eq!(&(r4.media_type), &msg::MediaType::TypeScript); @@ -880,7 +933,7 @@ mod tests { assert!(result.is_ok()); let r = result.unwrap(); let expected = b"export const loaded = true;\n"; - assert_eq!(r.source_code, expected); + assert_eq!(r.source_code.bytes, expected); assert_eq!(&(r.media_type), &msg::MediaType::JavaScript); let (_, headers) = fetcher.http_cache.get(&module_url).unwrap(); assert_eq!(headers.get("content-type").unwrap(), "text/javascript"); @@ -906,7 +959,7 @@ mod tests { assert!(result2.is_ok()); let r2 = result2.unwrap(); let expected2 = b"export const loaded = true;\n"; - assert_eq!(r2.source_code, expected2); + assert_eq!(r2.source_code.bytes, expected2); // If get_source_file does not call remote, this should be TypeScript // as we modified before! (we do not overwrite .headers.json due to no http // fetch) @@ -932,7 +985,7 @@ mod tests { assert!(result3.is_ok()); let r3 = result3.unwrap(); let expected3 = b"export const loaded = true;\n"; - assert_eq!(r3.source_code, expected3); + assert_eq!(r3.source_code.bytes, expected3); // Now the old .headers.json file should be overwritten back to JavaScript! // (due to http fetch) assert_eq!(&(r3.media_type), &msg::MediaType::JavaScript); @@ -1352,7 +1405,7 @@ mod tests { .await; assert!(result.is_ok()); let r = result.unwrap(); - assert_eq!(r.source_code, b"export const loaded = true;\n"); + assert_eq!(r.source_code.bytes, b"export const loaded = true;\n"); assert_eq!(&(r.media_type), &msg::MediaType::TypeScript); // Modify .metadata.json, make sure read from local @@ -1368,7 +1421,7 @@ mod tests { let result2 = fetcher.fetch_cached_remote_source(&module_url, 1); assert!(result2.is_ok()); let r2 = result2.unwrap().unwrap(); - assert_eq!(r2.source_code, b"export const loaded = true;\n"); + assert_eq!(r2.source_code.bytes, b"export const loaded = true;\n"); // Not MediaType::TypeScript due to .headers.json modification assert_eq!(&(r2.media_type), &msg::MediaType::JavaScript); @@ -1392,7 +1445,7 @@ mod tests { .await; assert!(result.is_ok()); let r = result.unwrap(); - assert_eq!(r.source_code, b"export const loaded = true;\n"); + assert_eq!(r.source_code.bytes, b"export const loaded = true;\n"); assert_eq!(&(r.media_type), &msg::MediaType::TypeScript); let (_, headers) = fetcher.http_cache.get(module_url).unwrap(); assert_eq!(headers.get("content-type").unwrap(), "text/typescript"); @@ -1417,7 +1470,7 @@ mod tests { .await; assert!(result.is_ok()); let r2 = result.unwrap(); - assert_eq!(r2.source_code, b"export const loaded = true;\n"); + assert_eq!(r2.source_code.bytes, b"export const loaded = true;\n"); assert_eq!(&(r2.media_type), &msg::MediaType::JavaScript); let (_, headers) = fetcher.http_cache.get(module_url).unwrap(); assert_eq!(headers.get("content-type").unwrap(), "text/javascript"); @@ -1442,7 +1495,7 @@ mod tests { .await; assert!(result.is_ok()); let r3 = result.unwrap(); - assert_eq!(r3.source_code, b"export const loaded = true;\n"); + assert_eq!(r3.source_code.bytes, b"export const loaded = true;\n"); assert_eq!(&(r3.media_type), &msg::MediaType::TypeScript); let (_, headers) = fetcher.http_cache.get(module_url).unwrap(); assert_eq!(headers.get("content-type").unwrap(), "text/typescript"); @@ -1523,6 +1576,63 @@ mod tests { } } + async fn test_fetch_source_file_from_disk_nonstandard_encoding( + charset: &str, + expected_content: String, + ) { + let (_temp_dir, fetcher) = test_setup(); + + let p = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join(format!("tests/encoding/{}.ts", charset)); + let specifier = + ModuleSpecifier::resolve_url_or_path(p.to_str().unwrap()).unwrap(); + let r = fetcher + .fetch_source_file(&specifier, None, Permissions::allow_all()) + .await; + assert!(r.is_ok()); + let fetched_file = r.unwrap(); + let source_code = fetched_file.source_code.to_str(); + assert!(source_code.is_ok()); + let actual = source_code.unwrap(); + assert_eq!(expected_content, actual); + } + + #[tokio::test] + async fn test_fetch_source_file_from_disk_utf_16_be() { + test_fetch_source_file_from_disk_nonstandard_encoding( + "utf-16be", + String::from_utf8( + b"\xEF\xBB\xBFconsole.log(\"Hello World\");\x0A".to_vec(), + ) + .unwrap(), + ) + .await; + } + + #[tokio::test] + async fn test_fetch_source_file_from_disk_utf_16_le() { + test_fetch_source_file_from_disk_nonstandard_encoding( + "utf-16le", + String::from_utf8( + b"\xEF\xBB\xBFconsole.log(\"Hello World\");\x0A".to_vec(), + ) + .unwrap(), + ) + .await; + } + + #[tokio::test] + async fn test_fetch_source_file_from_disk_utf_8_with_bom() { + test_fetch_source_file_from_disk_nonstandard_encoding( + "utf-8", + String::from_utf8( + b"\xEF\xBB\xBFconsole.log(\"Hello World\");\x0A".to_vec(), + ) + .unwrap(), + ) + .await; + } + #[test] fn test_map_file_extension() { assert_eq!( @@ -1571,43 +1681,43 @@ mod tests { fn test_map_content_type_extension_only() { // Extension only assert_eq!( - map_content_type(Path::new("foo/bar.ts"), None), + map_content_type(Path::new("foo/bar.ts"), None).0, msg::MediaType::TypeScript ); assert_eq!( - map_content_type(Path::new("foo/bar.tsx"), None), + map_content_type(Path::new("foo/bar.tsx"), None).0, msg::MediaType::TSX ); assert_eq!( - map_content_type(Path::new("foo/bar.d.ts"), None), + map_content_type(Path::new("foo/bar.d.ts"), None).0, msg::MediaType::TypeScript ); assert_eq!( - map_content_type(Path::new("foo/bar.js"), None), + map_content_type(Path::new("foo/bar.js"), None).0, msg::MediaType::JavaScript ); assert_eq!( - map_content_type(Path::new("foo/bar.txt"), None), + map_content_type(Path::new("foo/bar.txt"), None).0, msg::MediaType::Unknown ); assert_eq!( - map_content_type(Path::new("foo/bar.jsx"), None), + map_content_type(Path::new("foo/bar.jsx"), None).0, msg::MediaType::JSX ); assert_eq!( - map_content_type(Path::new("foo/bar.json"), None), + map_content_type(Path::new("foo/bar.json"), None).0, msg::MediaType::Json ); assert_eq!( - map_content_type(Path::new("foo/bar.wasm"), None), + map_content_type(Path::new("foo/bar.wasm"), None).0, msg::MediaType::Wasm ); assert_eq!( - map_content_type(Path::new("foo/bar.cjs"), None), + map_content_type(Path::new("foo/bar.cjs"), None).0, msg::MediaType::JavaScript ); assert_eq!( - map_content_type(Path::new("foo/bar"), None), + map_content_type(Path::new("foo/bar"), None).0, msg::MediaType::Unknown ); } @@ -1616,140 +1726,154 @@ mod tests { fn test_map_content_type_media_type_with_no_extension() { // Media Type assert_eq!( - map_content_type(Path::new("foo/bar"), Some("application/typescript")), + map_content_type(Path::new("foo/bar"), Some("application/typescript")).0, msg::MediaType::TypeScript ); assert_eq!( - map_content_type(Path::new("foo/bar"), Some("text/typescript")), + map_content_type(Path::new("foo/bar"), Some("text/typescript")).0, msg::MediaType::TypeScript ); assert_eq!( - map_content_type(Path::new("foo/bar"), Some("video/vnd.dlna.mpeg-tts")), + map_content_type(Path::new("foo/bar"), Some("video/vnd.dlna.mpeg-tts")).0, msg::MediaType::TypeScript ); assert_eq!( - map_content_type(Path::new("foo/bar"), Some("video/mp2t")), + map_content_type(Path::new("foo/bar"), Some("video/mp2t")).0, msg::MediaType::TypeScript ); assert_eq!( - map_content_type(Path::new("foo/bar"), Some("application/x-typescript")), + map_content_type(Path::new("foo/bar"), Some("application/x-typescript")) + .0, msg::MediaType::TypeScript ); assert_eq!( - map_content_type(Path::new("foo/bar"), Some("application/javascript")), + map_content_type(Path::new("foo/bar"), Some("application/javascript")).0, msg::MediaType::JavaScript ); assert_eq!( - map_content_type(Path::new("foo/bar"), Some("text/javascript")), + map_content_type(Path::new("foo/bar"), Some("text/javascript")).0, msg::MediaType::JavaScript ); assert_eq!( - map_content_type(Path::new("foo/bar"), Some("application/ecmascript")), + map_content_type(Path::new("foo/bar"), Some("application/ecmascript")).0, msg::MediaType::JavaScript ); assert_eq!( - map_content_type(Path::new("foo/bar"), Some("text/ecmascript")), + map_content_type(Path::new("foo/bar"), Some("text/ecmascript")).0, msg::MediaType::JavaScript ); assert_eq!( - map_content_type(Path::new("foo/bar"), Some("application/x-javascript")), + map_content_type(Path::new("foo/bar"), Some("application/x-javascript")) + .0, msg::MediaType::JavaScript ); assert_eq!( - map_content_type(Path::new("foo/bar"), Some("application/json")), + map_content_type(Path::new("foo/bar"), Some("application/json")).0, msg::MediaType::Json ); assert_eq!( - map_content_type(Path::new("foo/bar"), Some("application/node")), + map_content_type(Path::new("foo/bar"), Some("application/node")).0, msg::MediaType::JavaScript ); assert_eq!( - map_content_type(Path::new("foo/bar"), Some("text/json")), + map_content_type(Path::new("foo/bar"), Some("text/json")).0, msg::MediaType::Json ); + assert_eq!( + map_content_type(Path::new("foo/bar"), Some("text/json; charset=utf-8 ")), + (msg::MediaType::Json, Some("utf-8".to_owned())) + ); } #[test] fn test_map_file_extension_media_type_with_extension() { assert_eq!( - map_content_type(Path::new("foo/bar.ts"), Some("text/plain")), + map_content_type(Path::new("foo/bar.ts"), Some("text/plain")).0, msg::MediaType::TypeScript ); assert_eq!( - map_content_type(Path::new("foo/bar.ts"), Some("foo/bar")), + map_content_type(Path::new("foo/bar.ts"), Some("foo/bar")).0, msg::MediaType::Unknown ); assert_eq!( map_content_type( Path::new("foo/bar.tsx"), Some("application/typescript"), - ), + ) + .0, msg::MediaType::TSX ); assert_eq!( map_content_type( Path::new("foo/bar.tsx"), Some("application/javascript"), - ), + ) + .0, msg::MediaType::TSX ); assert_eq!( map_content_type( Path::new("foo/bar.tsx"), Some("application/x-typescript"), - ), + ) + .0, msg::MediaType::TSX ); assert_eq!( map_content_type( Path::new("foo/bar.tsx"), Some("video/vnd.dlna.mpeg-tts"), - ), + ) + .0, msg::MediaType::TSX ); assert_eq!( - map_content_type(Path::new("foo/bar.tsx"), Some("video/mp2t")), + map_content_type(Path::new("foo/bar.tsx"), Some("video/mp2t")).0, msg::MediaType::TSX ); assert_eq!( map_content_type( Path::new("foo/bar.jsx"), Some("application/javascript"), - ), + ) + .0, msg::MediaType::JSX ); assert_eq!( map_content_type( Path::new("foo/bar.jsx"), Some("application/x-typescript"), - ), + ) + .0, msg::MediaType::JSX ); assert_eq!( map_content_type( Path::new("foo/bar.jsx"), Some("application/ecmascript"), - ), + ) + .0, msg::MediaType::JSX ); assert_eq!( - map_content_type(Path::new("foo/bar.jsx"), Some("text/ecmascript")), + map_content_type(Path::new("foo/bar.jsx"), Some("text/ecmascript")).0, msg::MediaType::JSX ); assert_eq!( map_content_type( Path::new("foo/bar.jsx"), Some("application/x-javascript"), - ), + ) + .0, msg::MediaType::JSX ); } #[test] fn test_filter_shebang() { - assert_eq!(filter_shebang(b"#!"[..].to_owned()), b""); - assert_eq!(filter_shebang(b"#!\n\n"[..].to_owned()), b"\n\n"); - let code = b"#!/usr/bin/env deno\nconsole.log('hello');\n"[..].to_owned(); + assert_eq!(filter_shebang("#!"), b""); + assert_eq!(filter_shebang("#!\n\n"), b"\n\n"); + let code = "#!/usr/bin/env deno\nconsole.log('hello');\n"; assert_eq!(filter_shebang(code), b"\nconsole.log('hello');\n"); } @@ -1771,7 +1895,7 @@ mod tests { .await; assert!(source.is_ok()); let source = source.unwrap(); - assert_eq!(source.source_code, b"console.log('etag')"); + assert_eq!(source.source_code.bytes, b"console.log('etag')"); assert_eq!(&(source.media_type), &msg::MediaType::TypeScript); let (_, headers) = fetcher.http_cache.get(&module_url).unwrap(); @@ -1798,7 +1922,7 @@ mod tests { ) .await .unwrap(); - assert_eq!(cached_source.source_code, b"changed content"); + assert_eq!(cached_source.source_code.bytes, b"changed content"); let modified2 = metadata_path.metadata().unwrap().modified().unwrap(); @@ -1825,7 +1949,7 @@ mod tests { .await; assert!(source.is_ok()); let source = source.unwrap(); - assert_eq!(source.source_code, b"export const foo = 'foo';"); + assert_eq!(source.source_code.bytes, b"export const foo = 'foo';"); assert_eq!(&(source.media_type), &msg::MediaType::JavaScript); assert_eq!( source.types_header, @@ -1833,4 +1957,80 @@ mod tests { ); drop(http_server_guard); } + + #[tokio::test] + async fn test_fetch_source_file_from_net_utf16_le() { + let content = + std::str::from_utf8(b"\xEF\xBB\xBFconsole.log(\"Hello World\");\x0A") + .unwrap(); + test_fetch_non_utf8_source_file_from_net( + "utf-16le", + "utf-16le.ts", + content, + ) + .await; + } + + #[tokio::test] + async fn test_fetch_source_file_from_net_utf16_be() { + let content = + std::str::from_utf8(b"\xEF\xBB\xBFconsole.log(\"Hello World\");\x0A") + .unwrap(); + test_fetch_non_utf8_source_file_from_net( + "utf-16be", + "utf-16be.ts", + content, + ) + .await; + } + + #[tokio::test] + async fn test_fetch_source_file_from_net_windows_1255() { + let content = "console.log(\"\u{5E9}\u{5DC}\u{5D5}\u{5DD} \ + \u{5E2}\u{5D5}\u{5DC}\u{5DD}\");\u{A}"; + test_fetch_non_utf8_source_file_from_net( + "windows-1255", + "windows-1255", + content, + ) + .await; + } + + async fn test_fetch_non_utf8_source_file_from_net( + charset: &str, + file_name: &str, + expected_content: &str, + ) { + let http_server_guard = test_util::http_server(); + let (_temp_dir, fetcher) = test_setup(); + let module_url = Url::parse(&format!( + "http://127.0.0.1:4545/cli/tests/encoding/{}", + file_name + )) + .unwrap(); + + let source = fetcher + .fetch_remote_source( + &module_url, + false, + false, + 1, + &Permissions::allow_all(), + ) + .await; + assert!(source.is_ok()); + let source = source.unwrap(); + assert_eq!(&source.source_code.charset.to_lowercase()[..], charset); + let text = &source.source_code.to_str().unwrap(); + assert_eq!(text, expected_content); + assert_eq!(&(source.media_type), &msg::MediaType::TypeScript); + + let (_, headers) = fetcher.http_cache.get(&module_url).unwrap(); + assert_eq!( + headers.get("content-type").unwrap(), + &format!("application/typescript;charset={}", charset) + ); + + drop(http_server_guard); + } } diff --git a/cli/fmt.rs b/cli/fmt.rs index 70bc0e8bc1..319f7fece7 100644 --- a/cli/fmt.rs +++ b/cli/fmt.rs @@ -11,6 +11,7 @@ use crate::colors; use crate::diff::diff; use crate::fs::files_in_subtree; use crate::op_error::OpError; +use crate::text_encoding; use deno_core::ErrBox; use dprint_plugin_typescript as dprint; use std::fs; @@ -247,13 +248,15 @@ struct FileContents { } fn read_file_contents(file_path: &PathBuf) -> Result { - let file_text = fs::read_to_string(&file_path)?; + let file_bytes = fs::read(&file_path)?; + let charset = text_encoding::detect_charset(&file_bytes); + let file_text = text_encoding::convert_to_utf8(&file_bytes, charset)?; let had_bom = file_text.starts_with(BOM_CHAR); let text = if had_bom { // remove the BOM String::from(&file_text[BOM_CHAR.len_utf8()..]) } else { - file_text + String::from(file_text) }; Ok(FileContents { text, had_bom }) diff --git a/cli/global_state.rs b/cli/global_state.rs index a26fc453e0..a723bdd2fe 100644 --- a/cli/global_state.rs +++ b/cli/global_state.rs @@ -250,7 +250,7 @@ impl GlobalState { } } else { CompiledModule { - code: String::from_utf8(out.source_code.clone())?, + code: out.source_code.to_string()?, name: out.url.to_string(), } }; diff --git a/cli/main.rs b/cli/main.rs index cff401fbac..191355a0cd 100644 --- a/cli/main.rs +++ b/cli/main.rs @@ -11,6 +11,7 @@ extern crate futures; extern crate serde_json; extern crate clap; extern crate deno_core; +extern crate encoding_rs; extern crate indexmap; #[cfg(unix)] extern crate nix; @@ -60,6 +61,7 @@ mod startup_data; pub mod state; mod swc_util; mod test_runner; +mod text_encoding; mod tokio_util; mod tsc; mod upgrade; @@ -70,6 +72,7 @@ pub mod worker; use crate::doc::parser::DocFileLoader; use crate::file_fetcher::SourceFile; use crate::file_fetcher::SourceFileFetcher; +use crate::file_fetcher::TextDocument; use crate::fs as deno_fs; use crate::global_state::GlobalState; use crate::msg::MediaType; @@ -412,7 +415,7 @@ async fn eval_command( } else { MediaType::JavaScript }, - source_code, + source_code: TextDocument::new(source_code, Some("utf-8")), }; // Save our fake file into file fetcher cache // to allow module access by TS compiler (e.g. op_fetch_source_files) @@ -525,8 +528,7 @@ async fn doc_command( let source_file = fetcher .fetch_source_file(&specifier, None, Permissions::allow_all()) .await?; - String::from_utf8(source_file.source_code) - .map_err(|_| OpError::other("failed to parse".to_string())) + source_file.source_code.to_string().map_err(OpError::from) } .boxed_local() } @@ -601,7 +603,7 @@ async fn run_command(flags: Flags, script: String) -> Result<(), ErrBox> { url: main_module_url, types_header: None, media_type: MediaType::TypeScript, - source_code: source, + source_code: source.into(), }; // Save our fake file into file fetcher cache // to allow module access by TS compiler (e.g. op_fetch_source_files) @@ -657,7 +659,10 @@ async fn test_command( url: test_file_url, types_header: None, media_type: MediaType::TypeScript, - source_code: test_file.clone().into_bytes(), + source_code: TextDocument::new( + test_file.clone().into_bytes(), + Some("utf-8"), + ), }; // Save our fake file into file fetcher cache // to allow module access by TS compiler (e.g. op_fetch_source_files) diff --git a/cli/module_graph.rs b/cli/module_graph.rs index 3fb1379f30..8b7a52906f 100644 --- a/cli/module_graph.rs +++ b/cli/module_graph.rs @@ -458,7 +458,7 @@ impl ModuleGraphLoader { redirect: Some(source_file.url.to_string()), filename: source_file.filename.to_str().unwrap().to_string(), version_hash: checksum::gen(&[ - &source_file.source_code, + &source_file.source_code.as_bytes(), version::DENO.as_bytes(), ]), media_type: source_file.media_type, @@ -473,9 +473,11 @@ impl ModuleGraphLoader { } let module_specifier = ModuleSpecifier::from(source_file.url.clone()); - let version_hash = - checksum::gen(&[&source_file.source_code, version::DENO.as_bytes()]); - let source_code = String::from_utf8(source_file.source_code)?; + let version_hash = checksum::gen(&[ + &source_file.source_code.as_bytes(), + version::DENO.as_bytes(), + ]); + let source_code = source_file.source_code.to_string()?; if SUPPORTED_MEDIA_TYPES.contains(&source_file.media_type) { if let Some(types_specifier) = source_file.types_header { diff --git a/cli/tests/encoding/utf-16be.ts b/cli/tests/encoding/utf-16be.ts new file mode 100644 index 0000000000000000000000000000000000000000..3d0144d7c0d381e1b4b900a0864bbd23604e2c05 GIT binary patch literal 58 zcmezOpCOqcpCON-7)a(Yq%!CM$$W-%1`P%!1`i-V2h3Ms2nUK50of@)aZLtm1}*@R Cj|*1- literal 0 HcmV?d00001 diff --git a/cli/tests/encoding/utf-16le.ts b/cli/tests/encoding/utf-16le.ts new file mode 100644 index 0000000000000000000000000000000000000000..6f0e415f2cb933f4781d53b20dc5250fcf4d0809 GIT binary patch literal 58 zcmezWFPR~qA&;RLNair4GUx%xe1>!e4F)9!4 &'static str { + const UTF16_LE_BOM: &[u8] = b"\xFF\xFE"; + const UTF16_BE_BOM: &[u8] = b"\xFE\xFF"; + + if bytes.starts_with(UTF16_LE_BOM) { + "utf-16le" + } else if bytes.starts_with(UTF16_BE_BOM) { + "utf-16be" + } else { + // Assume everything else is utf-8 + "utf-8" + } +} + +/// Attempts to convert the provided bytes to a UTF-8 string. +/// +/// Supports all encodings supported by the encoding_rs crate, which includes +/// all encodings specified in the WHATWG Encoding Standard, and only those +/// encodings (see: https://encoding.spec.whatwg.org/). +pub fn convert_to_utf8<'a>( + bytes: &'a [u8], + charset: &'_ str, +) -> Result, Error> { + match Encoding::for_label(charset.as_bytes()) { + Some(encoding) => encoding + .decode_without_bom_handling_and_without_replacement(bytes) + .ok_or_else(|| ErrorKind::InvalidData.into()), + None => Err(Error::new( + ErrorKind::InvalidInput, + format!("Unsupported charset: {}", charset), + )), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn test_detection(test_data: &[u8], expected_charset: &str) { + let detected_charset = detect_charset(test_data); + assert_eq!( + expected_charset.to_lowercase(), + detected_charset.to_lowercase() + ); + } + + #[test] + fn test_detection_utf8_no_bom() { + let test_data = "Hello UTF-8 it is \u{23F0} for Deno!" + .to_owned() + .into_bytes(); + test_detection(&test_data, "utf-8"); + } + + #[test] + fn test_detection_utf16_little_endian() { + let test_data = b"\xFF\xFEHello UTF-16LE".to_owned().to_vec(); + test_detection(&test_data, "utf-16le"); + } + + #[test] + fn test_detection_utf16_big_endian() { + let test_data = b"\xFE\xFFHello UTF-16BE".to_owned().to_vec(); + test_detection(&test_data, "utf-16be"); + } + + #[test] + fn test_decoding_unsupported_charset() { + let test_data = Vec::new(); + let result = convert_to_utf8(&test_data, "utf-32le"); + assert!(result.is_err()); + let err = result.expect_err("Err expected"); + assert!(err.kind() == ErrorKind::InvalidInput); + } + + #[test] + fn test_decoding_invalid_utf8() { + let test_data = b"\xFE\xFE\xFF\xFF".to_vec(); + let result = convert_to_utf8(&test_data, "utf-8"); + assert!(result.is_err()); + let err = result.expect_err("Err expected"); + assert!(err.kind() == ErrorKind::InvalidData); + } +} diff --git a/cli/tsc.rs b/cli/tsc.rs index fb25df8d5e..41128948b5 100644 --- a/cli/tsc.rs +++ b/cli/tsc.rs @@ -471,7 +471,7 @@ impl TsCompiler { if let Some(metadata) = self.get_metadata(&url) { // Compare version hashes let version_hash_to_validate = source_code_version_hash( - &source_file.source_code, + &source_file.source_code.as_bytes(), version::DENO, &self.config.hash, ); @@ -512,7 +512,7 @@ impl TsCompiler { .fetch_cached_source_file(&specifier, Permissions::allow_all()) { let existing_hash = crate::checksum::gen(&[ - &source_file.source_code, + &source_file.source_code.as_bytes(), version::DENO.as_bytes(), ]); let expected_hash = @@ -851,9 +851,7 @@ impl TsCompiler { let compiled_source_file = self.get_compiled_source_file(module_url)?; let compiled_module = CompiledModule { - code: str::from_utf8(&compiled_source_file.source_code) - .unwrap() - .to_string(), + code: compiled_source_file.source_code.to_string()?, name: module_url.to_string(), }; @@ -861,8 +859,8 @@ impl TsCompiler { } /// Return compiled JS file for given TS module. - // TODO: ideally we shouldn't construct SourceFile by hand, but it should be delegated to - // SourceFileFetcher + // TODO: ideally we shouldn't construct SourceFile by hand, but it should be + // delegated to SourceFileFetcher. pub fn get_compiled_source_file( &self, module_url: &Url, @@ -878,7 +876,7 @@ impl TsCompiler { url: module_url.clone(), filename: compiled_code_filename, media_type: msg::MediaType::JavaScript, - source_code: compiled_code, + source_code: compiled_code.into(), types_header: None, }; @@ -902,7 +900,7 @@ impl TsCompiler { self.mark_compiled(module_specifier.as_url()); let version_hash = source_code_version_hash( - &source_file.source_code, + &source_file.source_code.as_bytes(), version::DENO, &self.config.hash, ); @@ -935,7 +933,7 @@ impl TsCompiler { url: module_specifier.as_url().to_owned(), filename: source_map_filename, media_type: msg::MediaType::JavaScript, - source_code, + source_code: source_code.into(), types_header: None, }; @@ -981,7 +979,7 @@ impl SourceMapGetter for TsCompiler { self .try_resolve_and_get_source_file(script_name) .and_then(|out| { - str::from_utf8(&out.source_code).ok().map(|v| { + out.source_code.to_str().ok().map(|v| { // Do NOT use .lines(): it skips the terminating empty line. // (due to internally using .split_terminator() instead of .split()) let lines: Vec<&str> = v.split('\n').collect(); @@ -1020,7 +1018,7 @@ impl TsCompiler { ) -> Option> { if let Some(module_specifier) = self.try_to_resolve(script_name) { return match self.get_source_map_file(&module_specifier) { - Ok(out) => Some(out.source_code), + Ok(out) => Some(out.source_code.into_bytes()), Err(_) => { // Check if map is inlined if let Ok(compiled_source) = @@ -1566,7 +1564,7 @@ mod tests { url: specifier.as_url().clone(), filename: PathBuf::from(p.to_str().unwrap().to_string()), media_type: msg::MediaType::TypeScript, - source_code: include_bytes!("./tests/002_hello.ts").to_vec(), + source_code: include_bytes!("./tests/002_hello.ts").to_vec().into(), types_header: None, }; let dir = @@ -1642,7 +1640,7 @@ mod tests { url: specifier.as_url().clone(), filename: PathBuf::from(p.to_str().unwrap().to_string()), media_type: msg::MediaType::TypeScript, - source_code: include_bytes!("./tests/002_hello.ts").to_vec(), + source_code: include_bytes!("./tests/002_hello.ts").to_vec().into(), types_header: None, }; let dir = diff --git a/test_util/src/lib.rs b/test_util/src/lib.rs index 7c754e02d1..a6fb749144 100644 --- a/test_util/src/lib.rs +++ b/test_util/src/lib.rs @@ -430,6 +430,19 @@ fn custom_headers(path: warp::path::Peek, f: warp::fs::File) -> Box { let f = with_header(f, "Content-Length", "39"); return Box::new(f); } + if p.contains("cli/tests/encoding/") { + let charset = p + .split_terminator('/') + .last() + .unwrap() + .trim_end_matches(".ts"); + let f = with_header( + f, + "Content-Type", + &format!("application/typescript;charset={}", charset)[..], + ); + return Box::new(f); + } let content_type = if p.contains(".t1.") { Some("text/typescript") diff --git a/tools/lint.py b/tools/lint.py index fc9a73c90b..5891ba9f9a 100755 --- a/tools/lint.py +++ b/tools/lint.py @@ -69,6 +69,7 @@ def eslint(): ":!:cli/compilers/wasm_wrap.js", ":!:cli/tests/error_syntax.js", ":!:cli/tests/lint/**", + ":!:cli/tests/encoding/**", ":!:cli/dts/**", ":!:cli/tsc/*typescript.js", ])