From 45f4f5fc58eb09a9a01c546810fd3230aa6a9c51 Mon Sep 17 00:00:00 2001 From: sigoden Date: Wed, 1 Mar 2023 09:36:59 +0800 Subject: [PATCH] feat: guess plain text encoding then set content-type charset (#186) --- Cargo.lock | 12 ++++++++++++ Cargo.toml | 1 + src/server.rs | 43 +++++++++++++++++++++++++++++++++++-------- tests/fixtures.rs | 33 +++++++++++++++++++++++++-------- tests/http.rs | 40 ++++++++++++++++++++++++++++++++++++++-- 5 files changed, 111 insertions(+), 18 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 456285a..49fb386 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -204,6 +204,17 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "chardetng" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14b8f0b65b7b08ae3c8187e8d77174de20cb6777864c6b832d8ad365999cf1ea" +dependencies = [ + "cfg-if", + "encoding_rs", + "memchr", +] + [[package]] name = "chrono" version = "0.4.23" @@ -425,6 +436,7 @@ dependencies = [ "async-stream", "async_zip", "base64 0.21.0", + "chardetng", "chrono", "clap", "clap_complete", diff --git a/Cargo.toml b/Cargo.toml index 068ecd9..8a7a68c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -42,6 +42,7 @@ form_urlencoded = "1.0" alphanumeric-sort = "1.4" content_inspector = "0.2" anyhow = "1.0" +chardetng = "0.1" [features] default = ["tls"] diff --git a/src/server.rs b/src/server.rs index feb2f3d..d026d8c 100644 --- a/src/server.rs +++ b/src/server.rs @@ -638,14 +638,10 @@ impl Server { None }; - if let Some(mime) = mime_guess::from_path(path).first() { - res.headers_mut().typed_insert(ContentType::from(mime)); - } else { - res.headers_mut().insert( - CONTENT_TYPE, - HeaderValue::from_static("application/octet-stream"), - ); - } + res.headers_mut().insert( + CONTENT_TYPE, + HeaderValue::from_str(&get_content_type(path).await?)?, + ); let filename = 
try_get_file_name(path)?; res.headers_mut().insert( @@ -1382,3 +1378,34 @@ fn set_webdav_headers(res: &mut Response) { res.headers_mut() .insert("DAV", HeaderValue::from_static("1,2")); } + +async fn get_content_type(path: &Path) -> Result<String> { + let mut buffer: Vec<u8> = vec![]; + fs::File::open(path) + .await? + .take(1024) + .read_to_end(&mut buffer) + .await?; + let mime = mime_guess::from_path(path).first(); + let is_text = content_inspector::inspect(&buffer).is_text(); + let content_type = if is_text { + let mut detector = chardetng::EncodingDetector::new(); + detector.feed(&buffer, buffer.len() < 1024); + let (enc, confident) = detector.guess_assess(None, true); + let charset = if confident { + format!("; charset={}", enc.name()) + } else { + "".into() + }; + match mime { + Some(m) => format!("{m}{charset}"), + None => format!("text/plain{charset}"), + } + } else { + match mime { + Some(m) => m.to_string(), + None => "application/octet-stream".into(), + } + }; + Ok(content_type) +} diff --git a/tests/fixtures.rs b/tests/fixtures.rs index b785670..b855ba6 100644 --- a/tests/fixtures.rs +++ b/tests/fixtures.rs @@ -46,15 +46,12 @@ pub fn tmpdir() -> TempDir { let tmpdir = assert_fs::TempDir::new().expect("Couldn't create a temp dir for tests"); for file in FILES { if *file == BIN_FILE { - tmpdir - .child(file) - .write_binary(b"bin\0\0123") - .expect("Couldn't write to file"); + tmpdir.child(file).write_binary(b"bin\0\0123").unwrap(); } else { tmpdir .child(file) .write_str(&format!("This is {file}")) - .expect("Couldn't write to file"); + .unwrap(); } } for directory in DIRECTORIES { @@ -62,7 +59,7 @@ pub fn tmpdir() -> TempDir { tmpdir .child(format!("{}{}", directory, "index.html")) .write_str("__ASSERTS_PREFIX__index.js;DATA = __INDEX_DATA__") - .expect("Couldn't write to file"); + .unwrap(); } else { for file in FILES { if *directory == DIR_NO_INDEX && *file == "index.html" { @@ -72,17 +69,37 @@ pub fn tmpdir() -> TempDir { tmpdir 
.child(format!("{directory}{file}")) .write_binary(b"bin\0\0123") - .expect("Couldn't write to file"); + .unwrap(); } else { tmpdir .child(format!("{directory}{file}")) .write_str(&format!("This is {directory}{file}")) - .expect("Couldn't write to file"); + .unwrap(); } } } } tmpdir.child("dir4/hidden").touch().unwrap(); + tmpdir + .child("content-types/bin.tar") + .write_binary(b"\x7f\x45\x4c\x46\x02\x01\x00\x00") + .unwrap(); + tmpdir + .child("content-types/bin") + .write_binary(b"\x7f\x45\x4c\x46\x02\x01\x00\x00") + .unwrap(); + tmpdir + .child("content-types/file-utf8.txt") + .write_str("世界") + .unwrap(); + tmpdir + .child("content-types/file-gbk.txt") + .write_binary(b"\xca\xc0\xbd\xe7") + .unwrap(); + tmpdir + .child("content-types/file") + .write_str("世界") + .unwrap(); tmpdir } diff --git a/tests/http.rs b/tests/http.rs index ee9ff6b..6ae7790 100644 --- a/tests/http.rs +++ b/tests/http.rs @@ -148,7 +148,10 @@ fn empty_search(#[with(&["-A"])] server: TestServer) -> Result<(), Error> { fn get_file(server: TestServer) -> Result<(), Error> { let resp = reqwest::blocking::get(format!("{}index.html", server.url()))?; assert_eq!(resp.status(), 200); - assert_eq!(resp.headers().get("content-type").unwrap(), "text/html"); + assert_eq!( + resp.headers().get("content-type").unwrap(), + "text/html; charset=UTF-8" + ); assert_eq!(resp.headers().get("accept-ranges").unwrap(), "bytes"); assert!(resp.headers().contains_key("etag")); assert!(resp.headers().contains_key("last-modified")); @@ -161,7 +164,10 @@ fn get_file(server: TestServer) -> Result<(), Error> { fn head_file(server: TestServer) -> Result<(), Error> { let resp = fetch!(b"HEAD", format!("{}index.html", server.url())).send()?; assert_eq!(resp.status(), 200); - assert_eq!(resp.headers().get("content-type").unwrap(), "text/html"); + assert_eq!( + resp.headers().get("content-type").unwrap(), + "text/html; charset=UTF-8" + ); assert_eq!(resp.headers().get("accept-ranges").unwrap(), "bytes"); 
assert!(resp.headers().contains_key("content-disposition")); assert!(resp.headers().contains_key("etag")); @@ -259,3 +265,33 @@ fn delete_file_404(#[with(&["-A"])] server: TestServer) -> Result<(), Error> { assert_eq!(resp.status(), 404); Ok(()) } + +#[rstest] +fn get_file_content_type(server: TestServer) -> Result<(), Error> { + let resp = reqwest::blocking::get(format!("{}content-types/bin.tar", server.url()))?; + assert_eq!( + resp.headers().get("content-type").unwrap(), + "application/x-tar" + ); + let resp = reqwest::blocking::get(format!("{}content-types/bin", server.url()))?; + assert_eq!( + resp.headers().get("content-type").unwrap(), + "application/octet-stream" + ); + let resp = reqwest::blocking::get(format!("{}content-types/file-utf8.txt", server.url()))?; + assert_eq!( + resp.headers().get("content-type").unwrap(), + "text/plain; charset=UTF-8" + ); + let resp = reqwest::blocking::get(format!("{}content-types/file-gbk.txt", server.url()))?; + assert_eq!( + resp.headers().get("content-type").unwrap(), + "text/plain; charset=GBK" + ); + let resp = reqwest::blocking::get(format!("{}content-types/file", server.url()))?; + assert_eq!( + resp.headers().get("content-type").unwrap(), + "text/plain; charset=UTF-8" + ); + Ok(()) +}