feat: guess plain text encoding then set content-type charset (#186)

This commit is contained in:
sigoden 2023-03-01 09:36:59 +08:00 committed by GitHub
parent 6dcb4dcd76
commit 45f4f5fc58
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 111 additions and 18 deletions

12
Cargo.lock generated
View file

@ -204,6 +204,17 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "chardetng"
version = "0.1.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14b8f0b65b7b08ae3c8187e8d77174de20cb6777864c6b832d8ad365999cf1ea"
dependencies = [
"cfg-if",
"encoding_rs",
"memchr",
]
[[package]]
name = "chrono"
version = "0.4.23"
@ -425,6 +436,7 @@ dependencies = [
"async-stream",
"async_zip",
"base64 0.21.0",
"chardetng",
"chrono",
"clap",
"clap_complete",

View file

@ -42,6 +42,7 @@ form_urlencoded = "1.0"
alphanumeric-sort = "1.4"
content_inspector = "0.2"
anyhow = "1.0"
chardetng = "0.1"
[features]
default = ["tls"]

View file

@ -638,14 +638,10 @@ impl Server {
None
};
if let Some(mime) = mime_guess::from_path(path).first() {
res.headers_mut().typed_insert(ContentType::from(mime));
} else {
res.headers_mut().insert(
CONTENT_TYPE,
HeaderValue::from_static("application/octet-stream"),
);
}
res.headers_mut().insert(
CONTENT_TYPE,
HeaderValue::from_str(&get_content_type(path).await?)?,
);
let filename = try_get_file_name(path)?;
res.headers_mut().insert(
@ -1382,3 +1378,34 @@ fn set_webdav_headers(res: &mut Response) {
res.headers_mut()
.insert("DAV", HeaderValue::from_static("1,2"));
}
async fn get_content_type(path: &Path) -> Result<String> {
let mut buffer: Vec<u8> = vec![];
fs::File::open(path)
.await?
.take(1024)
.read_to_end(&mut buffer)
.await?;
let mime = mime_guess::from_path(path).first();
let is_text = content_inspector::inspect(&buffer).is_text();
let content_type = if is_text {
let mut detector = chardetng::EncodingDetector::new();
detector.feed(&buffer, buffer.len() < 1024);
let (enc, confident) = detector.guess_assess(None, true);
let charset = if confident {
format!("; charset={}", enc.name())
} else {
"".into()
};
match mime {
Some(m) => format!("{m}{charset}"),
None => format!("text/plain{charset}"),
}
} else {
match mime {
Some(m) => m.to_string(),
None => "application/octet-stream".into(),
}
};
Ok(content_type)
}

View file

@ -46,15 +46,12 @@ pub fn tmpdir() -> TempDir {
let tmpdir = assert_fs::TempDir::new().expect("Couldn't create a temp dir for tests");
for file in FILES {
if *file == BIN_FILE {
tmpdir
.child(file)
.write_binary(b"bin\0\0123")
.expect("Couldn't write to file");
tmpdir.child(file).write_binary(b"bin\0\0123").unwrap();
} else {
tmpdir
.child(file)
.write_str(&format!("This is {file}"))
.expect("Couldn't write to file");
.unwrap();
}
}
for directory in DIRECTORIES {
@ -62,7 +59,7 @@ pub fn tmpdir() -> TempDir {
tmpdir
.child(format!("{}{}", directory, "index.html"))
.write_str("__ASSERTS_PREFIX__index.js;DATA = __INDEX_DATA__")
.expect("Couldn't write to file");
.unwrap();
} else {
for file in FILES {
if *directory == DIR_NO_INDEX && *file == "index.html" {
@ -72,17 +69,37 @@ pub fn tmpdir() -> TempDir {
tmpdir
.child(format!("{directory}{file}"))
.write_binary(b"bin\0\0123")
.expect("Couldn't write to file");
.unwrap();
} else {
tmpdir
.child(format!("{directory}{file}"))
.write_str(&format!("This is {directory}{file}"))
.expect("Couldn't write to file");
.unwrap();
}
}
}
}
tmpdir.child("dir4/hidden").touch().unwrap();
tmpdir
.child("content-types/bin.tar")
.write_binary(b"\x7f\x45\x4c\x46\x02\x01\x00\x00")
.unwrap();
tmpdir
.child("content-types/bin")
.write_binary(b"\x7f\x45\x4c\x46\x02\x01\x00\x00")
.unwrap();
tmpdir
.child("content-types/file-utf8.txt")
.write_str("世界")
.unwrap();
tmpdir
.child("content-types/file-gbk.txt")
.write_binary(b"\xca\xc0\xbd\xe7")
.unwrap();
tmpdir
.child("content-types/file")
.write_str("世界")
.unwrap();
tmpdir
}

View file

@ -148,7 +148,10 @@ fn empty_search(#[with(&["-A"])] server: TestServer) -> Result<(), Error> {
fn get_file(server: TestServer) -> Result<(), Error> {
let resp = reqwest::blocking::get(format!("{}index.html", server.url()))?;
assert_eq!(resp.status(), 200);
assert_eq!(resp.headers().get("content-type").unwrap(), "text/html");
assert_eq!(
resp.headers().get("content-type").unwrap(),
"text/html; charset=UTF-8"
);
assert_eq!(resp.headers().get("accept-ranges").unwrap(), "bytes");
assert!(resp.headers().contains_key("etag"));
assert!(resp.headers().contains_key("last-modified"));
@ -161,7 +164,10 @@ fn get_file(server: TestServer) -> Result<(), Error> {
fn head_file(server: TestServer) -> Result<(), Error> {
let resp = fetch!(b"HEAD", format!("{}index.html", server.url())).send()?;
assert_eq!(resp.status(), 200);
assert_eq!(resp.headers().get("content-type").unwrap(), "text/html");
assert_eq!(
resp.headers().get("content-type").unwrap(),
"text/html; charset=UTF-8"
);
assert_eq!(resp.headers().get("accept-ranges").unwrap(), "bytes");
assert!(resp.headers().contains_key("content-disposition"));
assert!(resp.headers().contains_key("etag"));
@ -259,3 +265,33 @@ fn delete_file_404(#[with(&["-A"])] server: TestServer) -> Result<(), Error> {
assert_eq!(resp.status(), 404);
Ok(())
}
#[rstest]
fn get_file_content_type(server: TestServer) -> Result<(), Error> {
let resp = reqwest::blocking::get(format!("{}content-types/bin.tar", server.url()))?;
assert_eq!(
resp.headers().get("content-type").unwrap(),
"application/x-tar"
);
let resp = reqwest::blocking::get(format!("{}content-types/bin", server.url()))?;
assert_eq!(
resp.headers().get("content-type").unwrap(),
"application/octet-stream"
);
let resp = reqwest::blocking::get(format!("{}content-types/file-utf8.txt", server.url()))?;
assert_eq!(
resp.headers().get("content-type").unwrap(),
"text/plain; charset=UTF-8"
);
let resp = reqwest::blocking::get(format!("{}content-types/file-gbk.txt", server.url()))?;
assert_eq!(
resp.headers().get("content-type").unwrap(),
"text/plain; charset=GBK"
);
let resp = reqwest::blocking::get(format!("{}content-types/file", server.url()))?;
assert_eq!(
resp.headers().get("content-type").unwrap(),
"text/plain; charset=UTF-8"
);
Ok(())
}