From 568ac0c9026b6f4012e2511a026bb5eb31a06020 Mon Sep 17 00:00:00 2001 From: Kitson Kelly Date: Fri, 7 Dec 2018 10:23:29 +1100 Subject: [PATCH] Add ASCII support to TextDecoder --- js/text_encoding.ts | 123 ++++++++++++++++++++++++++++++++++----- js/text_encoding_test.ts | 17 ++++++ 2 files changed, 127 insertions(+), 13 deletions(-) diff --git a/js/text_encoding.ts b/js/text_encoding.ts index 7cd324c726..31b1d8ff62 100644 --- a/js/text_encoding.ts +++ b/js/text_encoding.ts @@ -67,8 +67,12 @@ export function btoa(s: string): string { return result; } +interface DecoderOptions { + fatal?: boolean; +} + interface Decoder { - handler(stream: Stream, byte: number): number | number[] | null; + handler(stream: Stream, byte: number): number | null; } interface Encoder { @@ -79,6 +83,59 @@ const CONTINUE = null; const END_OF_STREAM = -1; const FINISHED = -1; +// The encodingMap is a hash of labels that are indexed by the conical +// encoding. +const encodingMap: { [key: string]: string[] } = { + "windows-1252": [ + "ansi_x3.4-1968", + "ascii", + "cp1252", + "cp819", + "csisolatin1", + "ibm819", + "iso-8859-1", + "iso-ir-100", + "iso8859-1", + "iso88591", + "iso_8859-1", + "iso_8859-1:1987", + "l1", + "latin1", + "us-ascii", + "windows-1252", + "x-cp1252" + ], + "utf-8": ["unicode-1-1-utf-8", "utf-8", "utf8"] +}; +// We convert these into a Map where every label resolves to its canonical +// encoding type. +const encodings = new Map(); +for (const key of Object.keys(encodingMap)) { + const labels = encodingMap[key]; + for (const label of labels) { + encodings.set(label, key); + } +} + +// A map of functions that return new instances of a decoder indexed by the +// encoding type. +const decoders = new Map Decoder>(); +decoders.set("utf-8", (options: DecoderOptions) => { + return new UTF8Decoder(options); +}); + +// Single byte decoders are an array of code point lookups +const encodingIndexes = new Map(); +// tslint:disable:max-line-length +// prettier-ignore +encodingIndexes.set("windows-1252", [8364,129,8218,402,8222,8230,8224,8225,710,8240,352,8249,338,141,381,143,144,8216,8217,8220,8221,8226,8211,8212,732,8482,353,8250,339,157,382,376,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255]); +// tslint:enable +for (const [key, index] of encodingIndexes) { + decoders.set(key, (options: DecoderOptions) => { + return new SingleByteDecoder(index, options); + }); +} + function codePointsToString(codePoints: number[]): string { let s = ""; for (const cp of codePoints) { @@ -98,6 +155,10 @@ function inRange(a: number, min: number, max: number) { return min <= a && a <= max; } +function isASCIIByte(a: number) { + return inRange(a, 0x00, 0x7f); +} + function stringToCodePoints(input: string): number[] { const u: number[] = []; for (const c of input) { @@ -142,6 +203,31 @@ class Stream { } } +class SingleByteDecoder implements Decoder { + private _index: number[]; + private _fatal: boolean; + + constructor(index: number[], options: DecoderOptions) { + this._fatal = options.fatal || false; + this._index = index; + } + handler(stream: Stream, byte: number): number { + if (byte === END_OF_STREAM) { + return FINISHED; + } + if (isASCIIByte(byte)) { + return byte; + } + const codePoint = this._index[byte - 0x80]; + + if (codePoint == null) { + return decoderError(this._fatal); + } + + return codePoint; + } +} + class UTF8Decoder implements Decoder { private _codePoint = 0; private _bytesSeen = 0; @@ -150,8 +236,8 @@ class UTF8Decoder implements Decoder { private _lowerBoundary = 0x80; private _upperBoundary = 0xbf; - constructor(options = { fatal: false }) { - this._fatal = options.fatal; + constructor(options: DecoderOptions) { + this._fatal = options.fatal || false; } handler(stream: Stream, byte: number): number | null { @@ -165,7 +251,7 @@ class UTF8Decoder implements Decoder { } if (this._bytesNeeded === 0) { - if (inRange(byte, 0x00, 0x7f)) { + if (isASCIIByte(byte)) { // Single byte code point return byte; } else if (inRange(byte, 0xc2, 0xdf)) { @@ -272,26 +358,37 @@ export interface TextDecoderOptions { } export class TextDecoder { + private _encoding: string; + /** Returns encoding's name, lowercased. */ - readonly encoding = "utf-8"; + get encoding(): string { + return this._encoding; + } /** Returns `true` if error mode is "fatal", and `false` otherwise. */ readonly fatal: boolean = false; /** Returns `true` if ignore BOM flag is set, and `false` otherwise. */ readonly ignoreBOM = false; - constructor( - label: "utf-8" = "utf-8", - options: TextDecoderOptions = { fatal: false } - ) { - if (label !== "utf-8") { - throw new TypeError("Only UTF8 decoding supported."); - } + constructor(label = "utf-8", options: TextDecoderOptions = { fatal: false }) { if (options.ignoreBOM) { throw new TypeError("Ignoring the BOM not supported."); } if (options.fatal) { this.fatal = true; } + label = String(label) + .trim() + .toLowerCase(); + const encoding = encodings.get(label); + if (!encoding) { + throw new RangeError( + `The encoding label provided ('${label}') is invalid.` + ); + } + if (!decoders.has(encoding)) { + throw new TypeError(`Internal decoder ('${encoding}') not found.`); + } + this._encoding = encoding; } /** Returns the result of running encoding's decoder. */ @@ -316,7 +413,7 @@ export class TextDecoder { bytes = new Uint8Array(0); } - const decoder = new UTF8Decoder({ fatal: this.fatal }); + const decoder = decoders.get(this._encoding)!({ fatal: this.fatal }); const inputStream = new Stream(bytes); const output: number[] = []; diff --git a/js/text_encoding_test.ts b/js/text_encoding_test.ts index 402044e369..efb841fc67 100644 --- a/js/text_encoding_test.ts +++ b/js/text_encoding_test.ts @@ -48,6 +48,23 @@ test(function textDecoder2() { assertEqual(decoder.decode(fixture), "𝓽𝓮𝔁𝓽"); }); +test(function textDecoderASCII() { + const fixture = new Uint8Array([0x89, 0x95, 0x9f, 0xbf]); + const decoder = new TextDecoder("ascii"); + assertEqual(decoder.decode(fixture), "‰•Ÿ¿"); +}); + +test(function textDecoderErrorEncoding() { + let didThrow = false; + try { + const decoder = new TextDecoder("foo"); + } catch (e) { + didThrow = true; + assertEqual(e.message, "The encoding label provided ('foo') is invalid."); + } + assert(didThrow); +}); + test(function textEncoder() { const fixture = "������"; const encoder = new TextEncoder();