| /* |
| * The Original Code is Mozilla Universal charset detector code. |
| * |
| * The Initial Developer of the Original Code is |
| * Netscape Communications Corporation. |
| * Portions created by the Initial Developer are Copyright (C) 2001 |
| * the Initial Developer. All Rights Reserved. |
| * |
| * Contributor(s): |
| * António Afonso (antonio.afonso gmail.com) - port to JavaScript |
| * Mark Pilgrim - port to Python |
| * Shy Shalom - original C code |
| * |
| * This library is free software; you can redistribute it and/or |
| * modify it under the terms of the GNU Lesser General Public |
| * License as published by the Free Software Foundation; either |
| * version 2.1 of the License, or (at your option) any later version. |
| * |
| * This library is distributed in the hope that it will be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| * Lesser General Public License for more details. |
| * |
| * You should have received a copy of the GNU Lesser General Public |
| * License along with this library; if not, write to the Free Software |
| * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA |
| * 02110-1301 USA |
| */ |
| |
| /** |
| * This is a port from the python port, version "2.0.1" |
| */ |
| |
| var constants = require('./constants'); |
| var MBCSGroupProber = require('./mbcsgroupprober'); |
| var SBCSGroupProber = require('./sbcsgroupprober'); |
| var Latin1Prober = require('./latin1prober'); |
| var EscCharSetProber = require('./escprober'); |
| var logger = require('./logger'); |
| |
| const supportedEncodings = (function() { |
| const BOM_UTF = [ |
| "UTF-8", "UTF-32LE", "UTF-32BE", "UTF-32BE", "UTF-16LE", "UTF-16BE", |
| "X-ISO-10646-UCS-4-3412", "X-ISO-10646-UCS-4-2143" |
| ] |
| const probers = [ |
| new EscCharSetProber(), |
| new MBCSGroupProber(), |
| new SBCSGroupProber(), |
| new Latin1Prober() |
| ]; |
| const encodings = BOM_UTF.slice(0); |
| for (const prober of probers) { |
| [].push.apply(encodings, prober.getSupportedCharsetNames()); |
| } |
| return encodings; |
| })(); |
| |
| const supportedEncodingsDenormalized = (function() { |
| const denormalizedEncodings = []; |
| for (const encoding of supportedEncodings) { |
| denormalizedEncodings.push( |
| encoding.toLocaleLowerCase(), |
| encoding.toLocaleLowerCase().replace(/-/g, "") |
| ); |
| } |
| return denormalizedEncodings; |
| })(); |
| |
| function UniversalDetector(options) { |
| if (!options) options = {}; |
| |
| if (typeof options.minimumThreshold !== "number") { |
| if (options.detectEncodings) { |
| // If encodings are narrowed down by the user allow for |
| // any threshold to be returned. |
| options.minimumThreshold = 0; |
| } else { |
| options.minimumThreshold = 0.20; |
| } |
| } |
| |
| if (options.detectEncodings) { |
| for (const encoding of options.detectEncodings) { |
| if (!supportedEncodingsDenormalized.includes(encoding.toLowerCase())) { |
| throw new Error(`Encoding ${encoding} is not supported. Supported encodings: ${supportedEncodings}.`); |
| } |
| } |
| } |
| |
| var _state = { |
| pureAscii : 0, |
| escAscii : 1, |
| highbyte : 2 |
| }; |
| var self = this; |
| |
| function init() { |
| self._highBitDetector = /[\x80-\xFF]/; |
| self._escDetector = /(\x1B|~\{)/; |
| self._mEscCharsetProber = null; |
| self._mCharsetProbers = []; |
| self.reset(); |
| } |
| |
| function canDetectEncoding(encoding) { |
| if (!options.detectEncodings) { |
| return true; |
| } |
| var lowerDetectedEncodings = options.detectEncodings.map(encoding => encoding.toLowerCase()); |
| return lowerDetectedEncodings.includes(encoding.toLowerCase()); |
| } |
| |
| this.reset = function() { |
| this.result = {"encoding": null, "confidence": 0.0}; |
| this.results = [] |
| this.done = false; |
| this._mStart = true; |
| this._mGotData = false; |
| this._mInputState = _state.pureAscii; |
| this._mLastChar = []; |
| this._mBOM = ""; |
| if( this._mEscCharsetProber ) { |
| this._mEscCharsetProber.reset(); |
| } |
| for( var i = 0, prober; prober = this._mCharsetProbers[i]; i++ ) { |
| prober.reset(); |
| } |
| } |
| |
| this.feed = function(aBuf) { |
| if( this.done ) return; |
| |
| var aLen = aBuf.length; |
| if( !aLen ) return; |
| |
| if( !this._mGotData ) { |
| this._mBOM += aBuf; |
| // If the data starts with BOM, we know it is UTF |
| if( this._mBOM.slice(0,3) == "\xEF\xBB\xBF" && canDetectEncoding("UTF-8")) { |
| // EF BB BF UTF-8 with BOM |
| this.result = {"encoding": "UTF-8", "confidence": 1.0}; |
| } else if( this._mBOM.slice(0,4) == "\xFF\xFE\x00\x00" && canDetectEncoding("UTF-32LE") ) { |
| // FF FE 00 00 UTF-32, little-endian BOM |
| this.result = {"encoding": "UTF-32LE", "confidence": 1.0}; |
| } else if( this._mBOM.slice(0,4) == "\x00\x00\xFE\xFF" && canDetectEncoding("UTF-32BE")) { |
| // 00 00 FE FF UTF-32, big-endian BOM |
| this.result = {"encoding": "UTF-32BE", "confidence": 1.0}; |
| } else if( this._mBOM.slice(0,4) == "\xFE\xFF\x00\x00" && canDetectEncoding("X-ISO-10646-UCS-4-3412")) { |
| // FE FF 00 00 UCS-4, unusual octet order BOM (3412) |
| this.result = {"encoding": "X-ISO-10646-UCS-4-3412", "confidence": 1.0}; |
| } else if( this._mBOM.slice(0,4) == "\x00\x00\xFF\xFE" && canDetectEncoding("X-ISO-10646-UCS-4-2143")) { |
| // 00 00 FF FE UCS-4, unusual octet order BOM (2143) |
| this.result = {"encoding": "X-ISO-10646-UCS-4-2143", "confidence": 1.0}; |
| } else if( this._mBOM.slice(0,2) == "\xFF\xFE" && canDetectEncoding("UTF-16LE")) { |
| // FF FE UTF-16, little endian BOM |
| this.result = {"encoding": "UTF-16LE", "confidence": 1.0}; |
| } else if( this._mBOM.slice(0,2) == "\xFE\xFF" && canDetectEncoding("UTF-16BE")) { |
| // FE FF UTF-16, big endian BOM |
| this.result = {"encoding": "UTF-16BE", "confidence": 1.0}; |
| } |
| |
| if (this.result.confidence > 0) { |
| this.results = [this.result]; |
| } |
| |
| // If we got to 4 chars without being able to detect a BOM we |
| // stop trying. |
| if( this._mBOM.length > 3 ) { |
| this._mGotData = true; |
| } |
| } |
| |
| if( this.result.encoding && (this.result.confidence > 0.0) ) { |
| this.done = true; |
| return; |
| } |
| |
| if( this._mInputState == _state.pureAscii ) { |
| if( this._highBitDetector.test(aBuf) ) { |
| this._mInputState = _state.highbyte; |
| } else if( this._escDetector.test(this._mLastChar.join('') + aBuf) ) { |
| this._mInputState = _state.escAscii; |
| } |
| } |
| |
| this._mLastChar = aBuf.slice(-1).split(''); |
| |
| if( this._mInputState == _state.escAscii ) { |
| if( !this._mEscCharsetProber ) { |
| this._mEscCharsetProber = new EscCharSetProber(); |
| } |
| if( this._mEscCharsetProber.feed(aBuf) == constants.foundIt && canDetectEncoding(this._mEscCharsetProber.getCharsetName()) ) { |
| this.result = { |
| "encoding": this._mEscCharsetProber.getCharsetName(), |
| "confidence": this._mEscCharsetProber.getConfidence() |
| }; |
| this.results = [this.result]; |
| this.done = true; |
| } |
| } else if( this._mInputState == _state.highbyte ) { |
| if( this._mCharsetProbers.length == 0 ) { |
| this._mCharsetProbers = [ |
| new MBCSGroupProber(), |
| new SBCSGroupProber(), |
| new Latin1Prober() |
| ]; |
| } |
| for( var i = 0, prober; prober = this._mCharsetProbers[i]; i++ ) { |
| if( prober.feed(aBuf) == constants.foundIt && canDetectEncoding(prober.getCharsetName()) ) { |
| this.result = { |
| "encoding": prober.getCharsetName(), |
| "confidence": prober.getConfidence() |
| }; |
| this.results = [this.result]; |
| this.done = true; |
| break; |
| } |
| } |
| } |
| } |
| |
| this.close = function() { |
| if( this.done ) return; |
| if( this._mBOM.length === 0 ) { |
| logger.log("no data received!\n"); |
| return; |
| } |
| this.done = true; |
| |
| if( this._mInputState == _state.pureAscii && canDetectEncoding("ascii") ) { |
| logger.log("pure ascii") |
| this.result = {"encoding": "ascii", "confidence": 1.0}; |
| this.results.push(this.result); |
| return this.result; |
| } |
| |
| if (this._mInputState == _state.highbyte) { |
| let windows_1252_confidence = 0; |
| let windows_1250_detected = false; |
| for (var i = 0, prober; prober = this._mCharsetProbers[i]; i++) { |
| if (!prober) continue; |
| const charsetName = prober.getCharsetName(); |
| const confidence = prober.getConfidence(); |
| if (prober.getCharsetName() === "windows-1252") { |
| windows_1252_confidence = confidence; |
| } |
| if (!charsetName || !canDetectEncoding(charsetName)) continue; |
| this.results.push({ |
| "encoding": prober.getCharsetName(), |
| "confidence": confidence |
| }); |
| if (prober.getCharsetName() === "windows-1250") { |
| windows_1250_detected = true; |
| } |
| logger.log(prober.getCharsetName() + " confidence " + confidence); |
| } |
| // HACK: When windows-1252 is detected it's almost sure that it can |
| // also be windows-1250. |
| // https://en.wikipedia.org/wiki/Windows-1250 (Central European) |
| if (windows_1252_confidence && !windows_1250_detected && canDetectEncoding("windows-1250")) { |
| this.results.push({ |
| "encoding": "windows-1250", |
| // Report the confidence just a bit under windows-1252's. |
| "confidence": windows_1252_confidence - Math.pow(5/10, (String(windows_1252_confidence).length - 1)), |
| }); |
| } |
| this.results.sort(function(a, b) { |
| return b.confidence - a.confidence; |
| }); |
| if (this.results.length > 0) { |
| var topResult = this.results[0]; |
| if (topResult.confidence >= options.minimumThreshold) { |
| this.result = topResult; |
| return topResult; |
| } |
| } |
| } |
| |
| if( logger.enabled ) { |
| logger.log("no probers hit minimum threshhold\n"); |
| for( var i = 0, prober; prober = this._mCharsetProbers[i]; i++ ) { |
| if( !prober || !canDetectEncoding(prober.getCharsetName()) ) continue; |
| logger.log(prober.getCharsetName() + " confidence = " + |
| prober.getConfidence() + "\n"); |
| } |
| } |
| } |
| |
| init(); |
| } |
| |
| module.exports = UniversalDetector; |