// src/universaldetector.js - jschardet - Git at Google

 /*
  * The Original Code is Mozilla Universal charset detector code.
  *
  * The Initial Developer of the Original Code is
  * Netscape Communications Corporation.
  * Portions created by the Initial Developer are Copyright (C) 2001
  * the Initial Developer. All Rights Reserved.
  *
  * Contributor(s):
  *   António Afonso (antonio.afonso gmail.com) - port to JavaScript
  *   Mark Pilgrim - port to Python
  *   Shy Shalom - original C code
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
  * License along with this library; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
  * 02110-1301  USA
  */

 /**
  * This is a port from the python port, version "2.0.1"
  */

 var constants = require('./constants');
 var MBCSGroupProber = require('./mbcsgroupprober');
 var SBCSGroupProber = require('./sbcsgroupprober');
 var Latin1Prober = require('./latin1prober');
 var EscCharSetProber = require('./escprober');
 var logger = require('./logger');

 /**
  * Names of every charset the detector can report: the encodings that are
  * recognised from a leading byte-order mark, followed by whatever each prober
  * family advertises through getSupportedCharsetNames().
  */
 const supportedEncodings = (function() {
     // Encodings identified purely from a BOM. Each name is listed once so the
     // "Supported encodings" error message does not repeat entries.
     const BOM_UTF = [
         "UTF-8", "UTF-32LE", "UTF-32BE", "UTF-16LE", "UTF-16BE",
         "X-ISO-10646-UCS-4-3412", "X-ISO-10646-UCS-4-2143"
     ];
     // Throw-away prober instances, created only to ask for their charset names.
     const probers = [
         new EscCharSetProber(),
         new MBCSGroupProber(),
         new SBCSGroupProber(),
         new Latin1Prober()
     ];
     const encodings = BOM_UTF.slice(0);
     for (const prober of probers) {
         [].push.apply(encodings, prober.getSupportedCharsetNames());
     }
     return encodings;
 })();

 /**
  * Lookup list used to validate user-supplied encoding names. For every
  * supported encoding it contains the lowercased name and the lowercased name
  * with all dashes removed (e.g. "utf-8" and "utf8").
  */
 const supportedEncodingsDenormalized = (function() {
     const denormalizedEncodings = [];
     for (const encoding of supportedEncodings) {
         // Locale-independent lowercasing: the validation code compares against
         // String#toLowerCase(), and toLocaleLowerCase() would turn "I" into a
         // dotless "ı" under a Turkish default locale, making names such as
         // "ISO-8859-1" impossible to match.
         const lowered = encoding.toLowerCase();
         denormalizedEncodings.push(lowered, lowered.replace(/-/g, ""));
     }
     return denormalizedEncodings;
 })();

 /**
  * Incremental charset detector. Feed it data with feed(), call close() when
  * the input is exhausted, then read `result` (best guess) or `results` (all
  * candidates collected).
  *
  * @param {Object} [options]
  * @param {number} [options.minimumThreshold] Minimum confidence the best
  *     candidate needs for close() to report it. Defaults to 0 when
  *     `detectEncodings` is given and to 0.20 otherwise.
  * @param {string[]} [options.detectEncodings] Restricts detection to these
  *     encodings. Names are matched case-insensitively and dashes are ignored,
  *     so "utf8" and "UTF-8" are equivalent.
  * @throws {Error} If `detectEncodings` contains a name that is not supported.
  */
 function UniversalDetector(options) {
     if (!options) options = {};

     // Work on local snapshots of the settings so the caller's options object
     // is never modified (it may be shared between several detectors).
     var detectEncodings = options.detectEncodings;
     var minimumThreshold = options.minimumThreshold;
     if (typeof minimumThreshold !== "number") {
         // If encodings are narrowed down by the user allow for
         // any threshold to be returned.
         minimumThreshold = detectEncodings ? 0 : 0.20;
     }

     // Canonical form used to compare encoding names: lowercase, no dashes.
     function normalizeEncodingName(name) {
         return name.toLowerCase().replace(/-/g, "");
     }

     // Normalized allow-list, built once; null means "no restriction".
     var allowedEncodings = null;
     if (detectEncodings) {
         allowedEncodings = [];
         for (const encoding of detectEncodings) {
             if (!supportedEncodingsDenormalized.includes(encoding.toLowerCase())) {
                 throw new Error(`Encoding ${encoding} is not supported. Supported encodings: ${supportedEncodings}.`);
             }
             allowedEncodings.push(normalizeEncodingName(encoding));
         }
     }

     var _state = {
         pureAscii   : 0,
         escAscii    : 1,
         highbyte    : 2
     };

     // Byte-order marks in the order they must be tested: the 4-byte marks
     // come before the 2-byte UTF-16 marks they start with.
     var bomSignatures = [
         ["\xEF\xBB\xBF",     "UTF-8"],                  // EF BB BF
         ["\xFF\xFE\x00\x00", "UTF-32LE"],               // FF FE 00 00
         ["\x00\x00\xFE\xFF", "UTF-32BE"],               // 00 00 FE FF
         ["\xFE\xFF\x00\x00", "X-ISO-10646-UCS-4-3412"], // unusual octet order (3412)
         ["\x00\x00\xFF\xFE", "X-ISO-10646-UCS-4-2143"], // unusual octet order (2143)
         ["\xFF\xFE",         "UTF-16LE"],               // FF FE
         ["\xFE\xFF",         "UTF-16BE"]                // FE FF
     ];

     // True when `encoding` may be reported given the user's allow-list.
     function canDetectEncoding(encoding) {
         if (!allowedEncodings) {
             return true;
         }
         // A prober without a charset name can never match the allow-list.
         if (!encoding) {
             return false;
         }
         return allowedEncodings.includes(normalizeEncodingName(encoding));
     }

     /**
      * Puts the detector back into its initial state so it can be reused for
      * new input. Probers that were already created are reset, not discarded.
      */
     this.reset = function() {
         this.result = {"encoding": null, "confidence": 0.0};
         this.results = [];
         this.done = false;
         this._mStart = true;
         this._mGotData = false;
         this._mInputState = _state.pureAscii;
         this._mLastChar = [];
         this._mBOM = "";
         if (this._mEscCharsetProber) {
             this._mEscCharsetProber.reset();
         }
         for (const prober of this._mCharsetProbers) {
             prober.reset();
         }
     };

     /**
      * Consumes one chunk of input. Sets `done` as soon as a BOM or a prober
      * gives a definite answer; further calls are then ignored.
      *
      * @param {string} aBuf Chunk of data. It must support `.split('')`, and
      *     the high-bit check only looks at \x80-\xFF, so this is presumably a
      *     string with one character per byte — NOTE(review): confirm with callers.
      */
     this.feed = function(aBuf) {
         if (this.done) return;
         if (!aBuf.length) return;

         if (!this._mGotData) {
             this._mBOM += aBuf;
             // If the data starts with BOM, we know it is UTF
             for (const [signature, encoding] of bomSignatures) {
                 if (this._mBOM.startsWith(signature) && canDetectEncoding(encoding)) {
                     this.result = {"encoding": encoding, "confidence": 1.0};
                     break;
                 }
             }

             if (this.result.confidence > 0) {
                 this.results = [this.result];
             }

             // If we got to 4 chars without being able to detect a BOM we
             // stop trying.
             if (this._mBOM.length > 3) {
                 this._mGotData = true;
             }
         }

         if (this.result.encoding && (this.result.confidence > 0.0)) {
             this.done = true;
             return;
         }

         if (this._mInputState === _state.pureAscii) {
             if (this._highBitDetector.test(aBuf)) {
                 this._mInputState = _state.highbyte;
             } else if (this._escDetector.test(this._mLastChar.join('') + aBuf)) {
                 // The previous chunk's last character is prepended so a "~{"
                 // sequence split across two chunks is still seen.
                 this._mInputState = _state.escAscii;
             }
         }

         this._mLastChar = aBuf.slice(-1).split('');

         if (this._mInputState === _state.escAscii) {
             if (!this._mEscCharsetProber) {
                 this._mEscCharsetProber = new EscCharSetProber();
             }
             // Loose comparison kept on purpose: the constant is defined outside this file.
             if (this._mEscCharsetProber.feed(aBuf) == constants.foundIt && canDetectEncoding(this._mEscCharsetProber.getCharsetName())) {
                 this.result = {
                     "encoding": this._mEscCharsetProber.getCharsetName(),
                     "confidence": this._mEscCharsetProber.getConfidence()
                 };
                 this.results = [this.result];
                 this.done = true;
             }
         } else if (this._mInputState === _state.highbyte) {
             if (this._mCharsetProbers.length === 0) {
                 this._mCharsetProbers = [
                     new MBCSGroupProber(),
                     new SBCSGroupProber(),
                     new Latin1Prober()
                 ];
             }
             for (const prober of this._mCharsetProbers) {
                 if (prober.feed(aBuf) == constants.foundIt && canDetectEncoding(prober.getCharsetName())) {
                     this.result = {
                         "encoding": prober.getCharsetName(),
                         "confidence": prober.getConfidence()
                     };
                     this.results = [this.result];
                     this.done = true;
                     break;
                 }
             }
         }
     };

     /**
      * Signals the end of input and computes the final answer when feed() did
      * not already reach one.
      *
      * @returns {{encoding: string, confidence: number}|undefined} The chosen
      *     result, or undefined when detection had already finished, no data
      *     was fed, or no candidate reached the minimum threshold.
      */
     this.close = function() {
         if (this.done) return;
         if (this._mBOM.length === 0) {
             logger.log("no data received!\n");
             return;
         }
         this.done = true;

         if (this._mInputState === _state.pureAscii && canDetectEncoding("ascii")) {
             logger.log("pure ascii");
             this.result = {"encoding": "ascii", "confidence": 1.0};
             this.results.push(this.result);
             return this.result;
         }

         if (this._mInputState === _state.highbyte) {
             let windows1252Confidence = 0;
             let windows1250Detected = false;
             for (const prober of this._mCharsetProbers) {
                 const charsetName = prober.getCharsetName();
                 const confidence = prober.getConfidence();
                 // Recorded even when windows-1252 itself is filtered out below.
                 if (charsetName === "windows-1252") {
                     windows1252Confidence = confidence;
                 }
                 if (!charsetName || !canDetectEncoding(charsetName)) continue;
                 this.results.push({
                     "encoding": charsetName,
                     "confidence": confidence
                 });
                 if (charsetName === "windows-1250") {
                     windows1250Detected = true;
                 }
                 logger.log(charsetName + " confidence " + confidence);
             }
             // HACK: When windows-1252 is detected it's almost sure that it can
             // also be windows-1250.
             // https://en.wikipedia.org/wiki/Windows-1250 (Central European)
             if (windows1252Confidence && !windows1250Detected && canDetectEncoding("windows-1250")) {
                 this.results.push({
                     "encoding": "windows-1250",
                     // Report the confidence just a bit under windows-1252's.
                     // NOTE(review): the amount subtracted depends on the length
                     // of the confidence's decimal string; a short value such as
                     // 0.2 yields a negative confidence — confirm this is intended.
                     "confidence": windows1252Confidence - Math.pow(5/10, (String(windows1252Confidence).length - 1)),
                 });
             }
             // Best candidate first.
             this.results.sort(function(a, b) {
                 return b.confidence - a.confidence;
             });
             if (this.results.length > 0) {
                 var topResult = this.results[0];
                 if (topResult.confidence >= minimumThreshold) {
                     this.result = topResult;
                     return topResult;
                 }
             }
         }

         if (logger.enabled) {
             logger.log("no probers hit minimum threshhold\n");
             for (const prober of this._mCharsetProbers) {
                 const charsetName = prober.getCharsetName();
                 if (!canDetectEncoding(charsetName)) continue;
                 logger.log(charsetName + " confidence = " +
                     prober.getConfidence() + "\n");
             }
         }
     };

     // One-time setup; reset() needs the prober fields to exist first.
     this._highBitDetector = /[\x80-\xFF]/;
     this._escDetector = /(\x1B|~\{)/;
     this._mEscCharsetProber = null;
     this._mCharsetProbers = [];
     this.reset();
 }

 // The detector constructor is the only thing this module exports.
 module.exports = UniversalDetector;
	/*
	* The Original Code is Mozilla Universal charset detector code.
	*
	* The Initial Developer of the Original Code is
	* Netscape Communications Corporation.
	* Portions created by the Initial Developer are Copyright (C) 2001
	* the Initial Developer. All Rights Reserved.
	*
	* Contributor(s):
	* António Afonso (antonio.afonso gmail.com) - port to JavaScript
	* Mark Pilgrim - port to Python
	* Shy Shalom - original C code
	*
	* This library is free software; you can redistribute it and/or
	* modify it under the terms of the GNU Lesser General Public
	* License as published by the Free Software Foundation; either
	* version 2.1 of the License, or (at your option) any later version.
	*
	* This library is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	* Lesser General Public License for more details.
	*
	* You should have received a copy of the GNU Lesser General Public
	* License along with this library; if not, write to the Free Software
	* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
	* 02110-1301 USA
	*/

	/**
	* This is a port from the python port, version "2.0.1"
	*/

	var constants = require('./constants');
	var MBCSGroupProber = require('./mbcsgroupprober');
	var SBCSGroupProber = require('./sbcsgroupprober');
	var Latin1Prober = require('./latin1prober');
	var EscCharSetProber = require('./escprober');
	var logger = require('./logger');

	/**
	 * Names of every charset the detector can report: the encodings that are
	 * recognised from a leading byte-order mark, followed by whatever each prober
	 * family advertises through getSupportedCharsetNames().
	 */
	const supportedEncodings = (function() {
		// Encodings identified purely from a BOM. Each name is listed once so the
		// "Supported encodings" error message does not repeat entries.
		const BOM_UTF = [
			"UTF-8", "UTF-32LE", "UTF-32BE", "UTF-16LE", "UTF-16BE",
			"X-ISO-10646-UCS-4-3412", "X-ISO-10646-UCS-4-2143"
		];
		// Throw-away prober instances, created only to ask for their charset names.
		const probers = [
			new EscCharSetProber(),
			new MBCSGroupProber(),
			new SBCSGroupProber(),
			new Latin1Prober()
		];
		const encodings = BOM_UTF.slice(0);
		for (const prober of probers) {
			[].push.apply(encodings, prober.getSupportedCharsetNames());
		}
		return encodings;
	})();

	/**
	 * Lookup list used to validate user-supplied encoding names. For every
	 * supported encoding it contains the lowercased name and the lowercased name
	 * with all dashes removed (e.g. "utf-8" and "utf8").
	 */
	const supportedEncodingsDenormalized = (function() {
		const denormalizedEncodings = [];
		for (const encoding of supportedEncodings) {
			// Locale-independent lowercasing: the validation code compares against
			// String#toLowerCase(), and toLocaleLowerCase() would turn "I" into a
			// dotless "ı" under a Turkish default locale, making names such as
			// "ISO-8859-1" impossible to match.
			const lowered = encoding.toLowerCase();
			denormalizedEncodings.push(lowered, lowered.replace(/-/g, ""));
		}
		return denormalizedEncodings;
	})();

	/**
	 * Incremental charset detector. Feed it data with feed(), call close() when
	 * the input is exhausted, then read `result` (best guess) or `results` (all
	 * candidates collected).
	 *
	 * @param {Object} [options]
	 * @param {number} [options.minimumThreshold] Minimum confidence the best
	 *     candidate needs for close() to report it. Defaults to 0 when
	 *     `detectEncodings` is given and to 0.20 otherwise.
	 * @param {string[]} [options.detectEncodings] Restricts detection to these
	 *     encodings. Names are matched case-insensitively and dashes are ignored,
	 *     so "utf8" and "UTF-8" are equivalent.
	 * @throws {Error} If `detectEncodings` contains a name that is not supported.
	 */
	function UniversalDetector(options) {
		if (!options) options = {};

		// Work on local snapshots of the settings so the caller's options object
		// is never modified (it may be shared between several detectors).
		var detectEncodings = options.detectEncodings;
		var minimumThreshold = options.minimumThreshold;
		if (typeof minimumThreshold !== "number") {
			// If encodings are narrowed down by the user allow for
			// any threshold to be returned.
			minimumThreshold = detectEncodings ? 0 : 0.20;
		}

		// Canonical form used to compare encoding names: lowercase, no dashes.
		function normalizeEncodingName(name) {
			return name.toLowerCase().replace(/-/g, "");
		}

		// Normalized allow-list, built once; null means "no restriction".
		var allowedEncodings = null;
		if (detectEncodings) {
			allowedEncodings = [];
			for (const encoding of detectEncodings) {
				if (!supportedEncodingsDenormalized.includes(encoding.toLowerCase())) {
					throw new Error(`Encoding ${encoding} is not supported. Supported encodings: ${supportedEncodings}.`);
				}
				allowedEncodings.push(normalizeEncodingName(encoding));
			}
		}

		var _state = {
			pureAscii : 0,
			escAscii : 1,
			highbyte : 2
		};

		// Byte-order marks in the order they must be tested: the 4-byte marks
		// come before the 2-byte UTF-16 marks they start with.
		var bomSignatures = [
			["\xEF\xBB\xBF", "UTF-8"],                         // EF BB BF
			["\xFF\xFE\x00\x00", "UTF-32LE"],                  // FF FE 00 00
			["\x00\x00\xFE\xFF", "UTF-32BE"],                  // 00 00 FE FF
			["\xFE\xFF\x00\x00", "X-ISO-10646-UCS-4-3412"],    // unusual octet order (3412)
			["\x00\x00\xFF\xFE", "X-ISO-10646-UCS-4-2143"],    // unusual octet order (2143)
			["\xFF\xFE", "UTF-16LE"],                          // FF FE
			["\xFE\xFF", "UTF-16BE"]                           // FE FF
		];

		// True when `encoding` may be reported given the user's allow-list.
		function canDetectEncoding(encoding) {
			if (!allowedEncodings) {
				return true;
			}
			// A prober without a charset name can never match the allow-list.
			if (!encoding) {
				return false;
			}
			return allowedEncodings.includes(normalizeEncodingName(encoding));
		}

		/**
		 * Puts the detector back into its initial state so it can be reused for
		 * new input. Probers that were already created are reset, not discarded.
		 */
		this.reset = function() {
			this.result = {"encoding": null, "confidence": 0.0};
			this.results = [];
			this.done = false;
			this._mStart = true;
			this._mGotData = false;
			this._mInputState = _state.pureAscii;
			this._mLastChar = [];
			this._mBOM = "";
			if (this._mEscCharsetProber) {
				this._mEscCharsetProber.reset();
			}
			for (const prober of this._mCharsetProbers) {
				prober.reset();
			}
		};

		/**
		 * Consumes one chunk of input. Sets `done` as soon as a BOM or a prober
		 * gives a definite answer; further calls are then ignored.
		 *
		 * @param {string} aBuf Chunk of data. It must support `.split('')`, and
		 *     the high-bit check only looks at \x80-\xFF, so this is presumably a
		 *     string with one character per byte — NOTE(review): confirm with callers.
		 */
		this.feed = function(aBuf) {
			if (this.done) return;
			if (!aBuf.length) return;

			if (!this._mGotData) {
				this._mBOM += aBuf;
				// If the data starts with BOM, we know it is UTF
				for (const [signature, encoding] of bomSignatures) {
					if (this._mBOM.startsWith(signature) && canDetectEncoding(encoding)) {
						this.result = {"encoding": encoding, "confidence": 1.0};
						break;
					}
				}

				if (this.result.confidence > 0) {
					this.results = [this.result];
				}

				// If we got to 4 chars without being able to detect a BOM we
				// stop trying.
				if (this._mBOM.length > 3) {
					this._mGotData = true;
				}
			}

			if (this.result.encoding && (this.result.confidence > 0.0)) {
				this.done = true;
				return;
			}

			if (this._mInputState === _state.pureAscii) {
				if (this._highBitDetector.test(aBuf)) {
					this._mInputState = _state.highbyte;
				} else if (this._escDetector.test(this._mLastChar.join('') + aBuf)) {
					// The previous chunk's last character is prepended so a "~{"
					// sequence split across two chunks is still seen.
					this._mInputState = _state.escAscii;
				}
			}

			this._mLastChar = aBuf.slice(-1).split('');

			if (this._mInputState === _state.escAscii) {
				if (!this._mEscCharsetProber) {
					this._mEscCharsetProber = new EscCharSetProber();
				}
				// Loose comparison kept on purpose: the constant is defined outside this file.
				if (this._mEscCharsetProber.feed(aBuf) == constants.foundIt && canDetectEncoding(this._mEscCharsetProber.getCharsetName())) {
					this.result = {
						"encoding": this._mEscCharsetProber.getCharsetName(),
						"confidence": this._mEscCharsetProber.getConfidence()
					};
					this.results = [this.result];
					this.done = true;
				}
			} else if (this._mInputState === _state.highbyte) {
				if (this._mCharsetProbers.length === 0) {
					this._mCharsetProbers = [
						new MBCSGroupProber(),
						new SBCSGroupProber(),
						new Latin1Prober()
					];
				}
				for (const prober of this._mCharsetProbers) {
					if (prober.feed(aBuf) == constants.foundIt && canDetectEncoding(prober.getCharsetName())) {
						this.result = {
							"encoding": prober.getCharsetName(),
							"confidence": prober.getConfidence()
						};
						this.results = [this.result];
						this.done = true;
						break;
					}
				}
			}
		};

		/**
		 * Signals the end of input and computes the final answer when feed() did
		 * not already reach one.
		 *
		 * @returns {{encoding: string, confidence: number}|undefined} The chosen
		 *     result, or undefined when detection had already finished, no data
		 *     was fed, or no candidate reached the minimum threshold.
		 */
		this.close = function() {
			if (this.done) return;
			if (this._mBOM.length === 0) {
				logger.log("no data received!\n");
				return;
			}
			this.done = true;

			if (this._mInputState === _state.pureAscii && canDetectEncoding("ascii")) {
				logger.log("pure ascii");
				this.result = {"encoding": "ascii", "confidence": 1.0};
				this.results.push(this.result);
				return this.result;
			}

			if (this._mInputState === _state.highbyte) {
				let windows1252Confidence = 0;
				let windows1250Detected = false;
				for (const prober of this._mCharsetProbers) {
					const charsetName = prober.getCharsetName();
					const confidence = prober.getConfidence();
					// Recorded even when windows-1252 itself is filtered out below.
					if (charsetName === "windows-1252") {
						windows1252Confidence = confidence;
					}
					if (!charsetName || !canDetectEncoding(charsetName)) continue;
					this.results.push({
						"encoding": charsetName,
						"confidence": confidence
					});
					if (charsetName === "windows-1250") {
						windows1250Detected = true;
					}
					logger.log(charsetName + " confidence " + confidence);
				}
				// HACK: When windows-1252 is detected it's almost sure that it can
				// also be windows-1250.
				// https://en.wikipedia.org/wiki/Windows-1250 (Central European)
				if (windows1252Confidence && !windows1250Detected && canDetectEncoding("windows-1250")) {
					this.results.push({
						"encoding": "windows-1250",
						// Report the confidence just a bit under windows-1252's.
						// NOTE(review): the amount subtracted depends on the length
						// of the confidence's decimal string; a short value such as
						// 0.2 yields a negative confidence — confirm this is intended.
						"confidence": windows1252Confidence - Math.pow(5/10, (String(windows1252Confidence).length - 1)),
					});
				}
				// Best candidate first.
				this.results.sort(function(a, b) {
					return b.confidence - a.confidence;
				});
				if (this.results.length > 0) {
					var topResult = this.results[0];
					if (topResult.confidence >= minimumThreshold) {
						this.result = topResult;
						return topResult;
					}
				}
			}

			if (logger.enabled) {
				logger.log("no probers hit minimum threshhold\n");
				for (const prober of this._mCharsetProbers) {
					const charsetName = prober.getCharsetName();
					if (!canDetectEncoding(charsetName)) continue;
					logger.log(charsetName + " confidence = " +
						prober.getConfidence() + "\n");
				}
			}
		};

		// One-time setup; reset() needs the prober fields to exist first.
		this._highBitDetector = /[\x80-\xFF]/;
		// Matches ESC or the "~{" sequence (alternation, not a literal pipe).
		this._escDetector = /(\x1B|~\{)/;
		this._mEscCharsetProber = null;
		this._mCharsetProbers = [];
		this.reset();
	}

	// The detector constructor is the only thing this module exports.
	module.exports = UniversalDetector;