// src/chardistribution.js - jschardet - Git at Google

 /*
  * The Original Code is Mozilla Universal charset detector code.
  *
  * The Initial Developer of the Original Code is
  * Netscape Communications Corporation.
  * Portions created by the Initial Developer are Copyright (C) 2001
  * the Initial Developer. All Rights Reserved.
  *
  * Contributor(s):
  *   António Afonso (antonio.afonso gmail.com) - port to JavaScript
  *   Mark Pilgrim - port to Python
  *   Shy Shalom - original C code
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
  * License along with this library; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
  * 02110-1301  USA
  */

 var jisfreq = require('./jisfreq');
 var euctwfreq = require('./euctwfreq');
 var euckrfreq = require('./euckrfreq');
 var gb2312freq = require('./gb2312freq');
 var big5freq = require('./big5freq');

 /**
  * Base analyser that scores how closely the two-byte characters fed to it
  * follow a language's typical character-frequency distribution.
  *
  * The constructor leaves the frequency table empty (null) and its getOrder()
  * rejects every character, so a usable analyser has to assign
  * _mCharToFreqOrder, _mTableSize and _mTypicalDistributionRatio and replace
  * getOrder() after applying this constructor.
  */
 function CharDistributionAnalysis() {
     var ENOUGH_DATA_THRESHOLD = 1024; // gotEnoughData() is true once more characters than this were counted
     var SURE_YES = 0.99; // cap on the reported confidence
     var SURE_NO = 0.01; // confidence reported when there is too little evidence
     var MINIMUM_DATA_THRESHOLD = 3; // more frequent characters than this are needed before a ratio is computed

     /**
      * Clear the counters gathered so far; the frequency table is left alone.
      */
     this.reset = function() {
         this._mDone = false; // "conclusion reached" flag; nothing in this constructor reads it
         this._mTotalChars = 0; // characters that mapped to a non-negative order
         this._mFreqChars = 0; // characters whose frequency order is below 512
     };

     /**
      * Count one character.
      *
      * @param aStr     the character, handed unchanged to getOrder()
      * @param aCharLen length of the character; anything other than 2 is ignored
      */
     this.feed = function(aStr, aCharLen) {
         // Only two-byte characters take part in the distribution analysis.
         var order = (aCharLen == 2) ? this.getOrder(aStr) : -1;
         if( !(order >= 0) ) {
             return;
         }

         this._mTotalChars++;
         // Orders past the end of the table still count towards the total,
         // but can never count as frequent.
         if( order < this._mTableSize && 512 > this._mCharToFreqOrder[order] ) {
             this._mFreqChars++;
         }
     };

     /**
      * Confidence derived from the counters collected so far.
      *
      * @return SURE_NO when nothing was counted or too few frequent characters
      *         were seen; otherwise frequent / (infrequent * typical ratio),
      *         capped at SURE_YES
      */
     this.getConfidence = function() {
         var total = this._mTotalChars;
         var frequent = this._mFreqChars;

         if( total <= 0 || frequent <= MINIMUM_DATA_THRESHOLD ) {
             return SURE_NO;
         }
         if( total == frequent ) {
             // Every counted character was frequent; still never claim 100%.
             return SURE_YES;
         }

         var ratio = frequent / ((total - frequent) * this._mTypicalDistributionRatio);
         return ratio < SURE_YES ? ratio : SURE_YES;
     };

     /**
      * @return true once more than ENOUGH_DATA_THRESHOLD characters were counted
      */
     this.gotEnoughData = function() {
         return this._mTotalChars > ENOUGH_DATA_THRESHOLD;
     };

     /**
      * Convert a character into its "order", the index used with the
      * frequency table. Working with orders instead of raw encoded strings
      * lets several encodings of one language share a table.
      *
      * @return always -1 here (no character is recognised)
      */
     this.getOrder = function(aStr) {
         return -1;
     };

     // Table configuration stays empty until an encoding-specific analyser fills it in.
     this._mCharToFreqOrder = null; // maps a character order to its frequency order
     this._mTableSize = null; // number of entries in the table above
     this._mTypicalDistributionRatio = null; // per-language constant used by getConfidence()
     this.reset();
 }

 exports.CharDistributionAnalysis = CharDistributionAnalysis

 /**
  * Distribution analyser wired to the EUC-TW frequency data from ./euctwfreq.
  */
 function EUCTWDistributionAnalysis() {
     CharDistributionAnalysis.apply(this);

     // Swap the empty base table for the EUC-TW one.
     this._mCharToFreqOrder = euctwfreq.EUCTWCharToFreqOrder;
     this._mTableSize = euctwfreq.EUCTW_TABLE_SIZE;
     this._mTypicalDistributionRatio = euctwfreq.EUCTW_TYPICAL_DISTRIBUTION_RATIO;

     /**
      * Order of a two-byte EUC-TW character.
      *
      * Bytes of interest: lead 0xc4 -- 0xfe, trail 0xa1 -- 0xfe. Only the
      * lower bound of the lead byte is checked here; the original notes say
      * the state machine has already validated the rest.
      *
      * @param aStr string whose first two char codes are the lead and trail byte
      * @return 94 slots per lead byte starting at 0xC4, or -1 for a lower lead byte
      */
     this.getOrder = function(aStr) {
         var lead = aStr.charCodeAt(0);
         if( lead >= 0xC4 ) {
             return 94 * (lead - 0xC4) + aStr.charCodeAt(1) - 0xA1;
         }
         return -1;
     };
 }
 EUCTWDistributionAnalysis.prototype = new CharDistributionAnalysis();

 exports.EUCTWDistributionAnalysis = EUCTWDistributionAnalysis

 /**
  * Distribution analyser wired to the EUC-KR frequency data from ./euckrfreq.
  */
 function EUCKRDistributionAnalysis() {
     CharDistributionAnalysis.apply(this);

     // Swap the empty base table for the EUC-KR one.
     this._mCharToFreqOrder = euckrfreq.EUCKRCharToFreqOrder;
     this._mTableSize = euckrfreq.EUCKR_TABLE_SIZE;
     this._mTypicalDistributionRatio = euckrfreq.EUCKR_TYPICAL_DISTRIBUTION_RATIO;

     /**
      * Order of a two-byte EUC-KR character.
      *
      * Bytes of interest: lead 0xb0 -- 0xfe, trail 0xa1 -- 0xfe. Only the
      * lower bound of the lead byte is checked here; the original notes say
      * the state machine has already validated the rest.
      *
      * @param aStr string whose first two char codes are the lead and trail byte
      * @return 94 slots per lead byte starting at 0xB0, or -1 for a lower lead byte
      */
     this.getOrder = function(aStr) {
         var lead = aStr.charCodeAt(0);
         if( lead >= 0xB0 ) {
             return 94 * (lead - 0xB0) + aStr.charCodeAt(1) - 0xA1;
         }
         return -1;
     };
 }
 EUCKRDistributionAnalysis.prototype = new CharDistributionAnalysis();

 exports.EUCKRDistributionAnalysis = EUCKRDistributionAnalysis

 /**
  * Distribution analyser wired to the GB2312 frequency data from ./gb2312freq.
  */
 function GB2312DistributionAnalysis() {
     CharDistributionAnalysis.apply(this);

     // Swap the empty base table for the GB2312 one.
     this._mCharToFreqOrder = gb2312freq.GB2312CharToFreqOrder;
     this._mTableSize = gb2312freq.GB2312_TABLE_SIZE;
     this._mTypicalDistributionRatio = gb2312freq.GB2312_TYPICAL_DISTRIBUTION_RATIO;

     /**
      * Order of a two-byte GB2312 character.
      *
      * Bytes of interest: lead 0xb0 -- 0xfe, trail 0xa1 -- 0xfe. Unlike the
      * EUC analysers, the lower bound of the trail byte is checked as well.
      *
      * @param aStr string whose first two char codes are the lead and trail byte
      * @return 94 slots per lead byte starting at 0xB0, or -1 when either
      *         byte is below its lower bound
      */
     this.getOrder = function(aStr) {
         var lead = aStr.charCodeAt(0);
         var trail = aStr.charCodeAt(1);
         if( lead >= 0xB0 && trail >= 0xA1 ) {
             return 94 * (lead - 0xB0) + trail - 0xA1;
         }
         return -1;
     };
 }
 GB2312DistributionAnalysis.prototype = new CharDistributionAnalysis();

 exports.GB2312DistributionAnalysis = GB2312DistributionAnalysis

 /**
  * Distribution analyser wired to the Big5 frequency data from ./big5freq.
  */
 function Big5DistributionAnalysis() {
     CharDistributionAnalysis.apply(this);

     // Swap the empty base table for the Big5 one.
     this._mCharToFreqOrder = big5freq.Big5CharToFreqOrder;
     this._mTableSize = big5freq.BIG5_TABLE_SIZE;
     this._mTypicalDistributionRatio = big5freq.BIG5_TYPICAL_DISTRIBUTION_RATIO;

     /**
      * Order of a two-byte Big5 character.
      *
      * Bytes of interest: lead 0xa4 -- 0xfe, trail 0x40 -- 0x7e and
      * 0xa1 -- 0xfe. Each lead byte spans 157 slots: the low trail range
      * fills slots 0..62 and the high range continues from slot 63.
      *
      * @param aStr string whose first two char codes are the lead and trail byte
      * @return the computed order, or -1 for a lead byte below 0xA4
      */
     this.getOrder = function(aStr) {
         var lead = aStr.charCodeAt(0);
         if( !(lead >= 0xA4) ) {
             return -1;
         }

         var rowStart = 157 * (lead - 0xA4);
         var trail = aStr.charCodeAt(1);
         return trail >= 0xA1
             ? rowStart + trail - 0xA1 + 63
             : rowStart + trail - 0x40;
     };
 }
 Big5DistributionAnalysis.prototype = new CharDistributionAnalysis();

 exports.Big5DistributionAnalysis = Big5DistributionAnalysis

 /**
  * Distribution analyser for Shift_JIS, wired to the JIS frequency data from
  * ./jisfreq.
  */
 function SJISDistributionAnalysis() {
     CharDistributionAnalysis.apply(this);

     // Swap the empty base table for the JIS one.
     this._mCharToFreqOrder = jisfreq.JISCharToFreqOrder;
     this._mTableSize = jisfreq.JIS_TABLE_SIZE;
     this._mTypicalDistributionRatio = jisfreq.JIS_TYPICAL_DISTRIBUTION_RATIO;

     /**
      * Order of a two-byte Shift_JIS character.
      *
      * Bytes of interest: lead 0x81 -- 0x9f and 0xe0 -- 0xef,
      * trail 0x40 -- 0x7e and 0x80 -- 0xfc. Each lead byte spans 188 slots
      * (63 low trail bytes followed by 125 high ones).
      *
      * @param aStr string whose first two char codes are the lead and trail byte
      * @return the order, or -1 when the lead or trail byte is outside the
      *         ranges above (including a missing trail byte)
      */
     this.getOrder = function(aStr) {
         var lead = aStr.charCodeAt(0);
         var trail = aStr.charCodeAt(1);

         var leadIndex;
         if( lead >= 0x81 && lead <= 0x9F ) {
             leadIndex = lead - 0x81;
         } else if( lead >= 0xE0 && lead <= 0xEF ) {
             // The second lead range continues right after the 31 leads 0x81..0x9F.
             leadIndex = lead - 0xE0 + 31;
         } else {
             return -1;
         }

         // Written as a negated range test so a missing trail byte (NaN) is rejected too.
         if( !(trail >= 0x40 && trail <= 0xFC) || trail === 0x7F ) {
             return -1;
         }

         var order = 188 * leadIndex + trail - 0x40;
         if( trail > 0x7F ) {
             // 0x7F is never a trail byte, so everything above it sits one slot
             // lower than its raw offset. Without this step 0x81 0xFC and
             // 0x82 0x40 would both map to 188, and 0x82 0xA0 would give 284
             // rather than 283.
             order--;
         }
         return order;
     };
 }
 SJISDistributionAnalysis.prototype = new CharDistributionAnalysis();

 exports.SJISDistributionAnalysis = SJISDistributionAnalysis

 /**
  * Distribution analyser for EUC-JP, wired to the JIS frequency data from
  * ./jisfreq.
  */
 function EUCJPDistributionAnalysis() {
     CharDistributionAnalysis.apply(this);

     // Swap the empty base table for the JIS one.
     this._mCharToFreqOrder = jisfreq.JISCharToFreqOrder;
     this._mTableSize = jisfreq.JIS_TABLE_SIZE;
     this._mTypicalDistributionRatio = jisfreq.JIS_TYPICAL_DISTRIBUTION_RATIO;

     /**
      * Order of a two-byte EUC-JP character.
      *
      * Bytes of interest: lead 0xa0 -- 0xfe, trail 0xa1 -- 0xfe. The lead
      * byte is compared numerically, matching the arithmetic below. A lead
      * byte of exactly 0xA0 passes the guard but produces a negative order
      * for every trail byte up to 0xFE.
      *
      * @param aStr string whose first two char codes are the lead and trail byte
      * @return 94 slots per lead byte counted from 0xA1, or -1 for a lead byte below 0xA0
      */
     this.getOrder = function(aStr) {
         var lead = aStr.charCodeAt(0);
         if( lead >= 0xA0 ) {
             return 94 * (lead - 0xA1) + aStr.charCodeAt(1) - 0xA1;
         }
         return -1;
     };
 }
 EUCJPDistributionAnalysis.prototype = new CharDistributionAnalysis();

 exports.EUCJPDistributionAnalysis = EUCJPDistributionAnalysis
	/*
	* The Original Code is Mozilla Universal charset detector code.
	*
	* The Initial Developer of the Original Code is
	* Netscape Communications Corporation.
	* Portions created by the Initial Developer are Copyright (C) 2001
	* the Initial Developer. All Rights Reserved.
	*
	* Contributor(s):
	* António Afonso (antonio.afonso gmail.com) - port to JavaScript
	* Mark Pilgrim - port to Python
	* Shy Shalom - original C code
	*
	* This library is free software; you can redistribute it and/or
	* modify it under the terms of the GNU Lesser General Public
	* License as published by the Free Software Foundation; either
	* version 2.1 of the License, or (at your option) any later version.
	*
	* This library is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	* Lesser General Public License for more details.
	*
	* You should have received a copy of the GNU Lesser General Public
	* License along with this library; if not, write to the Free Software
	* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
	* 02110-1301 USA
	*/

	var jisfreq = require('./jisfreq');
	var euctwfreq = require('./euctwfreq');
	var euckrfreq = require('./euckrfreq');
	var gb2312freq = require('./gb2312freq');
	var big5freq = require('./big5freq');

	/**
	 * Base analyser that scores how closely the two-byte characters fed to it
	 * follow a language's typical character-frequency distribution.
	 *
	 * The constructor leaves the frequency table empty (null) and its getOrder()
	 * rejects every character, so a usable analyser has to assign
	 * _mCharToFreqOrder, _mTableSize and _mTypicalDistributionRatio and replace
	 * getOrder() after applying this constructor.
	 */
	function CharDistributionAnalysis() {
	    var ENOUGH_DATA_THRESHOLD = 1024; // gotEnoughData() is true once more characters than this were counted
	    var SURE_YES = 0.99; // cap on the reported confidence
	    var SURE_NO = 0.01; // confidence reported when there is too little evidence
	    var MINIMUM_DATA_THRESHOLD = 3; // more frequent characters than this are needed before a ratio is computed

	    /**
	     * Clear the counters gathered so far; the frequency table is left alone.
	     */
	    this.reset = function() {
	        this._mDone = false; // "conclusion reached" flag; nothing in this constructor reads it
	        this._mTotalChars = 0; // characters that mapped to a non-negative order
	        this._mFreqChars = 0; // characters whose frequency order is below 512
	    };

	    /**
	     * Count one character.
	     *
	     * @param aStr     the character, handed unchanged to getOrder()
	     * @param aCharLen length of the character; anything other than 2 is ignored
	     */
	    this.feed = function(aStr, aCharLen) {
	        // Only two-byte characters take part in the distribution analysis.
	        var order = (aCharLen == 2) ? this.getOrder(aStr) : -1;
	        if( !(order >= 0) ) {
	            return;
	        }

	        this._mTotalChars++;
	        // Orders past the end of the table still count towards the total,
	        // but can never count as frequent.
	        if( order < this._mTableSize && 512 > this._mCharToFreqOrder[order] ) {
	            this._mFreqChars++;
	        }
	    };

	    /**
	     * Confidence derived from the counters collected so far.
	     *
	     * @return SURE_NO when nothing was counted or too few frequent characters
	     *         were seen; otherwise frequent / (infrequent * typical ratio),
	     *         capped at SURE_YES
	     */
	    this.getConfidence = function() {
	        var total = this._mTotalChars;
	        var frequent = this._mFreqChars;

	        // Plain logical OR; the escaped form "\|\|" does not parse as JavaScript.
	        if( total <= 0 || frequent <= MINIMUM_DATA_THRESHOLD ) {
	            return SURE_NO;
	        }
	        if( total == frequent ) {
	            // Every counted character was frequent; still never claim 100%.
	            return SURE_YES;
	        }

	        var ratio = frequent / ((total - frequent) * this._mTypicalDistributionRatio);
	        return ratio < SURE_YES ? ratio : SURE_YES;
	    };

	    /**
	     * @return true once more than ENOUGH_DATA_THRESHOLD characters were counted
	     */
	    this.gotEnoughData = function() {
	        return this._mTotalChars > ENOUGH_DATA_THRESHOLD;
	    };

	    /**
	     * Convert a character into its "order", the index used with the
	     * frequency table. Working with orders instead of raw encoded strings
	     * lets several encodings of one language share a table.
	     *
	     * @return always -1 here (no character is recognised)
	     */
	    this.getOrder = function(aStr) {
	        return -1;
	    };

	    // Table configuration stays empty until an encoding-specific analyser fills it in.
	    this._mCharToFreqOrder = null; // maps a character order to its frequency order
	    this._mTableSize = null; // number of entries in the table above
	    this._mTypicalDistributionRatio = null; // per-language constant used by getConfidence()
	    this.reset();
	}

	exports.CharDistributionAnalysis = CharDistributionAnalysis

	/**
	 * Distribution analyser wired to the EUC-TW frequency data from ./euctwfreq.
	 */
	function EUCTWDistributionAnalysis() {
	    CharDistributionAnalysis.apply(this);

	    // Replace the empty base table with the EUC-TW one.
	    this._mCharToFreqOrder = euctwfreq.EUCTWCharToFreqOrder;
	    this._mTableSize = euctwfreq.EUCTW_TABLE_SIZE;
	    this._mTypicalDistributionRatio = euctwfreq.EUCTW_TYPICAL_DISTRIBUTION_RATIO;

	    /**
	     * Order of a two-byte EUC-TW character.
	     *
	     * Bytes of interest: lead 0xc4 -- 0xfe, trail 0xa1 -- 0xfe. Only the
	     * lower bound of the lead byte is checked; the original notes say the
	     * state machine has already validated the rest.
	     *
	     * @param aStr string whose first two char codes are the lead and trail byte
	     * @return 94 slots per lead byte starting at 0xC4, or -1 for a lower lead byte
	     */
	    this.getOrder = function(aStr) {
	        var lead = aStr.charCodeAt(0);
	        return lead >= 0xC4
	            ? 94 * (lead - 0xC4) + aStr.charCodeAt(1) - 0xA1
	            : -1;
	    };
	}
	EUCTWDistributionAnalysis.prototype = new CharDistributionAnalysis();

	exports.EUCTWDistributionAnalysis = EUCTWDistributionAnalysis

	/**
	 * Distribution analyser wired to the EUC-KR frequency data from ./euckrfreq.
	 */
	function EUCKRDistributionAnalysis() {
	    CharDistributionAnalysis.apply(this);

	    // Replace the empty base table with the EUC-KR one.
	    this._mCharToFreqOrder = euckrfreq.EUCKRCharToFreqOrder;
	    this._mTableSize = euckrfreq.EUCKR_TABLE_SIZE;
	    this._mTypicalDistributionRatio = euckrfreq.EUCKR_TYPICAL_DISTRIBUTION_RATIO;

	    /**
	     * Order of a two-byte EUC-KR character.
	     *
	     * Bytes of interest: lead 0xb0 -- 0xfe, trail 0xa1 -- 0xfe. Only the
	     * lower bound of the lead byte is checked; the original notes say the
	     * state machine has already validated the rest.
	     *
	     * @param aStr string whose first two char codes are the lead and trail byte
	     * @return 94 slots per lead byte starting at 0xB0, or -1 for a lower lead byte
	     */
	    this.getOrder = function(aStr) {
	        var lead = aStr.charCodeAt(0);
	        return lead >= 0xB0
	            ? 94 * (lead - 0xB0) + aStr.charCodeAt(1) - 0xA1
	            : -1;
	    };
	}
	EUCKRDistributionAnalysis.prototype = new CharDistributionAnalysis();

	exports.EUCKRDistributionAnalysis = EUCKRDistributionAnalysis

	/**
	 * Distribution analyser wired to the GB2312 frequency data from ./gb2312freq.
	 */
	function GB2312DistributionAnalysis() {
	    CharDistributionAnalysis.apply(this);

	    // Replace the empty base table with the GB2312 one.
	    this._mCharToFreqOrder = gb2312freq.GB2312CharToFreqOrder;
	    this._mTableSize = gb2312freq.GB2312_TABLE_SIZE;
	    this._mTypicalDistributionRatio = gb2312freq.GB2312_TYPICAL_DISTRIBUTION_RATIO;

	    /**
	     * Order of a two-byte GB2312 character.
	     *
	     * Bytes of interest: lead 0xb0 -- 0xfe, trail 0xa1 -- 0xfe. The lower
	     * bound of both bytes is checked here.
	     *
	     * @param aStr string whose first two char codes are the lead and trail byte
	     * @return 94 slots per lead byte starting at 0xB0, or -1 when either
	     *         byte is below its lower bound
	     */
	    this.getOrder = function(aStr) {
	        var lead = aStr.charCodeAt(0);
	        var trail = aStr.charCodeAt(1);
	        var inRange = lead >= 0xB0 && trail >= 0xA1;
	        return inRange ? 94 * (lead - 0xB0) + trail - 0xA1 : -1;
	    };
	}
	GB2312DistributionAnalysis.prototype = new CharDistributionAnalysis();

	exports.GB2312DistributionAnalysis = GB2312DistributionAnalysis

	/**
	 * Distribution analyser wired to the Big5 frequency data from ./big5freq.
	 */
	function Big5DistributionAnalysis() {
	    CharDistributionAnalysis.apply(this);

	    // Replace the empty base table with the Big5 one.
	    this._mCharToFreqOrder = big5freq.Big5CharToFreqOrder;
	    this._mTableSize = big5freq.BIG5_TABLE_SIZE;
	    this._mTypicalDistributionRatio = big5freq.BIG5_TYPICAL_DISTRIBUTION_RATIO;

	    /**
	     * Order of a two-byte Big5 character.
	     *
	     * Bytes of interest: lead 0xa4 -- 0xfe, trail 0x40 -- 0x7e and
	     * 0xa1 -- 0xfe. Each lead byte spans 157 slots: the low trail range
	     * fills slots 0..62 and the high range continues from slot 63.
	     *
	     * @param aStr string whose first two char codes are the lead and trail byte
	     * @return the computed order, or -1 for a lead byte below 0xA4
	     */
	    this.getOrder = function(aStr) {
	        var lead = aStr.charCodeAt(0);
	        if( !(lead >= 0xA4) ) {
	            return -1;
	        }

	        var rowStart = 157 * (lead - 0xA4);
	        var trail = aStr.charCodeAt(1);
	        if( trail >= 0xA1 ) {
	            return rowStart + trail - 0xA1 + 63;
	        }
	        return rowStart + trail - 0x40;
	    };
	}
	Big5DistributionAnalysis.prototype = new CharDistributionAnalysis();

	exports.Big5DistributionAnalysis = Big5DistributionAnalysis

	/**
	 * Distribution analyser for Shift_JIS, wired to the JIS frequency data from
	 * ./jisfreq.
	 */
	function SJISDistributionAnalysis() {
	    CharDistributionAnalysis.apply(this);

	    // Replace the empty base table with the JIS one.
	    this._mCharToFreqOrder = jisfreq.JISCharToFreqOrder;
	    this._mTableSize = jisfreq.JIS_TABLE_SIZE;
	    this._mTypicalDistributionRatio = jisfreq.JIS_TYPICAL_DISTRIBUTION_RATIO;

	    /**
	     * Order of a two-byte Shift_JIS character.
	     *
	     * Bytes of interest: lead 0x81 -- 0x9f and 0xe0 -- 0xef,
	     * trail 0x40 -- 0x7e and 0x80 -- 0xfc. Each lead byte spans 188 slots
	     * (63 low trail bytes followed by 125 high ones).
	     *
	     * @param aStr string whose first two char codes are the lead and trail byte
	     * @return the order, or -1 when the lead or trail byte is outside the
	     *         ranges above (including a missing trail byte)
	     */
	    this.getOrder = function(aStr) {
	        var lead = aStr.charCodeAt(0);
	        var trail = aStr.charCodeAt(1);

	        var leadIndex;
	        if( lead >= 0x81 && lead <= 0x9F ) {
	            leadIndex = lead - 0x81;
	        } else if( lead >= 0xE0 && lead <= 0xEF ) {
	            // The second lead range continues right after the 31 leads 0x81..0x9F.
	            leadIndex = lead - 0xE0 + 31;
	        } else {
	            return -1;
	        }

	        // Plain logical OR (the escaped "\|\|" form does not parse). The range
	        // test is negated so a missing trail byte (NaN) is rejected too.
	        if( !(trail >= 0x40 && trail <= 0xFC) || trail === 0x7F ) {
	            return -1;
	        }

	        var order = 188 * leadIndex + trail - 0x40;
	        if( trail > 0x7F ) {
	            // 0x7F is never a trail byte, so everything above it sits one slot
	            // lower than its raw offset. Without this step 0x81 0xFC and
	            // 0x82 0x40 would both map to 188, and 0x82 0xA0 would give 284
	            // rather than 283.
	            order--;
	        }
	        return order;
	    };
	}
	SJISDistributionAnalysis.prototype = new CharDistributionAnalysis();

	exports.SJISDistributionAnalysis = SJISDistributionAnalysis

	/**
	 * Distribution analyser for EUC-JP, wired to the JIS frequency data from
	 * ./jisfreq.
	 */
	function EUCJPDistributionAnalysis() {
	    CharDistributionAnalysis.apply(this);

	    // Replace the empty base table with the JIS one.
	    this._mCharToFreqOrder = jisfreq.JISCharToFreqOrder;
	    this._mTableSize = jisfreq.JIS_TABLE_SIZE;
	    this._mTypicalDistributionRatio = jisfreq.JIS_TYPICAL_DISTRIBUTION_RATIO;

	    /**
	     * Order of a two-byte EUC-JP character.
	     *
	     * Bytes of interest: lead 0xa0 -- 0xfe, trail 0xa1 -- 0xfe. The lead
	     * byte is compared numerically, matching the arithmetic below. A lead
	     * byte of exactly 0xA0 passes the guard but produces a negative order
	     * for every trail byte up to 0xFE.
	     *
	     * @param aStr string whose first two char codes are the lead and trail byte
	     * @return 94 slots per lead byte counted from 0xA1, or -1 for a lead byte below 0xA0
	     */
	    this.getOrder = function(aStr) {
	        var lead = aStr.charCodeAt(0);
	        return lead >= 0xA0
	            ? 94 * (lead - 0xA1) + aStr.charCodeAt(1) - 0xA1
	            : -1;
	    };
	}
	EUCJPDistributionAnalysis.prototype = new CharDistributionAnalysis();

	exports.EUCJPDistributionAnalysis = EUCJPDistributionAnalysis