src/utf8prober.js - jschardet - Git at Google

 /*
  * The Original Code is Mozilla Universal charset detector code.
  *
  * The Initial Developer of the Original Code is
  * Netscape Communications Corporation.
  * Portions created by the Initial Developer are Copyright (C) 2001
  * the Initial Developer. All Rights Reserved.
  *
  * Contributor(s):
  *   António Afonso (antonio.afonso gmail.com) - port to JavaScript
  *   Mark Pilgrim - port to Python
  *   Shy Shalom - original C code
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
  * License along with this library; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
  * 02110-1301  USA
  */

 var CodingStateMachine = require('./codingstatemachine');
 var CharSetProber = require('./charsetprober');
 var constants = require('./constants');
 var UTF8SMModel = require('./mbcssm/utf8');

 /**
  * Charset prober that estimates whether the data fed to it is UTF-8.
  *
  * Every unit of input is pushed through a CodingStateMachine built from
  * UTF8SMModel. While doing so the prober counts completed multi-byte
  * characters, their summed length, plain ASCII characters and the total
  * amount of input seen; getConfidence() turns those counters into a score.
  *
  * All methods are attached to the instance inside the constructor.
  *
  * @constructor
  */
 function UTF8Prober() {
     CharSetProber.apply(this);

     // Each completed multi-byte character scales the "unlikely" score by this factor.
     var ONE_CHAR_PROB = 0.5;

     /**
      * Runs the reset found on UTF8Prober.prototype, rewinds the coding state
      * machine and zeroes every counter kept by this prober.
      */
     this.reset = function() {
         UTF8Prober.prototype.reset.call(this);
         this._mCodingSM.reset();
         this._mNumOfMBChar = 0;    // completed characters of length >= 2
         this._mMBCharLen = 0;      // summed length of those characters
         this._mFullLen = 0;        // total length of everything fed so far
         this._mBasicAsciiLen = 0;  // completed short characters with code < 128
     };

     /**
      * @returns {string} The name of the charset this prober detects.
      */
     this.getCharsetName = function() {
         return "UTF-8";
     };

     /**
      * Consumes a chunk of input and updates the detection state.
      *
      * @param {string} aBuf Indexable input whose elements expose charCodeAt
      *     (presumably a string holding one byte per character - confirm
      *     against the caller).
      * @returns {*} The value of this.getState() after the chunk was processed.
      */
     this.feed = function(aBuf) {
         this._mFullLen += aBuf.length;

         for( var pos = 0; pos < aBuf.length; pos += 1 ) {
             var ch = aBuf[pos];
             var state = this._mCodingSM.nextState(ch);

             if( state == constants.error ) {
                 // The state machine rejected the sequence: stop scanning.
                 this._mState = constants.notMe;
                 break;
             }
             if( state == constants.itsMe ) {
                 // The state machine is certain: stop scanning.
                 this._mState = constants.foundIt;
                 break;
             }
             if( state != constants.start ) {
                 // Any other state updates no counters.
                 continue;
             }

             // Back in the start state: a character has just been completed.
             if( this._mCodingSM.getCurrentCharLen() >= 2 ) {
                 this._mNumOfMBChar++;
                 this._mMBCharLen += this._mCodingSM.getCurrentCharLen();
             } else if( ch.charCodeAt(0) < 128 ) {
                 // Codes above 127 are extended ASCII and are not counted here.
                 this._mBasicAsciiLen++;
             }
         }

         // Promote to "found" early once the score passes the shortcut threshold.
         if( this.getState() == constants.detecting &&
                 this.getConfidence() > constants.SHORTCUT_THRESHOLD ) {
             this._mState = constants.foundIt;
         }

         return this.getState();
     };

     /**
      * Computes the current confidence from the counters.
      *
      * @returns {number} 1 - 0.99 * ONE_CHAR_PROB^n (n = multi-byte characters
      *     seen) while fewer than 6 such characters were seen and they make up
      *     at most 60% of the input that is not basic ASCII; 0.99 otherwise.
      */
     this.getConfidence = function() {
         var ceiling = 0.99;
         var nonBasicAsciiLen = this._mFullLen - this._mBasicAsciiLen;
         var multiByteShare = nonBasicAsciiLen > 0
             ? this._mMBCharLen / nonBasicAsciiLen
             : 0;

         if( this._mNumOfMBChar < 6 && multiByteShare <= 0.6 ) {
             return 1 - ceiling * Math.pow(ONE_CHAR_PROB, this._mNumOfMBChar);
         }
         return ceiling;
     };

     // Create the state machine last, then bring every counter to its initial value.
     this._mCodingSM = new CodingStateMachine(UTF8SMModel);
     this.reset();
 }
 // Use a CharSetProber instance as the prototype so UTF8Prober instances fall
 // back to it for anything they do not define themselves. The per-instance
 // reset() above looks up its parent implementation on this object
 // (UTF8Prober.prototype.reset).
 UTF8Prober.prototype = new CharSetProber();

 // The constructor itself is the module's export.
 module.exports = UTF8Prober;
	/*
	* The Original Code is Mozilla Universal charset detector code.
	*
	* The Initial Developer of the Original Code is
	* Netscape Communications Corporation.
	* Portions created by the Initial Developer are Copyright (C) 2001
	* the Initial Developer. All Rights Reserved.
	*
	* Contributor(s):
	* António Afonso (antonio.afonso gmail.com) - port to JavaScript
	* Mark Pilgrim - port to Python
	* Shy Shalom - original C code
	*
	* This library is free software; you can redistribute it and/or
	* modify it under the terms of the GNU Lesser General Public
	* License as published by the Free Software Foundation; either
	* version 2.1 of the License, or (at your option) any later version.
	*
	* This library is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	* Lesser General Public License for more details.
	*
	* You should have received a copy of the GNU Lesser General Public
	* License along with this library; if not, write to the Free Software
	* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
	* 02110-1301 USA
	*/

	var CodingStateMachine = require('./codingstatemachine');
	var CharSetProber = require('./charsetprober');
	var constants = require('./constants');
	var UTF8SMModel = require('./mbcssm/utf8');

	/**
	 * Charset prober that estimates whether the data fed to it is UTF-8.
	 *
	 * Every unit of input is pushed through a CodingStateMachine built from
	 * UTF8SMModel. While doing so the prober counts completed multi-byte
	 * characters, their summed length, plain ASCII characters and the total
	 * amount of input seen; getConfidence() turns those counters into a score.
	 *
	 * All methods are attached to the instance inside the constructor.
	 *
	 * @constructor
	 */
	function UTF8Prober() {
		CharSetProber.apply(this);

		// Each completed multi-byte character scales the "unlikely" score by this factor.
		var ONE_CHAR_PROB = 0.5;

		/**
		 * Runs the reset found on UTF8Prober.prototype, rewinds the coding state
		 * machine and zeroes every counter kept by this prober.
		 */
		this.reset = function() {
			UTF8Prober.prototype.reset.call(this);
			this._mCodingSM.reset();
			this._mNumOfMBChar = 0;    // completed characters of length >= 2
			this._mMBCharLen = 0;      // summed length of those characters
			this._mFullLen = 0;        // total length of everything fed so far
			this._mBasicAsciiLen = 0;  // completed short characters with code < 128
		};

		/**
		 * @returns {string} The name of the charset this prober detects.
		 */
		this.getCharsetName = function() {
			return "UTF-8";
		};

		/**
		 * Consumes a chunk of input and updates the detection state.
		 *
		 * @param {string} aBuf Indexable input whose elements expose charCodeAt
		 *     (presumably a string holding one byte per character - confirm
		 *     against the caller).
		 * @returns {*} The value of this.getState() after the chunk was processed.
		 */
		this.feed = function(aBuf) {
			this._mFullLen += aBuf.length;

			for( var pos = 0; pos < aBuf.length; pos += 1 ) {
				var ch = aBuf[pos];
				var state = this._mCodingSM.nextState(ch);

				if( state == constants.error ) {
					// The state machine rejected the sequence: stop scanning.
					this._mState = constants.notMe;
					break;
				}
				if( state == constants.itsMe ) {
					// The state machine is certain: stop scanning.
					this._mState = constants.foundIt;
					break;
				}
				if( state != constants.start ) {
					// Any other state updates no counters.
					continue;
				}

				// Back in the start state: a character has just been completed.
				if( this._mCodingSM.getCurrentCharLen() >= 2 ) {
					this._mNumOfMBChar++;
					this._mMBCharLen += this._mCodingSM.getCurrentCharLen();
				} else if( ch.charCodeAt(0) < 128 ) {
					// Codes above 127 are extended ASCII and are not counted here.
					this._mBasicAsciiLen++;
				}
			}

			// Promote to "found" early once the score passes the shortcut threshold.
			if( this.getState() == constants.detecting &&
					this.getConfidence() > constants.SHORTCUT_THRESHOLD ) {
				this._mState = constants.foundIt;
			}

			return this.getState();
		};

		/**
		 * Computes the current confidence from the counters.
		 *
		 * @returns {number} 1 - 0.99 * ONE_CHAR_PROB^n (n = multi-byte characters
		 *     seen) while fewer than 6 such characters were seen and they make up
		 *     at most 60% of the input that is not basic ASCII; 0.99 otherwise.
		 */
		this.getConfidence = function() {
			var ceiling = 0.99;
			var nonBasicAsciiLen = this._mFullLen - this._mBasicAsciiLen;
			var multiByteShare = nonBasicAsciiLen > 0
				? this._mMBCharLen / nonBasicAsciiLen
				: 0;

			if( this._mNumOfMBChar < 6 && multiByteShare <= 0.6 ) {
				return 1 - ceiling * Math.pow(ONE_CHAR_PROB, this._mNumOfMBChar);
			}
			return ceiling;
		};

		// Create the state machine last, then bring every counter to its initial value.
		this._mCodingSM = new CodingStateMachine(UTF8SMModel);
		this.reset();
	}
	// Use a CharSetProber instance as the prototype so UTF8Prober instances fall
	// back to it for anything they do not define themselves. The per-instance
	// reset() above looks up its parent implementation on this object
	// (UTF8Prober.prototype.reset).
	UTF8Prober.prototype = new CharSetProber();

	// The constructor itself is the module's export.
	module.exports = UTF8Prober;