| /* |
| * The Original Code is Mozilla Universal charset detector code. |
| * |
| * The Initial Developer of the Original Code is |
| * Netscape Communications Corporation. |
| * Portions created by the Initial Developer are Copyright (C) 2001 |
| * the Initial Developer. All Rights Reserved. |
| * |
| * Contributor(s): |
| * António Afonso (antonio.afonso gmail.com) - port to JavaScript |
| * Mark Pilgrim - port to Python |
| * Shy Shalom - original C code |
| * |
| * This library is free software; you can redistribute it and/or |
| * modify it under the terms of the GNU Lesser General Public |
| * License as published by the Free Software Foundation; either |
| * version 2.1 of the License, or (at your option) any later version. |
| * |
| * This library is distributed in the hope that it will be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| * Lesser General Public License for more details. |
| * |
| * You should have received a copy of the GNU Lesser General Public |
| * License along with this library; if not, write to the Free Software |
| * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA |
| * 02110-1301 USA |
| */ |
| |
| var CharSetProber = require('./charsetprober'); |
| var Constants = require('./constants'); |
| |
// Character classes used by the Latin-1 prober. The numeric values double as
// row/column indexes into the class-pair model, so they must stay 0..7.
var UDF = 0; // undefined (no character assigned to this byte)
var OTH = 1; // other (anything that is not a letter)
var ASC = 2; // ascii capital letter
var ASS = 3; // ascii small letter
var ACV = 4; // accent capital vowel
var ACO = 5; // accent capital other
var ASV = 6; // accent small vowel
var ASO = 7; // accent small other

// Lookup table from byte value (0x00 - 0xFF) to character class.
// Laid out 16 bytes per row; the comment above each row gives its byte range.
// Bytes 0x81, 0x8D, 0x8F, 0x90 and 0x9D are the only UDF entries; 0xD7 and
// 0xF7 sit in the middle of the accented-letter range but are classed OTH.
var Latin1_CharToClass = [
    // 00 - 0F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,
    // 10 - 1F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,
    // 20 - 2F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,
    // 30 - 3F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,
    // 40 - 4F
    OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    // 50 - 5F
    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH,
    // 60 - 6F
    OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,
    // 70 - 7F
    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH,
    // 80 - 8F
    OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH,  OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF,
    // 90 - 9F
    UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO,
    // A0 - AF
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,
    // B0 - BF
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,
    // C0 - CF
    ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO,  ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV,
    // D0 - DF
    ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH,  ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO,
    // E0 - EF
    ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO,  ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV,
    // F0 - FF
    ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH,  ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO
];
| |
// Likelihood of one character class directly following another, stored as a
// flat 8 x 8 matrix. The prober reads it as
//     Latin1ClassModel[previousClass * 8 + currentClass]
// so each row below is the class of the previous character and each column
// the class of the current one.
//
// Cell values:
//   0 - illegal
//   1 - very unlikely
//   2 - normal
//   3 - very likely
var Latin1ClassModel = [
    // current:  UDF OTH ASC ASS ACV ACO ASV ASO
    /* UDF */     0,  0,  0,  0,  0,  0,  0,  0,
    /* OTH */     0,  3,  3,  3,  3,  3,  3,  3,
    /* ASC */     0,  3,  3,  3,  3,  3,  3,  3,
    /* ASS */     0,  3,  3,  3,  1,  1,  3,  3,
    /* ACV */     0,  3,  3,  3,  1,  2,  1,  2,
    /* ACO */     0,  3,  3,  3,  3,  3,  3,  3,
    /* ASV */     0,  3,  1,  3,  1,  1,  1,  3,
    /* ASO */     0,  3,  1,  3,  1,  1,  3,  3
];
| |
/**
 * Prober that scores how likely a buffer is windows-1252 (Latin-1) text.
 *
 * Every character fed to the prober is mapped to a character class through
 * Latin1_CharToClass. Each (previous class, current class) pair is then looked
 * up in Latin1ClassModel: a score of 0 marks the pair as illegal and switches
 * the prober to Constants.notMe, any other score is tallied and later turned
 * into a confidence value by getConfidence().
 *
 * @constructor
 */
function Latin1Prober() {
    CharSetProber.apply(this);

    var FREQ_CAT_NUM = 4; // number of likelihood categories (scores 0..3)
    var CLASS_NUM = 8;    // total classes, i.e. the row width of Latin1ClassModel
    var self = this;

    function init() {
        self.reset();
    }

    /**
     * Puts the prober back into its initial state: the previous character
     * class becomes OTH, all score counters are zeroed, and the inherited
     * reset() is run afterwards.
     */
    this.reset = function() {
        this._mLastCharClass = OTH;
        this._mFreqCounter = [];
        for (var i = 0; i < FREQ_CAT_NUM; i++) {
            this._mFreqCounter[i] = 0;
        }
        Latin1Prober.prototype.reset.apply(this);
    };

    /**
     * @returns {string} the name of the charset this prober detects.
     */
    this.getCharsetName = function() {
        return "windows-1252";
    };

    /**
     * @returns {string[]} every charset name this prober can report.
     */
    this.getSupportedCharsetNames = function() {
        return [this.getCharsetName()];
    };

    /**
     * Consumes a chunk of input and updates the class-pair statistics.
     *
     * @param {string} aBuf - input chunk; presumably a binary string with one
     *     char code per byte (TODO confirm against the callers).
     * @returns {*} the prober state as reported by getState().
     */
    this.feed = function(aBuf) {
        aBuf = this.removeXmlTags(aBuf);
        for (var i = 0; i < aBuf.length; i++) {
            var c = aBuf.charCodeAt(i);
            // charCodeAt() can return values above 0xFF, which have no entry
            // in the byte table. Treat them as UDF so the model rejects them;
            // an undefined class would otherwise turn the counter index and
            // _mLastCharClass into undefined/NaN and silently stop all
            // further counting until the next reset().
            var charClass = c < Latin1_CharToClass.length ? Latin1_CharToClass[c] : UDF;
            var freq = Latin1ClassModel[(this._mLastCharClass * CLASS_NUM) + charClass];
            if (freq === 0) {
                // Illegal class pair: this cannot be windows-1252.
                this._mState = Constants.notMe;
                break;
            }
            this._mFreqCounter[freq]++;
            this._mLastCharClass = charClass;
        }

        return this.getState();
    };

    /**
     * Computes the confidence from the tallied class-pair scores.
     *
     * @returns {number} 0.01 once the prober is in the notMe state; otherwise
     *     the share of "very likely" pairs minus 20 times the share of
     *     "very unlikely" pairs, clamped at 0 and scaled by 0.95.
     */
    this.getConfidence = function() {
        var confidence;

        if (this.getState() === Constants.notMe) {
            return 0.01;
        }

        var total = 0;
        for (var i = 0; i < this._mFreqCounter.length; i++) {
            total += this._mFreqCounter[i];
        }
        if (total < 0.01) {
            // Nothing has been counted yet.
            confidence = 0.0;
        } else {
            confidence = (this._mFreqCounter[3] / total) - (this._mFreqCounter[1] * 20 / total);
        }
        if (confidence < 0) {
            confidence = 0.0;
        }
        // Lower the confidence of latin1 so that other, more accurate
        // detectors can take priority.
        //
        // antonio.afonso: need to change this, otherwise languages like
        // pt, es, fr using latin1 will never be detected.
        confidence = confidence * 0.95;
        return confidence;
    };

    init();
}
Latin1Prober.prototype = new CharSetProber();
| |
// The prober constructor is the module's only export.
module.exports = Latin1Prober;