| /* |
| * The Original Code is Mozilla Universal charset detector code. |
| * |
| * The Initial Developer of the Original Code is |
| * Netscape Communications Corporation. |
| * Portions created by the Initial Developer are Copyright (C) 2001 |
| * the Initial Developer. All Rights Reserved. |
| * |
| * Contributor(s): |
| * António Afonso (antonio.afonso gmail.com) - port to JavaScript |
| * Mark Pilgrim - port to Python |
| * Shy Shalom - original C code |
| * |
| * This library is free software; you can redistribute it and/or |
| * modify it under the terms of the GNU Lesser General Public |
| * License as published by the Free Software Foundation; either |
| * version 2.1 of the License, or (at your option) any later version. |
| * |
| * This library is distributed in the hope that it will be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| * Lesser General Public License for more details. |
| * |
| * You should have received a copy of the GNU Lesser General Public |
| * License along with this library; if not, write to the Free Software |
| * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA |
| * 02110-1301 USA |
| */ |
| |
| // This prober doesn't actually recognize a language or a charset. |
| // It is a helper prober for the use of the Hebrew model probers |
| |
| ////// General ideas of the Hebrew charset recognition ////// |
| // |
| // Four main charsets exist in Hebrew: |
| // "ISO-8859-8" - Visual Hebrew |
| // "windows-1255" - Logical Hebrew |
| // "ISO-8859-8-I" - Logical Hebrew |
| // "x-mac-hebrew" - ?? Logical Hebrew ?? |
| // |
| // Both "ISO" charsets use a completely identical set of code points, whereas |
| // "windows-1255" and "x-mac-hebrew" are two different proper supersets of |
| // these code points. windows-1255 defines additional characters in the range |
| // 0x80-0x9F as some misc punctuation marks as well as some Hebrew-specific |
| // diacritics and additional 'Yiddish' ligature letters in the range 0xc0-0xd6. |
| // x-mac-hebrew defines similar additional code points but with a different |
| // mapping. |
| // |
| // As far as an average Hebrew text with no diacritics is concerned, all four |
| // charsets are identical with respect to code points. Meaning that for the |
| // main Hebrew alphabet, all four map the same values to all 27 Hebrew letters |
| // (including final letters). |
| // |
| // The dominant difference between these charsets is their directionality. |
| // "Visual" directionality means that the text is ordered as if the renderer is |
| // not aware of a BIDI rendering algorithm. The renderer sees the text and |
| // draws it from left to right. The text itself when ordered naturally is read |
| // backwards. A buffer of Visual Hebrew generally looks like so: |
| // "[last word of first line spelled backwards] [whole line ordered backwards |
| // and spelled backwards] [first word of first line spelled backwards] |
| // [end of line] [last word of second line] ... etc' " |
| // adding punctuation marks, numbers and English text to visual text is |
| // naturally also "visual" and from left to right. |
| // |
| // "Logical" directionality means the text is ordered "naturally" according to |
| // the order it is read. It is the responsibility of the renderer to display |
| // the text from right to left. A BIDI algorithm is used to place general |
| // punctuation marks, numbers and English text in the text. |
| // |
| // Texts in x-mac-hebrew are almost impossible to find on the Internet. From |
| // what little evidence I could find, it seems that its general directionality |
| // is Logical. |
| // |
| // To sum up all of the above, the Hebrew probing mechanism knows about two |
| // charsets: |
| // Visual Hebrew - "ISO-8859-8" - backwards text - Words and sentences are |
| // backwards while line order is natural. For charset recognition purposes |
| // the line order is unimportant (In fact, for this implementation, even |
| // word order is unimportant). |
| // Logical Hebrew - "windows-1255" - normal, naturally ordered text. |
| // |
| // "ISO-8859-8-I" is a subset of windows-1255 and doesn't need to be |
| // specifically identified. |
| // "x-mac-hebrew" is also identified as windows-1255. A text in x-mac-hebrew |
// that contains special punctuation marks or diacritics is displayed with
| // some unconverted characters showing as question marks. This problem might |
| // be corrected using another model prober for x-mac-hebrew. Due to the fact |
| // that x-mac-hebrew texts are so rare, writing another model prober isn't |
| // worth the effort and performance hit. |
| // |
| //////// The Prober //////// |
| // |
| // The prober is divided between two SBCharSetProbers and a HebrewProber, |
| // all of which are managed, created, fed data, inquired and deleted by the |
| // SBCSGroupProber. The two SBCharSetProbers identify that the text is in |
// fact some kind of Hebrew, Logical or Visual. The final decision about which
// one it is is made by the HebrewProber, which combines final-letter scores
// with the scores of the two SBCharSetProbers to produce a final answer.
| // |
| // The SBCSGroupProber is responsible for stripping the original text of HTML |
| // tags, English characters, numbers, low-ASCII punctuation characters, spaces |
| // and new lines. It reduces any sequence of such characters to a single space. |
| // The buffer fed to each prober in the SBCS group prober is pure text in |
| // high-ASCII. |
| // The two SBCharSetProbers (model probers) share the same language model: |
| // Win1255Model. |
| // The first SBCharSetProber uses the model normally as any other |
| // SBCharSetProber does, to recognize windows-1255, upon which this model was |
| // built. The second SBCharSetProber is told to make the pair-of-letter |
| // lookup in the language model backwards. This in practice exactly simulates |
| // a visual Hebrew model using the windows-1255 logical Hebrew model. |
| // |
| // The HebrewProber is not using any language model. All it does is look for |
| // final-letter evidence suggesting the text is either logical Hebrew or visual |
| // Hebrew. Disjointed from the model probers, the results of the HebrewProber |
| // alone are meaningless. HebrewProber always returns 0.00 as confidence |
| // since it never identifies a charset by itself. Instead, the pointer to the |
| // HebrewProber is passed to the model probers as a helper "Name Prober". |
| // When the Group prober receives a positive identification from any prober, |
| // it asks for the name of the charset identified. If the prober queried is a |
| // Hebrew model prober, the model prober forwards the call to the |
// HebrewProber to make the final decision. In the HebrewProber, the
// decision is made according to the final-letter scores it maintains and both
// model probers' scores. The answer is returned in the form of the name of the
// charset identified, either "windows-1255" or "ISO-8859-8".
| |
| var CharSetProber = require('./charsetprober'); |
| var constants = require('./constants') |
| |
| // https://developer.mozilla.org/En/Core_JavaScript_1.5_Reference/Objects/Array/IndexOf |
| if (!Array.prototype.indexOf) |
| { |
| Array.prototype.indexOf = function(elt /*, from*/) |
| { |
| var len = this.length >>> 0; |
| |
| var from = Number(arguments[1]) || 0; |
| from = (from < 0) |
| ? Math.ceil(from) |
| : Math.floor(from); |
| if (from < 0) |
| from += len; |
| |
| for (; from < len; from++) |
| { |
| if (from in this && |
| this[from] === elt) |
| return from; |
| } |
| return -1; |
| }; |
| } |
| |
/**
 * Helper prober that decides between logical Hebrew ("windows-1255") and
 * visual Hebrew ("ISO-8859-8"). It accumulates final-letter evidence from the
 * text it is fed and, when that evidence is not decisive, compares the
 * confidences of the two model probers registered via setModelProbers().
 *
 * setModelProbers() must be called before feed() or getState() is used, and
 * before getCharsetName() whenever the final-letter evidence alone is not
 * decisive: those paths dereference the model probers.
 */
function HebrewProber() {
  CharSetProber.apply(this);

  // windows-1255 / ISO-8859-8 code points of interest
  var FINAL_KAF = '\xea';
  var NORMAL_KAF = '\xeb';
  var FINAL_MEM = '\xed';
  var NORMAL_MEM = '\xee';
  var FINAL_NUN = '\xef';
  var NORMAL_NUN = '\xf0';
  var FINAL_PE = '\xf3';
  var NORMAL_PE = '\xf4';
  var FINAL_TSADI = '\xf5';

  // Built once per prober instead of once per examined character.
  var FINAL_LETTERS = [FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE, FINAL_TSADI];
  // The normal Tsadi (0xf6) is deliberately absent; see isNonFinal().
  var NON_FINAL_LETTERS = [NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE];

  // Character that separates words in the filtered buffer.
  var WORD_DELIMITER = ' ';

  // Minimum Visual vs Logical final letter score difference.
  // If the difference is below this, don't rely solely on the final letter score distance.
  var MIN_FINAL_CHAR_DISTANCE = 5;

  // Minimum Visual vs Logical model score difference.
  // If the difference is below this, don't rely at all on the model score distance.
  var MIN_MODEL_DISTANCE = 0.01;

  var VISUAL_HEBREW_NAME = 'ISO-8859-8';
  var LOGICAL_HEBREW_NAME = 'windows-1255';

  // Model probers are supplied later through setModelProbers().
  this._mLogicalProber = null;
  this._mVisualProber = null;

  /**
   * Clears the accumulated final-letter scores and the two-character history.
   * The registered model probers are left untouched.
   */
  this.reset = function() {
    this._mFinalCharLogicalScore = 0;
    this._mFinalCharVisualScore = 0;
    // The two last characters seen in the previous buffer. Both start out as
    // a space in order to simulate a word delimiter at the beginning of the
    // data.
    this._mPrev = WORD_DELIMITER;
    this._mBeforePrev = WORD_DELIMITER;
  };

  /**
   * Registers the two model probers consulted by getState() and
   * getCharsetName().
   *
   * @param {Object} logicalProber - Model prober for logical Hebrew.
   * @param {Object} visualProber - Model prober for visual Hebrew.
   */
  this.setModelProbers = function(logicalProber, visualProber) {
    this._mLogicalProber = logicalProber;
    this._mVisualProber = visualProber;
  };

  /**
   * @param {string} c - A single character.
   * @returns {boolean} True when c is one of the five final-form letters.
   */
  this.isFinal = function(c) {
    return FINAL_LETTERS.indexOf(c) !== -1;
  };

  /**
   * @param {string} c - A single character.
   * @returns {boolean} True when c is a non-final Kaf, Mem, Nun or Pe.
   */
  this.isNonFinal = function(c) {
    // The normal Tsadi is not a good Non-Final letter due to words like
    // 'lechotet' (to chat) containing an apostrophe after the tsadi. This
    // apostrophe is converted to a space in FilterWithoutEnglishLetters causing
    // the Non-Final tsadi to appear at an end of a word even though this is not
    // the case in the original text.
    // The letters Pe and Kaf rarely display a related behavior of not being a
    // good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' for
    // example legally end with a Non-Final Pe or Kaf. However, the benefit of
    // these letters as Non-Final letters outweighs the damage since these words
    // are quite rare.
    return NON_FINAL_LETTERS.indexOf(c) !== -1;
  };

  /**
   * Final letter analysis for the logical-visual decision.
   *
   * The following cases are checked:
   * 1) A word longer than 1 letter, ending with a final letter. This is an
   *    indication that the text is laid out "naturally" since the final
   *    letter really appears at the end. +1 for logical score.
   * 2) A word longer than 1 letter, ending with a Non-Final letter. In normal
   *    Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi should not end
   *    with the Non-Final form of that letter (exceptions are described in
   *    isNonFinal()). This is an indication that the text is laid out
   *    backwards. +1 for visual score.
   * 3) A word longer than 1 letter, starting with a final letter. Final
   *    letters should not appear at the beginning of a word. This is an
   *    indication that the text is laid out backwards. +1 for visual score.
   *
   * Both scores accumulate across calls and are compared in getCharsetName().
   * Final letters in the middle of words are not checked since that case is
   * not an indication for either Logical or Visual text.
   *
   * @param {string} aBuf - Next chunk of text to examine.
   * @returns {number} constants.notMe once both model probers have given up,
   *   otherwise constants.detecting.
   */
  this.feed = function(aBuf) {
    if (this.getState() === constants.notMe) {
      // Both model probers say it's not them. No reason to continue.
      return constants.notMe;
    }

    // All 7-bit characters are filtered out (replaced with spaces) so the
    // word boundary detection works properly. [MAP]
    var filtered = this.filterHighBitOnly(aBuf);

    for (var i = 0; i < filtered.length; i++) {
      var cur = filtered[i];
      if (cur === WORD_DELIMITER) {
        // We stand on a space - a word just ended.
        if (this._mBeforePrev !== WORD_DELIMITER) {
          // The next-to-last char was not a space, so _mPrev does not belong
          // to a 1 letter word.
          if (this.isFinal(this._mPrev)) {
            // case (1) [-2:not space][-1:final letter][cur:space]
            this._mFinalCharLogicalScore++;
          } else if (this.isNonFinal(this._mPrev)) {
            // case (2) [-2:not space][-1:Non-Final letter][cur:space]
            this._mFinalCharVisualScore++;
          }
        }
      } else if (this._mBeforePrev === WORD_DELIMITER && this.isFinal(this._mPrev)) {
        // case (3) [-2:space][-1:final letter][cur:not space]
        this._mFinalCharVisualScore++;
      }
      this._mBeforePrev = this._mPrev;
      this._mPrev = cur;
    }

    // Forever detecting, till the end or until both model probers report
    // constants.notMe (handled above).
    return constants.detecting;
  };

  /**
   * Makes the decision: is it Logical or Visual?
   *
   * @returns {string} "windows-1255" for logical Hebrew or "ISO-8859-8" for
   *   visual Hebrew.
   */
  this.getCharsetName = function() {
    // If the final letter score distance is dominant enough, rely on it.
    var finalsub = this._mFinalCharLogicalScore - this._mFinalCharVisualScore;
    if (finalsub >= MIN_FINAL_CHAR_DISTANCE) {
      return LOGICAL_HEBREW_NAME;
    }
    if (finalsub <= -MIN_FINAL_CHAR_DISTANCE) {
      return VISUAL_HEBREW_NAME;
    }

    // It's not dominant enough, try to rely on the model scores instead.
    var modelsub = this._mLogicalProber.getConfidence() - this._mVisualProber.getConfidence();
    if (modelsub > MIN_MODEL_DISTANCE) {
      return LOGICAL_HEBREW_NAME;
    }
    if (modelsub < -MIN_MODEL_DISTANCE) {
      return VISUAL_HEBREW_NAME;
    }

    // Still no good, back to final letter distance, maybe it'll save the day.
    if (finalsub < 0) {
      return VISUAL_HEBREW_NAME;
    }

    // (finalsub > 0 - Logical) or (don't know what to do) default to Logical.
    return LOGICAL_HEBREW_NAME;
  };

  /**
   * @returns {number} constants.notMe when both model probers report
   *   constants.notMe, otherwise constants.detecting.
   */
  this.getState = function() {
    // Remain active as long as any of the model probers are active.
    if (this._mLogicalProber.getState() === constants.notMe &&
        this._mVisualProber.getState() === constants.notMe) {
      return constants.notMe;
    }
    return constants.detecting;
  };

  this.reset();
}
// Chain HebrewProber instances to a CharSetProber instance so that members
// defined on the base prober are reachable through the prototype.
HebrewProber.prototype = new CharSetProber();

// The constructor is the module's sole export.
module.exports = HebrewProber;