// blob: f6f6780f9cfa60b0cc59f6cc682f31e6ca5b7c61 [file] [log] [blame]
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*-
*
* The contents of this file are subject to the Netscape Public
* License Version 1.1 (the "License"); you may not use this file
* except in compliance with the License. You may obtain a copy of
* the License at http://www.mozilla.org/NPL/
*
* Software distributed under the License is distributed on an "AS
* IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
* implied. See the License for the specific language governing
* rights and limitations under the License.
*
* The Original Code is mozilla.org code.
*
* The Initial Developer of the Original Code is Netscape
* Communications Corporation. Portions created by Netscape are
* Copyright (C) 1998 Netscape Communications Corporation. All
* Rights Reserved.
*
* Contributor(s):
*/
package org.mozilla.intl.chardet ;
import java.lang.* ;
/**
 * Charset detector that runs a set of {@code nsVerifier} state machines in
 * parallel over the input bytes.
 *
 * <p>Every input byte is fed to each verifier that is still a candidate.
 * A verifier that answers {@code nsVerifier.eItsMe} wins immediately; one
 * that answers {@code nsVerifier.eError} is dropped from the candidate list.
 * When the state machines alone cannot decide, and byte-frequency
 * statistics are available for more than one remaining candidate, an
 * {@code nsEUCSampler} is used to pick the best-scoring candidate.
 *
 * <p>Subclasses receive the result through {@link #Report(String)}.
 */
public abstract class nsPSMDetector {

  /** Language hint: use every known verifier. */
  public static final int ALL = 0;
  /** Language hint: Japanese verifier set. */
  public static final int JAPANESE = 1;
  /** Language hint: combined simplified + traditional Chinese verifier set. */
  public static final int CHINESE = 2;
  /** Language hint: simplified Chinese verifier set. */
  public static final int SIMPLIFIED_CHINESE = 3;
  /** Language hint: traditional Chinese verifier set. */
  public static final int TRADITIONAL_CHINESE = 4;
  /** Language hint: Korean verifier set. */
  public static final int KOREAN = 5;

  /** Number of language hints; valid hints are {@code 0 .. NO_OF_LANGUAGES - 1}. */
  public static final int NO_OF_LANGUAGES = 6;
  /** Capacity of the per-candidate bookkeeping arrays. */
  public static final int MAX_VERIFIERS = 16;

  /** Full verifier set for this detector; indexed through {@link #mItemIdx}. */
  nsVerifier[] mVerifier;
  /** Optional statistics, parallel to {@link #mVerifier}; may be null or hold null entries. */
  nsEUCStatistics[] mStatisticsData;
  /** Byte-frequency sampler used when the state machines cannot decide. */
  nsEUCSampler mSampler = new nsEUCSampler();
  /** Current state of each remaining candidate, by candidate slot. */
  byte[] mState = new byte[MAX_VERIFIERS];
  /** Maps a candidate slot to its index in {@link #mVerifier}. */
  int[] mItemIdx = new int[MAX_VERIFIERS];
  /** Number of candidate slots still in play. */
  int mItems;
  /** Number of candidates to start with after a {@link #Reset()}. */
  int mClassItems;
  /** Set once detection has finished (with or without a report). */
  boolean mDone;
  /** Whether the sampler should currently be consulted. */
  boolean mRunSampler;
  /** Value {@link #mRunSampler} is restored to by {@link #Reset()}. */
  boolean mClassRunSampler;

  /** Creates a detector using the {@link #ALL} verifier set. */
  public nsPSMDetector() {
    initVerifiers(nsPSMDetector.ALL);
    Reset();
  }

  /**
   * Creates a detector for the given language hint.
   *
   * @param langFlag one of the language constants; any other value is
   *                 treated as {@link #ALL}
   */
  public nsPSMDetector(int langFlag) {
    initVerifiers(langFlag);
    Reset();
  }

  /**
   * Creates a detector over a caller-supplied verifier set.
   *
   * @param aItems         number of verifiers to use from {@code aVerifierSet}
   * @param aVerifierSet   the verifiers to run
   * @param aStatisticsSet statistics parallel to {@code aVerifierSet}, or
   *                       {@code null} to disable frequency sampling
   * @throws IllegalArgumentException if {@code aItems} exceeds
   *                                  {@link #MAX_VERIFIERS}
   */
  public nsPSMDetector(int aItems, nsVerifier[] aVerifierSet,
                       nsEUCStatistics[] aStatisticsSet) {
    // The bookkeeping arrays are fixed at MAX_VERIFIERS entries; fail with a
    // clear message instead of an index error inside Reset().
    if (aItems > MAX_VERIFIERS) {
      throw new IllegalArgumentException(
          "aItems (" + aItems + ") exceeds MAX_VERIFIERS (" + MAX_VERIFIERS + ")");
    }
    mClassRunSampler = (aStatisticsSet != null);
    mStatisticsData = aStatisticsSet;
    mVerifier = aVerifierSet;
    mClassItems = aItems;
    Reset();
  }

  /**
   * Returns the detector to its initial state: all configured verifiers
   * become candidates again, each in state 0, and the sampler is reset.
   */
  public void Reset() {
    mRunSampler = mClassRunSampler;
    mDone = false;
    mItems = mClassItems;
    for (int slot = 0; slot < mItems; slot++) {
      mState[slot] = 0;
      mItemIdx[slot] = slot;
    }
    mSampler.Reset();
  }

  /**
   * Installs the verifier set (and, where defined, the matching statistics)
   * for a language hint.
   *
   * @param currVerSet one of the language constants; values outside
   *                   {@code 0 .. NO_OF_LANGUAGES - 1} select {@link #ALL}
   */
  protected void initVerifiers(int currVerSet) {
    mVerifier = null;
    mStatisticsData = null;

    switch (currVerSet) {
      case TRADITIONAL_CHINESE:
        mVerifier = new nsVerifier[] {
          new nsUTF8Verifier(),
          new nsBIG5Verifier(),
          new nsISO2022CNVerifier(),
          new nsEUCTWVerifier(),
          new nsCP1252Verifier(),
          new nsUCS2BEVerifier(),
          new nsUCS2LEVerifier()
        };
        mStatisticsData = new nsEUCStatistics[] {
          null,
          new Big5Statistics(),
          null,
          new EUCTWStatistics(),
          null,
          null,
          null
        };
        break;

      case KOREAN:
        mVerifier = new nsVerifier[] {
          new nsUTF8Verifier(),
          new nsEUCKRVerifier(),
          new nsISO2022KRVerifier(),
          new nsCP1252Verifier(),
          new nsUCS2BEVerifier(),
          new nsUCS2LEVerifier()
        };
        break;

      case SIMPLIFIED_CHINESE:
        mVerifier = new nsVerifier[] {
          new nsUTF8Verifier(),
          new nsGB2312Verifier(),
          new nsGB18030Verifier(),
          new nsISO2022CNVerifier(),
          new nsHZVerifier(),
          new nsCP1252Verifier(),
          new nsUCS2BEVerifier(),
          new nsUCS2LEVerifier()
        };
        break;

      case JAPANESE:
        mVerifier = new nsVerifier[] {
          new nsUTF8Verifier(),
          new nsSJISVerifier(),
          new nsEUCJPVerifier(),
          new nsISO2022JPVerifier(),
          new nsCP1252Verifier(),
          new nsUCS2BEVerifier(),
          new nsUCS2LEVerifier()
        };
        break;

      case CHINESE:
        mVerifier = new nsVerifier[] {
          new nsUTF8Verifier(),
          new nsGB2312Verifier(),
          new nsGB18030Verifier(),
          new nsBIG5Verifier(),
          new nsISO2022CNVerifier(),
          new nsHZVerifier(),
          new nsEUCTWVerifier(),
          new nsCP1252Verifier(),
          new nsUCS2BEVerifier(),
          new nsUCS2LEVerifier()
        };
        mStatisticsData = new nsEUCStatistics[] {
          null,
          new GB2312Statistics(),
          null,
          new Big5Statistics(),
          null,
          null,
          new EUCTWStatistics(),
          null,
          null,
          null
        };
        break;

      case ALL:
      default:
        // Unknown hints fall back to the complete verifier set.
        mVerifier = new nsVerifier[] {
          new nsUTF8Verifier(),
          new nsSJISVerifier(),
          new nsEUCJPVerifier(),
          new nsISO2022JPVerifier(),
          new nsEUCKRVerifier(),
          new nsISO2022KRVerifier(),
          new nsBIG5Verifier(),
          new nsEUCTWVerifier(),
          new nsGB2312Verifier(),
          new nsGB18030Verifier(),
          new nsISO2022CNVerifier(),
          new nsHZVerifier(),
          new nsCP1252Verifier(),
          new nsUCS2BEVerifier(),
          new nsUCS2LEVerifier()
        };
        mStatisticsData = new nsEUCStatistics[] {
          null,
          null,
          new EUCJPStatistics(),
          null,
          new EUCKRStatistics(),
          null,
          new Big5Statistics(),
          new EUCTWStatistics(),
          new GB2312Statistics(),
          null,
          null,
          null,
          null,
          null,
          null
        };
        break;
    }

    mClassRunSampler = (mStatisticsData != null);
    mClassItems = mVerifier.length;
  }

  /**
   * Called when a charset has been chosen.
   *
   * @param charset name of the detected charset, as returned by the winning
   *                verifier's {@code charset()}
   */
  public abstract void Report(String charset);

  /**
   * Feeds {@code len} bytes of {@code aBuf} to the remaining verifiers.
   *
   * <p>Detection ends, and {@link #mDone} is set, as soon as one verifier
   * identifies itself, at most one candidate is left, or exactly one
   * non-UCS2 candidate is left. If the whole buffer is consumed without a
   * decision and sampling is enabled, the buffer is passed to
   * {@link #Sample(byte[], int)}.
   *
   * <p>This method does not check {@link #mDone} on entry; callers should
   * stop feeding data once it has returned {@code true}.
   *
   * @param aBuf input bytes
   * @param len  number of bytes of {@code aBuf} to examine
   * @return the value of {@link #mDone} after processing
   */
  public boolean HandleData(byte[] aBuf, int len) {
    for (int pos = 0; pos < len; pos++) {
      byte b = aBuf[pos];

      // Advance every candidate by one byte. The slot index only moves
      // forward when the candidate survives, because a failed candidate is
      // overwritten by the last one, which must then be examined itself.
      int slot = 0;
      while (slot < mItems) {
        nsVerifier verifier = mVerifier[mItemIdx[slot]];
        byte next = nsVerifier.getNextState(verifier, b, mState[slot]);

        if (next == nsVerifier.eItsMe) {
          Report(verifier.charset());
          mDone = true;
          return mDone;
        }

        if (next == nsVerifier.eError) {
          mItems--;
          if (slot < mItems) {
            mItemIdx[slot] = mItemIdx[mItems];
            mState[slot] = mState[mItems];
          }
        } else {
          mState[slot++] = next;
        }
      }

      if (mItems <= 1) {
        // Zero candidates: finished with nothing to report.
        if (mItems == 1) {
          Report(mVerifier[mItemIdx[0]].charset());
        }
        mDone = true;
        return mDone;
      }

      // Several candidates remain; if only one of them is not a UCS2
      // verifier, that one is reported.
      int nonUCS2Num = 0;
      int nonUCS2Idx = 0;
      for (int j = 0; j < mItems; j++) {
        if (!mVerifier[mItemIdx[j]].isUCS2()) {
          nonUCS2Num++;
          nonUCS2Idx = j;
        }
      }
      if (nonUCS2Num == 1) {
        Report(mVerifier[mItemIdx[nonUCS2Idx]].charset());
        mDone = true;
        return mDone;
      }
    }

    if (mRunSampler) {
      Sample(aBuf, len);
    }
    return mDone;
  }

  /**
   * Signals that no more data will arrive and makes a final attempt to
   * decide. Does nothing if detection has already finished.
   *
   * <p>If exactly two candidates remain and one of them is GB18030, the
   * other one is reported. Afterwards, if sampling is still enabled, the
   * sampler is given a last chance with whatever data it has collected.
   */
  public void DataEnd() {
    if (mDone) {
      return;
    }

    if (mItems == 2) {
      if (mVerifier[mItemIdx[0]].charset().equals("GB18030")) {
        Report(mVerifier[mItemIdx[1]].charset());
        mDone = true;
      } else if (mVerifier[mItemIdx[1]].charset().equals("GB18030")) {
        Report(mVerifier[mItemIdx[0]].charset());
        mDone = true;
      }
    }

    if (mRunSampler) {
      Sample(null, 0, true);
    }
  }

  /**
   * Samples a buffer without treating it as the last chance to decide.
   *
   * @param aBuf input bytes
   * @param aLen number of bytes of {@code aBuf} to sample
   */
  public void Sample(byte[] aBuf, int aLen) {
    Sample(aBuf, aLen, false);
  }

  /**
   * Feeds a buffer to the frequency sampler and, when possible, reports the
   * remaining candidate whose statistics score lowest.
   *
   * <p>Sampling only continues while more than one remaining candidate has
   * statistics. A decision is attempted only when the sampler has enough
   * data (or some data, if {@code aLastChance} is set) and every remaining
   * candidate other than UCS2 and GB18030 has statistics. Big5 counts as a
   * candidate with statistics but is never scored.
   *
   * @param aBuf        input bytes; may be {@code null} when {@code aLen} is 0
   * @param aLen        number of bytes of {@code aBuf} to sample
   * @param aLastChance whether to decide on partial data
   */
  public void Sample(byte[] aBuf, int aLen, boolean aLastChance) {
    int possibleCandidateNum = 0;
    int eucNum = 0;

    for (int j = 0; j < mItems; j++) {
      nsVerifier verifier = mVerifier[mItemIdx[j]];
      if (statisticsAt(j) != null) {
        eucNum++;
      }
      if (!verifier.isUCS2() && !verifier.charset().equals("GB18030")) {
        possibleCandidateNum++;
      }
    }

    mRunSampler = (eucNum > 1);
    if (!mRunSampler) {
      return;
    }

    mRunSampler = mSampler.Sample(aBuf, aLen);

    boolean sampledEnough =
        (aLastChance && mSampler.GetSomeData()) || mSampler.EnoughData();
    if (!sampledEnough || eucNum != possibleCandidateNum) {
      return;
    }

    mSampler.CalFreq();

    int bestIdx = -1;
    float bestScore = 0.0f;
    boolean firstScore = true;
    for (int j = 0; j < mItems; j++) {
      nsEUCStatistics stats = statisticsAt(j);
      if (stats == null || mVerifier[mItemIdx[j]].charset().equals("Big5")) {
        continue;
      }
      float score = mSampler.GetScore(
          stats.mFirstByteFreq(),
          stats.mFirstByteWeight(),
          stats.mSecondByteFreq(),
          stats.mSecondByteWeight());
      // The first scored candidate is always taken; afterwards a strictly
      // lower score wins.
      if (firstScore || bestScore > score) {
        bestScore = score;
        bestIdx = j;
      }
      firstScore = false;
    }

    if (bestIdx >= 0) {
      Report(mVerifier[mItemIdx[bestIdx]].charset());
      mDone = true;
    }
  }

  /**
   * Lists the charsets of all verifiers that are still candidates.
   *
   * @return the remaining candidates' charset names, or a single-element
   *         array containing {@code "nomatch"} when none remain
   */
  public String[] getProbableCharsets() {
    if (mItems <= 0) {
      return new String[] { "nomatch" };
    }

    String[] charsets = new String[mItems];
    for (int i = 0; i < mItems; i++) {
      charsets[i] = mVerifier[mItemIdx[i]].charset();
    }
    return charsets;
  }

  /**
   * Returns the statistics for the candidate in the given slot, or
   * {@code null} when no statistics array is installed or it has no entry
   * for that candidate.
   */
  private nsEUCStatistics statisticsAt(int slot) {
    if (mStatisticsData == null) {
      return null;
    }
    int idx = mItemIdx[slot];
    return (idx < mStatisticsData.length) ? mStatisticsData[idx] : null;
  }
}