// blob: 5a778c2638f632378e2546277c7e0df979bdcb2b [file] [log] [blame]
/****************************************************************************
**
** Copyright (C) 2016 The Qt Company Ltd.
** Contact: https://www.qt.io/licensing/
**
** This file is part of the QtCore module of the Qt Toolkit.
**
** $QT_BEGIN_LICENSE:LGPL$
** Commercial License Usage
** Licensees holding valid commercial Qt licenses may use this file in
** accordance with the commercial license agreement provided with the
** Software or, alternatively, in accordance with the terms contained in
** a written agreement between you and The Qt Company. For licensing terms
** and conditions see https://www.qt.io/terms-conditions. For further
** information use the contact form at https://www.qt.io/contact-us.
**
** GNU Lesser General Public License Usage
** Alternatively, this file may be used under the terms of the GNU Lesser
** General Public License version 3 as published by the Free Software
** Foundation and appearing in the file LICENSE.LGPL3 included in the
** packaging of this file. Please review the following information to
** ensure the GNU Lesser General Public License version 3 requirements
** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
**
** GNU General Public License Usage
** Alternatively, this file may be used under the terms of the GNU
** General Public License version 2.0 or (at your option) the GNU General
** Public license version 3 or any later version approved by the KDE Free
** Qt Foundation. The licenses are as published by the Free Software
** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
** included in the packaging of this file. Please review the following
** information to ensure the GNU General Public License requirements will
** be met: https://www.gnu.org/licenses/gpl-2.0.html and
** https://www.gnu.org/licenses/gpl-3.0.html.
**
** $QT_END_LICENSE$
**
****************************************************************************/
#include "qicucodec_p.h"
#include "qtextcodec_p.h"
#include "qutfcodec_p.h"
#include "qlatincodec_p.h"
#include "qsimplecodec_p.h"
#include "private/qcoreglobaldata_p.h"
#include "qdebug.h"
#include "unicode/ucnv.h"
#if QT_CONFIG(codecs)
#include "qtsciicodec_p.h"
#include "qisciicodec_p.h"
#endif
QT_BEGIN_NAMESPACE
// Const-iterator shorthands: one for lists of codec pointers, one for lists of
// byte arrays (codec names / aliases).
typedef QList<QTextCodec*>::ConstIterator TextCodecListConstIt;
typedef QList<QByteArray>::ConstIterator ByteArrayListConstIt;
// Cleanup hook stored in a ConverterState by getConverter(): closes the ICU
// converter that was parked in state->d.
static void qIcuCodecStateFree(QTextCodec::ConverterState *state)
{
    UConverter *converter = static_cast<UConverter *>(state->d);
    ucnv_close(converter);
}
// Returns true when ICU's converter-name comparison (ucnv_compareNames)
// reports that n and h name the same converter.
bool qTextCodecNameMatch(const char *n, const char *h)
{
    const int difference = ucnv_compareNames(n, h);
    return difference == 0;
}
/* The list below is generated from http://www.iana.org/assignments/character-sets/
using the snippet of code below:
#include <QtCore>
#include <unicode/ucnv.h>
int main(int argc, char **argv)
{
QCoreApplication app(argc, argv);
QFile file("character-sets.txt");
file.open(QFile::ReadOnly);
QByteArray name;
int mib = -1;
QByteArray nameList;
int pos = 0;
while (!file.atEnd()) {
QByteArray s = file.readLine().trimmed();
if (s.isEmpty()) {
if (mib != -1) {
UErrorCode error = U_ZERO_ERROR;
const char *standard_name = ucnv_getStandardName(name, "MIME", &error);
if (U_FAILURE(error) || !standard_name) {
error = U_ZERO_ERROR;
standard_name = ucnv_getStandardName(name, "IANA", &error);
}
UConverter *conv = ucnv_open(standard_name, &error);
if (!U_FAILURE(error) && conv && standard_name) {
ucnv_close(conv);
printf(" { %d, %d },\n", mib, pos);
nameList += "\"";
nameList += standard_name;
nameList += "\\0\"\n";
pos += strlen(standard_name) + 1;
}
}
name = QByteArray();
mib = -1;
}
if (s.startsWith("Name: ")) {
name = s.mid(5).trimmed();
if (name.indexOf(' ') > 0)
name = name.left(name.indexOf(' '));
}
if (s.startsWith("MIBenum:"))
mib = s.mid(8).trimmed().toInt();
if (s.startsWith("Alias:") && s.contains("MIME")) {
name = s.mid(6).trimmed();
name = name.left(name.indexOf(' ')).trimmed();
}
}
qDebug() << nameList;
}
*/
// One row of the MIB lookup table: a MIBenum value paired with the byte offset
// of the matching NUL-terminated codec name inside mibToNameTable.
struct MibToName {
short mib;
short index;
};
// Generated table mapping MIBenum values to offsets into mibToNameTable.
// Rows are not sorted by MIB (e.g. 2011 follows 2043), so lookups scan the
// table linearly. Each index must equal the cumulative length (including the
// terminating NUL) of all names that precede it in mibToNameTable.
static const MibToName mibToName[] = {
{ 3, 0 },
{ 4, 9 },
{ 5, 20 },
{ 6, 31 },
{ 7, 42 },
{ 8, 53 },
{ 9, 64 },
{ 10, 75 },
{ 11, 86 },
{ 12, 97 },
{ 13, 108 },
{ 16, 120 },
{ 17, 134 },
{ 18, 144 },
{ 30, 151 },
{ 36, 160 },
{ 37, 167 },
{ 38, 179 },
{ 39, 186 },
{ 40, 198 },
{ 57, 212 },
{ 81, 223 },
{ 82, 234 },
{ 84, 245 },
{ 85, 256 },
{ 104, 267 },
{ 105, 279 },
{ 106, 295 },
{ 109, 301 },
{ 110, 313 },
{ 111, 325 },
{ 113, 337 },
{ 114, 341 },
{ 1000, 349 },
{ 1001, 356 },
{ 1011, 363 },
{ 1012, 368 },
{ 1013, 374 },
{ 1014, 383 },
{ 1015, 392 },
{ 1016, 399 },
{ 1017, 406 },
{ 1018, 413 },
{ 1019, 422 },
{ 1020, 431 },
{ 2004, 438 },
{ 2005, 448 },
{ 2009, 472 },
{ 2013, 479 },
{ 2016, 486 },
{ 2024, 495 },
{ 2025, 505 },
{ 2026, 512 },
{ 2027, 517 },
{ 2028, 527 },
{ 2030, 534 },
{ 2033, 541 },
{ 2034, 548 },
{ 2035, 555 },
{ 2037, 562 },
{ 2038, 569 },
{ 2039, 576 },
{ 2040, 583 },
{ 2041, 590 },
{ 2043, 597 },
{ 2011, 604 },
{ 2044, 611 },
{ 2045, 618 },
{ 2010, 624 },
{ 2046, 631 },
{ 2047, 638 },
{ 2048, 645 },
{ 2049, 652 },
{ 2050, 659 },
{ 2051, 666 },
{ 2052, 673 },
{ 2053, 680 },
{ 2054, 687 },
{ 2055, 694 },
{ 2056, 701 },
{ 2062, 708 },
{ 2063, 715 },
{ 2084, 723 },
{ 2085, 730 },
{ 2086, 741 },
{ 2087, 748 },
{ 2088, 755 },
{ 2089, 762 },
{ 2091, 771 },
{ 2092, 780 },
{ 2093, 789 },
{ 2094, 798 },
{ 2095, 807 },
{ 2096, 816 },
{ 2097, 825 },
{ 2098, 834 },
{ 2099, 843 },
{ 2100, 852 },
{ 2101, 861 },
{ 2102, 872 },
{ 2250, 880 },
{ 2251, 893 },
{ 2252, 906 },
{ 2253, 919 },
{ 2254, 932 },
{ 2255, 945 },
{ 2256, 958 },
{ 2257, 971 },
{ 2258, 984 },
{ 2259, 997 },
};
// Number of rows in mibToName, derived from the array so it tracks the table.
int mibToNameSize = sizeof(mibToName) / sizeof(mibToName[0]);
// Generated string pool: codec names stored back to back, each terminated by
// an explicit "\0". mibToName[i].index is the byte offset of the name that
// belongs to mibToName[i].mib, so the order and spelling here must stay in
// step with that table.
static const char mibToNameTable[] =
"US-ASCII\0"
"ISO-8859-1\0"
"ISO-8859-2\0"
"ISO-8859-3\0"
"ISO-8859-4\0"
"ISO-8859-5\0"
"ISO-8859-6\0"
"ISO-8859-7\0"
"ISO-8859-8\0"
"ISO-8859-9\0"
"ISO-8859-10\0"
"ISO-2022-JP-1\0"
"Shift_JIS\0"
"EUC-JP\0"
"US-ASCII\0"
"EUC-KR\0"
"ISO-2022-KR\0"
"EUC-KR\0"
"ISO-2022-JP\0"
"ISO-2022-JP-2\0"
"GB_2312-80\0"
"ISO-8859-6\0"
"ISO-8859-6\0"
"ISO-8859-8\0"
"ISO-8859-8\0"
"ISO-2022-CN\0"
"ISO-2022-CN-EXT\0"
"UTF-8\0"
"ISO-8859-13\0"
"ISO-8859-14\0"
"ISO-8859-15\0"
"GBK\0"
"GB18030\0"
"UTF-16\0"
"UTF-32\0"
"SCSU\0"
"UTF-7\0"
"UTF-16BE\0"
"UTF-16LE\0"
"UTF-16\0"
"CESU-8\0"
"UTF-32\0"
"UTF-32BE\0"
"UTF-32LE\0"
"BOCU-1\0"
"hp-roman8\0"
"Adobe-Standard-Encoding\0"
"IBM850\0"
"IBM862\0"
"IBM-Thai\0"
"Shift_JIS\0"
"GB2312\0"
"Big5\0"
"macintosh\0"
"IBM037\0"
"IBM273\0"
"IBM277\0"
"IBM278\0"
"IBM280\0"
"IBM284\0"
"IBM285\0"
"IBM290\0"
"IBM297\0"
"IBM420\0"
"IBM424\0"
"IBM437\0"
"IBM500\0"
"cp851\0"
"IBM852\0"
"IBM855\0"
"IBM857\0"
"IBM860\0"
"IBM861\0"
"IBM863\0"
"IBM864\0"
"IBM865\0"
"IBM868\0"
"IBM869\0"
"IBM870\0"
"IBM871\0"
"IBM918\0"
"IBM1026\0"
"KOI8-R\0"
"HZ-GB-2312\0"
"IBM866\0"
"IBM775\0"
"KOI8-U\0"
"IBM00858\0"
"IBM01140\0"
"IBM01141\0"
"IBM01142\0"
"IBM01143\0"
"IBM01144\0"
"IBM01145\0"
"IBM01146\0"
"IBM01147\0"
"IBM01148\0"
"IBM01149\0"
"Big5-HKSCS\0"
"IBM1047\0"
"windows-1250\0"
"windows-1251\0"
"windows-1252\0"
"windows-1253\0"
"windows-1254\0"
"windows-1255\0"
"windows-1256\0"
"windows-1257\0"
"windows-1258\0"
"TIS-620\0";
/*
    Creates the codec that Qt implements itself for \a name, or returns 0 when
    \a name is null or is not one of the names handled here.

    Names are compared case-insensitively. The caller passes the raw,
    user-supplied name whenever ICU could not resolve it to a standard name
    (which is always the case for the Qt-only codecs such as TSCII or
    ISO-8859-16), and codec lookup by name is meant to be case-insensitive.
    qstricmp() also tolerates a null pointer, unlike strcmp().
*/
static QTextCodec *loadQtCodec(const char *name)
{
    if (!name)
        return 0;
    if (!qstricmp(name, "UTF-8"))
        return new QUtf8Codec;
    if (!qstricmp(name, "UTF-16"))
        return new QUtf16Codec;
    if (!qstricmp(name, "ISO-8859-1"))
        return new QLatin1Codec;
    if (!qstricmp(name, "UTF-16BE"))
        return new QUtf16BECodec;
    if (!qstricmp(name, "UTF-16LE"))
        return new QUtf16LECodec;
    if (!qstricmp(name, "UTF-32"))
        return new QUtf32Codec;
    if (!qstricmp(name, "UTF-32BE"))
        return new QUtf32BECodec;
    if (!qstricmp(name, "UTF-32LE"))
        return new QUtf32LECodec;
    if (!qstricmp(name, "ISO-8859-16") || !qstricmp(name, "latin10") || !qstricmp(name, "iso-ir-226"))
        return new QSimpleTextCodec(13 /* == 8859-16*/);
#if QT_CONFIG(codecs)
    if (!qstricmp(name, "TSCII"))
        return new QTsciiCodec;
    // Any name starting with "iscii" is handed to the ISCII factory, which
    // decides which variant (if any) it names.
    if (!qstrnicmp(name, "iscii", 5))
        return QIsciiCodec::create(name);
#endif
    return 0;
}
/// \threadsafe
QList<QByteArray> QIcuCodec::availableCodecs()
{
QList<QByteArray> codecs;
int n = ucnv_countAvailable();
for (int i = 0; i < n; ++i) {
const char *name = ucnv_getAvailableName(i);
UErrorCode error = U_ZERO_ERROR;
const char *standardName = ucnv_getStandardName(name, "MIME", &error);
if (U_FAILURE(error) || !standardName) {
error = U_ZERO_ERROR;
standardName = ucnv_getStandardName(name, "IANA", &error);
}
if (U_FAILURE(error))
continue;
error = U_ZERO_ERROR;
int ac = ucnv_countAliases(standardName, &error);
if (U_FAILURE(error))
continue;
for (int j = 0; j < ac; ++j) {
error = U_ZERO_ERROR;
const char *alias = ucnv_getAlias(standardName, j, &error);
if (!U_SUCCESS(error))
continue;
codecs += alias;
}
}
// handled by Qt and not in ICU:
codecs += "TSCII";
return codecs;
}
/// \threadsafe
QList<int> QIcuCodec::availableMibs()
{
QList<int> mibs;
mibs.reserve(mibToNameSize + 1);
for (int i = 0; i < mibToNameSize; ++i)
mibs += mibToName[i].mib;
// handled by Qt and not in ICU:
mibs += 2107; // TSCII
return mibs;
}
/*
    Returns the codec for the locale, resolving and storing it in the global
    data on first use. Returns 0 when the global data is unavailable; the
    result may also be 0 if the default name cannot be resolved to a codec.
*/
QTextCodec *QIcuCodec::defaultCodecUnlocked()
{
    QCoreGlobalData *const globals = QCoreGlobalData::instance();
    if (!globals)
        return 0;

    // Fast path: a codec has already been stored.
    if (QTextCodec *stored = globals->codecForLocale.loadAcquire())
        return stored;

#if defined(QT_LOCALE_IS_UTF8)
    const char *localeCodecName = "UTF-8";
#else
    const char *localeCodecName = ucnv_getDefaultName();
#endif

    QTextCodec *resolved = codecForNameUnlocked(localeCodecName);
    globals->codecForLocale.storeRelease(resolved);
    return resolved;
}
/*
    Resolves \a name to a codec, creating one if necessary.

    The name is first normalised to ICU's MIME (or, failing that, IANA)
    standard name. The lookup order is then: the global codec cache, the list
    of already registered codecs (by name and alias), the codecs implemented by
    Qt itself, and finally a new QIcuCodec if ICU can open a converter for the
    standard name.

    Returns 0 when \a name is null, when the global data is unavailable, or
    when no codec can be found or created.
*/
QTextCodec *QIcuCodec::codecForNameUnlocked(const char *name)
{
    // A null name cannot match anything and must not reach the name
    // comparison helpers below.
    if (!name)
        return 0;

    // backwards compatibility with Qt 4.x
    if (!qstrcmp(name, "CP949"))
        name = "windows-949";
    else if (!qstrcmp(name, "Apple Roman"))
        name = "macintosh";
    // these are broken data in ICU 4.4, and can't be resolved even though they are aliases to tis-620
    if (!qstrcmp(name, "windows-874-2000")
        || !qstrcmp(name, "windows-874")
        || !qstrcmp(name, "MS874")
        || !qstrcmp(name, "x-windows-874")
        || !qstrcmp(name, "ISO 8859-11"))
        name = "TIS-620";

    UErrorCode error = U_ZERO_ERROR;
    // MIME gives better default names
    const char *standardName = ucnv_getStandardName(name, "MIME", &error);
    if (U_FAILURE(error) || !standardName) {
        error = U_ZERO_ERROR;
        standardName = ucnv_getStandardName(name, "IANA", &error);
    }
    // qt_only: ICU does not know the name, so only Qt's own codecs can match.
    bool qt_only = false;
    if (U_FAILURE(error) || !standardName) {
        standardName = name;
        qt_only = true;
    } else {
        // correct some issues where the ICU data set contains duplicated entries.
        // Where this happens it's because one data set is a subset of another. We
        // always use the larger data set.
        if (qstrcmp(standardName, "GB2312") == 0 || qstrcmp(standardName, "GB_2312-80") == 0)
            standardName = "GBK";
        else if (qstrcmp(standardName, "KSC_5601") == 0 || qstrcmp(standardName, "EUC-KR") == 0 || qstrcmp(standardName, "cp1363") == 0)
            standardName = "windows-949";
    }

    QCoreGlobalData *globalData = QCoreGlobalData::instance();
    // Same guard as defaultCodecUnlocked(): without global data there is
    // neither a cache nor a codec list to consult.
    if (!globalData)
        return 0;
    // The cache is a member of the global data, so it always exists here.
    QTextCodecCache &cache = globalData->codecCache;

    if (QTextCodec *cached = cache.value(standardName))
        return cached;

    // Search the codecs that are already registered, by name and by alias.
    for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) {
        QTextCodec *cursor = *it;
        if (qTextCodecNameMatch(cursor->name(), standardName)) {
            cache.insert(standardName, cursor);
            return cursor;
        }
        const QList<QByteArray> aliases = cursor->aliases();
        for (ByteArrayListConstIt ait = aliases.constBegin(), acend = aliases.constEnd(); ait != acend; ++ait) {
            if (qTextCodecNameMatch(*ait, standardName)) {
                cache.insert(standardName, cursor);
                return cursor;
            }
        }
    }

    // NOTE(review): the Qt-implemented codec is returned without being put in
    // the cache here; presumably it registers itself in allCodecs on
    // construction so the loop above finds and caches it next time - confirm.
    QTextCodec *c = loadQtCodec(standardName);
    if (c)
        return c;
    if (qt_only)
        return 0;

    // check whether there is really a converter for the name available.
    UConverter *conv = ucnv_open(standardName, &error);
    if (!conv) {
        qDebug("codecForName: ucnv_open failed %s %s", standardName, u_errorName(error));
        return 0;
    }
    //qDebug() << "QIcuCodec: Standard name for " << name << "is" << standardName;
    ucnv_close(conv);

    c = new QIcuCodec(standardName);
    cache.insert(standardName, c);
    return c;
}
/*
    Looks \a mib up in mibToName and resolves the associated name through
    codecForNameUnlocked(). MIB 2107 (TSCII) is not in the table and is handled
    separately. Returns 0 for any other unknown MIB.
*/
QTextCodec *QIcuCodec::codecForMibUnlocked(int mib)
{
    // Find the first row carrying this MIB, if any.
    int row = 0;
    while (row < mibToNameSize && mibToName[row].mib != mib)
        ++row;

    if (row < mibToNameSize)
        return codecForNameUnlocked(mibToNameTable + mibToName[row].index);

    return mib == 2107 ? codecForNameUnlocked("TSCII") : 0;
}
// Records \a name as the ICU converter name for this codec. No converter is
// opened here; that happens in getConverter().
// NOTE(review): m_name presumably stores the pointer rather than a copy, so
// the string would have to outlive the codec - confirm against the header.
QIcuCodec::QIcuCodec(const char *name)
: m_name(name)
{
}
// Nothing to release: the codec itself holds no ICU converter. Converters are
// either closed by the conversion functions or via the ConverterState's free
// function.
QIcuCodec::~QIcuCodec()
{
}
/*
    Returns an ICU converter for this codec.

    With a \a state, the converter is opened on first use, stored in state->d
    and released later through qIcuCodecStateFree(); subsequent calls reuse
    it. Without a state (or if the state's converter could not be opened) a
    fresh converter is opened; when \a state is null the caller has to close
    it. May return 0 if ICU cannot open the converter.
*/
UConverter *QIcuCodec::getConverter(QTextCodec::ConverterState *state) const
{
    if (state) {
        if (!state->d) {
            // first time: register the cleanup hook, then open the converter
            state->flags |= QTextCodec::FreeFunction;
            QTextCodecUnalignedPointer::encode(state->state_data, qIcuCodecStateFree);

            UErrorCode status = U_ZERO_ERROR;
            state->d = ucnv_open(m_name, &status);

            // Unconvertible characters become NUL or '?' depending on the flags.
            const char *substitute = (state->flags & QTextCodec::ConvertInvalidToNull) ? "\0" : "?";
            ucnv_setSubstChars(static_cast<UConverter *>(state->d), substitute, 1, &status);
            if (U_FAILURE(status))
                qDebug("getConverter(state) ucnv_open failed %s %s", m_name, u_errorName(status));
        }
        if (state->d)
            return static_cast<UConverter *>(state->d);
    }

    // stateless conversion (also reached when the state's converter is missing)
    UErrorCode status = U_ZERO_ERROR;
    UConverter *fresh = ucnv_open(m_name, &status);
    ucnv_setSubstChars(fresh, "?", 1, &status);
    if (U_FAILURE(status))
        qDebug("getConverter(no state) ucnv_open failed %s %s", m_name, u_errorName(status));
    return fresh;
}
/*
    Decodes \a length bytes at \a chars into a QString using the converter
    from getConverter(). The output buffer starts at length + 2 code units and
    is doubled whenever ICU has not consumed all the input. Any ICU error
    other than a buffer overflow is logged and ends the conversion with what
    was produced by the preceding passes. The converter is closed here only
    when no \a state was given.
*/
QString QIcuCodec::convertToUnicode(const char *chars, int length, QTextCodec::ConverterState *state) const
{
    UConverter *conv = getConverter(state);

    QString result(length + 2, Qt::Uninitialized);
    const char *const inputEnd = chars + length;
    int produced = 0;

    for (;;) {
        // Re-derive the pointers on every pass: resize() may reallocate.
        UChar *const bufferStart = reinterpret_cast<UChar *>(result.data());
        UChar *const bufferEnd = bufferStart + result.length();
        UChar *writePos = bufferStart + produced;

        UErrorCode status = U_ZERO_ERROR;
        ucnv_toUnicode(conv,
                       &writePos, bufferEnd,
                       &chars, inputEnd,
                       0, false, &status);

        const bool overflowed = (status == U_BUFFER_OVERFLOW_ERROR);
        if (!U_SUCCESS(status) && !overflowed) {
            qDebug("convertToUnicode failed: %s", u_errorName(status));
            break;
        }

        produced = writePos - bufferStart;
        if (chars >= inputEnd)
            break;

        // Input left over: grow the buffer and continue where we stopped.
        result.resize(result.length() * 2);
    }

    result.resize(produced);
    if (!state)
        ucnv_close(conv);
    return result;
}
/*
    Encodes \a length UTF-16 code units at \a unicode into a QByteArray using
    the converter from getConverter().

    Returns an empty array when no converter could be opened. The output
    buffer is sized with UCNV_GET_MAX_BYTES_FOR_STRING and doubled if ICU still
    reports a buffer overflow. Any other ICU error is logged and ends the
    conversion, returning the bytes produced so far. The converter is closed
    here only when no \a state was given.
*/
QByteArray QIcuCodec::convertFromUnicode(const QChar *unicode, int length, QTextCodec::ConverterState *state) const
{
    UConverter *conv = getConverter(state);
    // getConverter() has already logged the failure. Bail out before
    // ucnv_getMaxCharSize(), which is not documented to accept a null
    // converter.
    if (!conv)
        return QByteArray();

    int requiredLength = UCNV_GET_MAX_BYTES_FOR_STRING(length, ucnv_getMaxCharSize(conv));
    QByteArray string(requiredLength, Qt::Uninitialized);
    const UChar *uc = (const UChar *)unicode;
    const UChar *end = uc + length;
    int convertedChars = 0;
    while (1) {
        // Re-derive the pointers on every pass: resize() may reallocate.
        char *ch = (char *)string.data();
        char *chEnd = ch + string.length();
        ch += convertedChars;
        UErrorCode error = U_ZERO_ERROR;
        ucnv_fromUnicode(conv,
                         &ch, chEnd,
                         &uc, end,
                         0, false, &error);
        convertedChars = ch - string.data();
        // A buffer overflow just means "grow and continue". Any other failure
        // will not go away by retrying: without this break the input pointer
        // never advances and the buffer is doubled forever.
        if (!U_SUCCESS(error) && error != U_BUFFER_OVERFLOW_ERROR) {
            qDebug("convertFromUnicode failed: %s", u_errorName(error));
            break;
        }
        if (uc >= end)
            break;
        string.resize(string.length()*2);
    }
    string.resize(convertedChars);
    if (!state)
        ucnv_close(conv);
    return string;
}
// Returns the converter name this codec was constructed with.
QByteArray QIcuCodec::name() const
{
    return QByteArray(m_name);
}
// Returns the aliases ICU reports for this codec's name, leaving out entries
// that are null or spelled exactly like the codec's own name.
QList<QByteArray> QIcuCodec::aliases() const
{
    QList<QByteArray> result;
    UErrorCode status = U_ZERO_ERROR;
    const int total = ucnv_countAliases(m_name, &status);
    for (int idx = 0; idx < total; ++idx) {
        const char *candidate = ucnv_getAlias(m_name, idx, &status);
        // keep everything except missing entries and the canonical name
        const bool isOwnName = candidate && qstrcmp(candidate, m_name) == 0;
        if (candidate && !isOwnName)
            result += candidate;
    }
    return result;
}
// Returns the MIBenum of the first mibToName row whose name matches this
// codec's name (per qTextCodecNameMatch), or 0 when no row matches.
int QIcuCodec::mibEnum() const
{
    int row = 0;
    while (row < mibToNameSize) {
        const char *tableName = mibToNameTable + mibToName[row].index;
        if (qTextCodecNameMatch(m_name, tableName))
            break;
        ++row;
    }
    return row < mibToNameSize ? mibToName[row].mib : 0;
}
QT_END_NAMESPACE