| /**************************************************************************** |
| ** |
| ** Copyright (C) 2016 The Qt Company Ltd. |
| ** Contact: https://www.qt.io/licensing/ |
| ** |
| ** This file is part of the QtCore module of the Qt Toolkit. |
| ** |
| ** $QT_BEGIN_LICENSE:LGPL$ |
| ** Commercial License Usage |
| ** Licensees holding valid commercial Qt licenses may use this file in |
| ** accordance with the commercial license agreement provided with the |
| ** Software or, alternatively, in accordance with the terms contained in |
| ** a written agreement between you and The Qt Company. For licensing terms |
| ** and conditions see https://www.qt.io/terms-conditions. For further |
| ** information use the contact form at https://www.qt.io/contact-us. |
| ** |
| ** GNU Lesser General Public License Usage |
| ** Alternatively, this file may be used under the terms of the GNU Lesser |
| ** General Public License version 3 as published by the Free Software |
| ** Foundation and appearing in the file LICENSE.LGPL3 included in the |
| ** packaging of this file. Please review the following information to |
| ** ensure the GNU Lesser General Public License version 3 requirements |
| ** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. |
| ** |
| ** GNU General Public License Usage |
| ** Alternatively, this file may be used under the terms of the GNU |
| ** General Public License version 2.0 or (at your option) the GNU General |
| ** Public license version 3 or any later version approved by the KDE Free |
| ** Qt Foundation. The licenses are as published by the Free Software |
| ** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 |
| ** included in the packaging of this file. Please review the following |
| ** information to ensure the GNU General Public License requirements will |
| ** be met: https://www.gnu.org/licenses/gpl-2.0.html and |
| ** https://www.gnu.org/licenses/gpl-3.0.html. |
| ** |
| ** $QT_END_LICENSE$ |
| ** |
| ****************************************************************************/ |
| |
| #include "qicucodec_p.h" |
| |
| #include "qtextcodec_p.h" |
| #include "qutfcodec_p.h" |
| #include "qlatincodec_p.h" |
| #include "qsimplecodec_p.h" |
| #include "private/qcoreglobaldata_p.h" |
| #include "qdebug.h" |
| |
| #include "unicode/ucnv.h" |
| |
| #if QT_CONFIG(codecs) |
| #include "qtsciicodec_p.h" |
| #include "qisciicodec_p.h" |
| #endif |
| |
| QT_BEGIN_NAMESPACE |
| |
| typedef QList<QTextCodec*>::ConstIterator TextCodecListConstIt; |
| typedef QList<QByteArray>::ConstIterator ByteArrayListConstIt; |
| |
| static void qIcuCodecStateFree(QTextCodec::ConverterState *state) |
| { |
| ucnv_close(static_cast<UConverter *>(state->d)); |
| } |
| |
| bool qTextCodecNameMatch(const char *n, const char *h) |
| { |
| return ucnv_compareNames(n, h) == 0; |
| } |
| |
| /* The list below is generated from http://www.iana.org/assignments/character-sets/ |
| using the snippet of code below: |
| |
| #include <QtCore> |
| #include <unicode/ucnv.h> |
| |
| int main(int argc, char **argv) |
| { |
| QCoreApplication app(argc, argv); |
| |
| QFile file("character-sets.txt"); |
| file.open(QFile::ReadOnly); |
| QByteArray name; |
| int mib = -1; |
| QByteArray nameList; |
| int pos = 0; |
| while (!file.atEnd()) { |
| QByteArray s = file.readLine().trimmed(); |
| if (s.isEmpty()) { |
| if (mib != -1) { |
| UErrorCode error = U_ZERO_ERROR; |
| const char *standard_name = ucnv_getStandardName(name, "MIME", &error); |
| if (U_FAILURE(error) || !standard_name) { |
| error = U_ZERO_ERROR; |
| standard_name = ucnv_getStandardName(name, "IANA", &error); |
| } |
| UConverter *conv = ucnv_open(standard_name, &error); |
| if (!U_FAILURE(error) && conv && standard_name) { |
| ucnv_close(conv); |
| printf(" { %d, %d },\n", mib, pos); |
| nameList += "\""; |
| nameList += standard_name; |
| nameList += "\\0\"\n"; |
| pos += strlen(standard_name) + 1; |
| } |
| } |
| name = QByteArray(); |
| mib = -1; |
| } |
| if (s.startsWith("Name: ")) { |
| name = s.mid(5).trimmed(); |
| if (name.indexOf(' ') > 0) |
| name = name.left(name.indexOf(' ')); |
| } |
| if (s.startsWith("MIBenum:")) |
| mib = s.mid(8).trimmed().toInt(); |
| if (s.startsWith("Alias:") && s.contains("MIME")) { |
| name = s.mid(6).trimmed(); |
| name = name.left(name.indexOf(' ')).trimmed(); |
| } |
| } |
| qDebug() << nameList; |
| } |
| */ |
| |
// One row of the MIBenum lookup table.
struct MibToName {
    short mib;    // IANA MIBenum value
    short index;  // byte offset of the codec name inside mibToNameTable
};

// MIBenum -> name-offset table. The trailing comment on each row is the
// NUL-terminated name found at that offset in mibToNameTable.
static const MibToName mibToName[] = {
    { 3, 0 },        // US-ASCII
    { 4, 9 },        // ISO-8859-1
    { 5, 20 },       // ISO-8859-2
    { 6, 31 },       // ISO-8859-3
    { 7, 42 },       // ISO-8859-4
    { 8, 53 },       // ISO-8859-5
    { 9, 64 },       // ISO-8859-6
    { 10, 75 },      // ISO-8859-7
    { 11, 86 },      // ISO-8859-8
    { 12, 97 },      // ISO-8859-9
    { 13, 108 },     // ISO-8859-10
    { 16, 120 },     // ISO-2022-JP-1
    { 17, 134 },     // Shift_JIS
    { 18, 144 },     // EUC-JP
    { 30, 151 },     // US-ASCII
    { 36, 160 },     // EUC-KR
    { 37, 167 },     // ISO-2022-KR
    { 38, 179 },     // EUC-KR
    { 39, 186 },     // ISO-2022-JP
    { 40, 198 },     // ISO-2022-JP-2
    { 57, 212 },     // GB_2312-80
    { 81, 223 },     // ISO-8859-6
    { 82, 234 },     // ISO-8859-6
    { 84, 245 },     // ISO-8859-8
    { 85, 256 },     // ISO-8859-8
    { 104, 267 },    // ISO-2022-CN
    { 105, 279 },    // ISO-2022-CN-EXT
    { 106, 295 },    // UTF-8
    { 109, 301 },    // ISO-8859-13
    { 110, 313 },    // ISO-8859-14
    { 111, 325 },    // ISO-8859-15
    { 113, 337 },    // GBK
    { 114, 341 },    // GB18030
    { 1000, 349 },   // UTF-16
    { 1001, 356 },   // UTF-32
    { 1011, 363 },   // SCSU
    { 1012, 368 },   // UTF-7
    { 1013, 374 },   // UTF-16BE
    { 1014, 383 },   // UTF-16LE
    { 1015, 392 },   // UTF-16
    { 1016, 399 },   // CESU-8
    { 1017, 406 },   // UTF-32
    { 1018, 413 },   // UTF-32BE
    { 1019, 422 },   // UTF-32LE
    { 1020, 431 },   // BOCU-1
    { 2004, 438 },   // hp-roman8
    { 2005, 448 },   // Adobe-Standard-Encoding
    { 2009, 472 },   // IBM850
    { 2013, 479 },   // IBM862
    { 2016, 486 },   // IBM-Thai
    { 2024, 495 },   // Shift_JIS
    { 2025, 505 },   // GB2312
    { 2026, 512 },   // Big5
    { 2027, 517 },   // macintosh
    { 2028, 527 },   // IBM037
    { 2030, 534 },   // IBM273
    { 2033, 541 },   // IBM277
    { 2034, 548 },   // IBM278
    { 2035, 555 },   // IBM280
    { 2037, 562 },   // IBM284
    { 2038, 569 },   // IBM285
    { 2039, 576 },   // IBM290
    { 2040, 583 },   // IBM297
    { 2041, 590 },   // IBM420
    { 2043, 597 },   // IBM424
    { 2011, 604 },   // IBM437
    { 2044, 611 },   // IBM500
    { 2045, 618 },   // cp851
    { 2010, 624 },   // IBM852
    { 2046, 631 },   // IBM855
    { 2047, 638 },   // IBM857
    { 2048, 645 },   // IBM860
    { 2049, 652 },   // IBM861
    { 2050, 659 },   // IBM863
    { 2051, 666 },   // IBM864
    { 2052, 673 },   // IBM865
    { 2053, 680 },   // IBM868
    { 2054, 687 },   // IBM869
    { 2055, 694 },   // IBM870
    { 2056, 701 },   // IBM871
    { 2062, 708 },   // IBM918
    { 2063, 715 },   // IBM1026
    { 2084, 723 },   // KOI8-R
    { 2085, 730 },   // HZ-GB-2312
    { 2086, 741 },   // IBM866
    { 2087, 748 },   // IBM775
    { 2088, 755 },   // KOI8-U
    { 2089, 762 },   // IBM00858
    { 2091, 771 },   // IBM01140
    { 2092, 780 },   // IBM01141
    { 2093, 789 },   // IBM01142
    { 2094, 798 },   // IBM01143
    { 2095, 807 },   // IBM01144
    { 2096, 816 },   // IBM01145
    { 2097, 825 },   // IBM01146
    { 2098, 834 },   // IBM01147
    { 2099, 843 },   // IBM01148
    { 2100, 852 },   // IBM01149
    { 2101, 861 },   // Big5-HKSCS
    { 2102, 872 },   // IBM1047
    { 2250, 880 },   // windows-1250
    { 2251, 893 },   // windows-1251
    { 2252, 906 },   // windows-1252
    { 2253, 919 },   // windows-1253
    { 2254, 932 },   // windows-1254
    { 2255, 945 },   // windows-1255
    { 2256, 958 },   // windows-1256
    { 2257, 971 },   // windows-1257
    { 2258, 984 },   // windows-1258
    { 2259, 997 },   // TIS-620
};

// Number of rows in mibToName.
// NOTE(review): this has external linkage; confirm no other translation unit
// refers to it before narrowing it to a file-local constant.
int mibToNameSize = sizeof(mibToName) / sizeof(mibToName[0]);
| |
// Packed, NUL-separated codec names referenced by mibToName[].index.
// The trailing comment on each line is the byte offset at which that name starts.
static const char mibToNameTable[] =
    "US-ASCII\0"                 //   0
    "ISO-8859-1\0"               //   9
    "ISO-8859-2\0"               //  20
    "ISO-8859-3\0"               //  31
    "ISO-8859-4\0"               //  42
    "ISO-8859-5\0"               //  53
    "ISO-8859-6\0"               //  64
    "ISO-8859-7\0"               //  75
    "ISO-8859-8\0"               //  86
    "ISO-8859-9\0"               //  97
    "ISO-8859-10\0"              // 108
    "ISO-2022-JP-1\0"            // 120
    "Shift_JIS\0"                // 134
    "EUC-JP\0"                   // 144
    "US-ASCII\0"                 // 151
    "EUC-KR\0"                   // 160
    "ISO-2022-KR\0"              // 167
    "EUC-KR\0"                   // 179
    "ISO-2022-JP\0"              // 186
    "ISO-2022-JP-2\0"            // 198
    "GB_2312-80\0"               // 212
    "ISO-8859-6\0"               // 223
    "ISO-8859-6\0"               // 234
    "ISO-8859-8\0"               // 245
    "ISO-8859-8\0"               // 256
    "ISO-2022-CN\0"              // 267
    "ISO-2022-CN-EXT\0"          // 279
    "UTF-8\0"                    // 295
    "ISO-8859-13\0"              // 301
    "ISO-8859-14\0"              // 313
    "ISO-8859-15\0"              // 325
    "GBK\0"                      // 337
    "GB18030\0"                  // 341
    "UTF-16\0"                   // 349
    "UTF-32\0"                   // 356
    "SCSU\0"                     // 363
    "UTF-7\0"                    // 368
    "UTF-16BE\0"                 // 374
    "UTF-16LE\0"                 // 383
    "UTF-16\0"                   // 392
    "CESU-8\0"                   // 399
    "UTF-32\0"                   // 406
    "UTF-32BE\0"                 // 413
    "UTF-32LE\0"                 // 422
    "BOCU-1\0"                   // 431
    "hp-roman8\0"                // 438
    "Adobe-Standard-Encoding\0"  // 448
    "IBM850\0"                   // 472
    "IBM862\0"                   // 479
    "IBM-Thai\0"                 // 486
    "Shift_JIS\0"                // 495
    "GB2312\0"                   // 505
    "Big5\0"                     // 512
    "macintosh\0"                // 517
    "IBM037\0"                   // 527
    "IBM273\0"                   // 534
    "IBM277\0"                   // 541
    "IBM278\0"                   // 548
    "IBM280\0"                   // 555
    "IBM284\0"                   // 562
    "IBM285\0"                   // 569
    "IBM290\0"                   // 576
    "IBM297\0"                   // 583
    "IBM420\0"                   // 590
    "IBM424\0"                   // 597
    "IBM437\0"                   // 604
    "IBM500\0"                   // 611
    "cp851\0"                    // 618
    "IBM852\0"                   // 624
    "IBM855\0"                   // 631
    "IBM857\0"                   // 638
    "IBM860\0"                   // 645
    "IBM861\0"                   // 652
    "IBM863\0"                   // 659
    "IBM864\0"                   // 666
    "IBM865\0"                   // 673
    "IBM868\0"                   // 680
    "IBM869\0"                   // 687
    "IBM870\0"                   // 694
    "IBM871\0"                   // 701
    "IBM918\0"                   // 708
    "IBM1026\0"                  // 715
    "KOI8-R\0"                   // 723
    "HZ-GB-2312\0"               // 730
    "IBM866\0"                   // 741
    "IBM775\0"                   // 748
    "KOI8-U\0"                   // 755
    "IBM00858\0"                 // 762
    "IBM01140\0"                 // 771
    "IBM01141\0"                 // 780
    "IBM01142\0"                 // 789
    "IBM01143\0"                 // 798
    "IBM01144\0"                 // 807
    "IBM01145\0"                 // 816
    "IBM01146\0"                 // 825
    "IBM01147\0"                 // 834
    "IBM01148\0"                 // 843
    "IBM01149\0"                 // 852
    "Big5-HKSCS\0"               // 861
    "IBM1047\0"                  // 872
    "windows-1250\0"             // 880
    "windows-1251\0"             // 893
    "windows-1252\0"             // 906
    "windows-1253\0"             // 919
    "windows-1254\0"             // 932
    "windows-1255\0"             // 945
    "windows-1256\0"             // 958
    "windows-1257\0"             // 971
    "windows-1258\0"             // 984
    "TIS-620\0";                 // 997
| |
| static QTextCodec *loadQtCodec(const char *name) |
| { |
| if (!strcmp(name, "UTF-8")) |
| return new QUtf8Codec; |
| if (!strcmp(name, "UTF-16")) |
| return new QUtf16Codec; |
| if (!strcmp(name, "ISO-8859-1")) |
| return new QLatin1Codec; |
| if (!strcmp(name, "UTF-16BE")) |
| return new QUtf16BECodec; |
| if (!strcmp(name, "UTF-16LE")) |
| return new QUtf16LECodec; |
| if (!strcmp(name, "UTF-32")) |
| return new QUtf32Codec; |
| if (!strcmp(name, "UTF-32BE")) |
| return new QUtf32BECodec; |
| if (!strcmp(name, "UTF-32LE")) |
| return new QUtf32LECodec; |
| if (!strcmp(name, "ISO-8859-16") || !strcmp(name, "latin10") || !strcmp(name, "iso-ir-226")) |
| return new QSimpleTextCodec(13 /* == 8859-16*/); |
| #if QT_CONFIG(codecs) |
| if (!strcmp(name, "TSCII")) |
| return new QTsciiCodec; |
| if (!qstrnicmp(name, "iscii", 5)) |
| return QIsciiCodec::create(name); |
| #endif |
| |
| return 0; |
| } |
| |
| /// \threadsafe |
| QList<QByteArray> QIcuCodec::availableCodecs() |
| { |
| QList<QByteArray> codecs; |
| int n = ucnv_countAvailable(); |
| for (int i = 0; i < n; ++i) { |
| const char *name = ucnv_getAvailableName(i); |
| |
| UErrorCode error = U_ZERO_ERROR; |
| const char *standardName = ucnv_getStandardName(name, "MIME", &error); |
| if (U_FAILURE(error) || !standardName) { |
| error = U_ZERO_ERROR; |
| standardName = ucnv_getStandardName(name, "IANA", &error); |
| } |
| if (U_FAILURE(error)) |
| continue; |
| |
| error = U_ZERO_ERROR; |
| int ac = ucnv_countAliases(standardName, &error); |
| if (U_FAILURE(error)) |
| continue; |
| for (int j = 0; j < ac; ++j) { |
| error = U_ZERO_ERROR; |
| const char *alias = ucnv_getAlias(standardName, j, &error); |
| if (!U_SUCCESS(error)) |
| continue; |
| codecs += alias; |
| } |
| } |
| |
| // handled by Qt and not in ICU: |
| codecs += "TSCII"; |
| |
| return codecs; |
| } |
| |
| /// \threadsafe |
| QList<int> QIcuCodec::availableMibs() |
| { |
| QList<int> mibs; |
| mibs.reserve(mibToNameSize + 1); |
| for (int i = 0; i < mibToNameSize; ++i) |
| mibs += mibToName[i].mib; |
| |
| // handled by Qt and not in ICU: |
| mibs += 2107; // TSCII |
| |
| return mibs; |
| } |
| |
| QTextCodec *QIcuCodec::defaultCodecUnlocked() |
| { |
| QCoreGlobalData *globalData = QCoreGlobalData::instance(); |
| if (!globalData) |
| return 0; |
| QTextCodec *c = globalData->codecForLocale.loadAcquire(); |
| if (c) |
| return c; |
| |
| #if defined(QT_LOCALE_IS_UTF8) |
| const char *name = "UTF-8"; |
| #else |
| const char *name = ucnv_getDefaultName(); |
| #endif |
| c = codecForNameUnlocked(name); |
| globalData->codecForLocale.storeRelease(c); |
| return c; |
| } |
| |
| |
| QTextCodec *QIcuCodec::codecForNameUnlocked(const char *name) |
| { |
| // backwards compatibility with Qt 4.x |
| if (!qstrcmp(name, "CP949")) |
| name = "windows-949"; |
| else if (!qstrcmp(name, "Apple Roman")) |
| name = "macintosh"; |
| // these are broken data in ICU 4.4, and can't be resolved even though they are aliases to tis-620 |
| if (!qstrcmp(name, "windows-874-2000") |
| || !qstrcmp(name, "windows-874") |
| || !qstrcmp(name, "MS874") |
| || !qstrcmp(name, "x-windows-874") |
| || !qstrcmp(name, "ISO 8859-11")) |
| name = "TIS-620"; |
| |
| UErrorCode error = U_ZERO_ERROR; |
| // MIME gives better default names |
| const char *standardName = ucnv_getStandardName(name, "MIME", &error); |
| if (U_FAILURE(error) || !standardName) { |
| error = U_ZERO_ERROR; |
| standardName = ucnv_getStandardName(name, "IANA", &error); |
| } |
| bool qt_only = false; |
| if (U_FAILURE(error) || !standardName) { |
| standardName = name; |
| qt_only = true; |
| } else { |
| // correct some issues where the ICU data set contains duplicated entries. |
| // Where this happens it's because one data set is a subset of another. We |
| // always use the larger data set. |
| |
| if (qstrcmp(standardName, "GB2312") == 0 || qstrcmp(standardName, "GB_2312-80") == 0) |
| standardName = "GBK"; |
| else if (qstrcmp(standardName, "KSC_5601") == 0 || qstrcmp(standardName, "EUC-KR") == 0 || qstrcmp(standardName, "cp1363") == 0) |
| standardName = "windows-949"; |
| } |
| |
| QCoreGlobalData *globalData = QCoreGlobalData::instance(); |
| QTextCodecCache *cache = &globalData->codecCache; |
| |
| QTextCodec *codec; |
| if (cache) { |
| codec = cache->value(standardName); |
| if (codec) |
| return codec; |
| } |
| |
| for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) { |
| QTextCodec *cursor = *it; |
| if (qTextCodecNameMatch(cursor->name(), standardName)) { |
| if (cache) |
| cache->insert(standardName, cursor); |
| return cursor; |
| } |
| QList<QByteArray> aliases = cursor->aliases(); |
| for (ByteArrayListConstIt ait = aliases.constBegin(), acend = aliases.constEnd(); ait != acend; ++ait) { |
| if (qTextCodecNameMatch(*ait, standardName)) { |
| if (cache) |
| cache->insert(standardName, cursor); |
| return cursor; |
| } |
| } |
| } |
| |
| QTextCodec *c = loadQtCodec(standardName); |
| if (c) |
| return c; |
| |
| if (qt_only) |
| return 0; |
| |
| // check whether there is really a converter for the name available. |
| UConverter *conv = ucnv_open(standardName, &error); |
| if (!conv) { |
| qDebug("codecForName: ucnv_open failed %s %s", standardName, u_errorName(error)); |
| return 0; |
| } |
| //qDebug() << "QIcuCodec: Standard name for " << name << "is" << standardName; |
| ucnv_close(conv); |
| |
| |
| c = new QIcuCodec(standardName); |
| if (cache) |
| cache->insert(standardName, c); |
| return c; |
| } |
| |
| |
| QTextCodec *QIcuCodec::codecForMibUnlocked(int mib) |
| { |
| for (int i = 0; i < mibToNameSize; ++i) { |
| if (mibToName[i].mib == mib) |
| return codecForNameUnlocked(mibToNameTable + mibToName[i].index); |
| } |
| |
| if (mib == 2107) |
| return codecForNameUnlocked("TSCII"); |
| |
| return 0; |
| } |
| |
| |
| QIcuCodec::QIcuCodec(const char *name) |
| : m_name(name) |
| { |
| } |
| |
| QIcuCodec::~QIcuCodec() |
| { |
| } |
| |
| UConverter *QIcuCodec::getConverter(QTextCodec::ConverterState *state) const |
| { |
| UConverter *conv = 0; |
| if (state) { |
| if (!state->d) { |
| // first time |
| state->flags |= QTextCodec::FreeFunction; |
| QTextCodecUnalignedPointer::encode(state->state_data, qIcuCodecStateFree); |
| UErrorCode error = U_ZERO_ERROR; |
| state->d = ucnv_open(m_name, &error); |
| ucnv_setSubstChars(static_cast<UConverter *>(state->d), |
| state->flags & QTextCodec::ConvertInvalidToNull ? "\0" : "?", 1, &error); |
| if (U_FAILURE(error)) |
| qDebug("getConverter(state) ucnv_open failed %s %s", m_name, u_errorName(error)); |
| } |
| conv = static_cast<UConverter *>(state->d); |
| } |
| if (!conv) { |
| // stateless conversion |
| UErrorCode error = U_ZERO_ERROR; |
| conv = ucnv_open(m_name, &error); |
| ucnv_setSubstChars(conv, "?", 1, &error); |
| if (U_FAILURE(error)) |
| qDebug("getConverter(no state) ucnv_open failed %s %s", m_name, u_errorName(error)); |
| } |
| return conv; |
| } |
| |
| QString QIcuCodec::convertToUnicode(const char *chars, int length, QTextCodec::ConverterState *state) const |
| { |
| UConverter *conv = getConverter(state); |
| |
| QString string(length + 2, Qt::Uninitialized); |
| |
| const char *end = chars + length; |
| int convertedChars = 0; |
| while (1) { |
| UChar *uc = (UChar *)string.data(); |
| UChar *ucEnd = uc + string.length(); |
| uc += convertedChars; |
| UErrorCode error = U_ZERO_ERROR; |
| ucnv_toUnicode(conv, |
| &uc, ucEnd, |
| &chars, end, |
| 0, false, &error); |
| if (!U_SUCCESS(error) && error != U_BUFFER_OVERFLOW_ERROR) { |
| qDebug("convertToUnicode failed: %s", u_errorName(error)); |
| break; |
| } |
| |
| convertedChars = uc - (UChar *)string.data(); |
| if (chars >= end) |
| break; |
| string.resize(string.length()*2); |
| } |
| string.resize(convertedChars); |
| |
| if (!state) |
| ucnv_close(conv); |
| return string; |
| } |
| |
| |
| QByteArray QIcuCodec::convertFromUnicode(const QChar *unicode, int length, QTextCodec::ConverterState *state) const |
| { |
| UConverter *conv = getConverter(state); |
| |
| int requiredLength = UCNV_GET_MAX_BYTES_FOR_STRING(length, ucnv_getMaxCharSize(conv)); |
| QByteArray string(requiredLength, Qt::Uninitialized); |
| |
| const UChar *uc = (const UChar *)unicode; |
| const UChar *end = uc + length; |
| int convertedChars = 0; |
| while (1) { |
| char *ch = (char *)string.data(); |
| char *chEnd = ch + string.length(); |
| ch += convertedChars; |
| UErrorCode error = U_ZERO_ERROR; |
| ucnv_fromUnicode(conv, |
| &ch, chEnd, |
| &uc, end, |
| 0, false, &error); |
| if (!U_SUCCESS(error)) |
| qDebug("convertFromUnicode failed: %s", u_errorName(error)); |
| convertedChars = ch - string.data(); |
| if (uc >= end) |
| break; |
| string.resize(string.length()*2); |
| } |
| string.resize(convertedChars); |
| |
| if (!state) |
| ucnv_close(conv); |
| |
| return string; |
| } |
| |
| |
| QByteArray QIcuCodec::name() const |
| { |
| return m_name; |
| } |
| |
| |
| QList<QByteArray> QIcuCodec::aliases() const |
| { |
| UErrorCode error = U_ZERO_ERROR; |
| |
| int n = ucnv_countAliases(m_name, &error); |
| |
| QList<QByteArray> aliases; |
| for (int i = 0; i < n; ++i) { |
| const char *a = ucnv_getAlias(m_name, i, &error); |
| // skip the canonical name |
| if (!a || !qstrcmp(a, m_name)) |
| continue; |
| aliases += a; |
| } |
| |
| return aliases; |
| } |
| |
| |
| int QIcuCodec::mibEnum() const |
| { |
| for (int i = 0; i < mibToNameSize; ++i) { |
| if (qTextCodecNameMatch(m_name, (mibToNameTable + mibToName[i].index))) |
| return mibToName[i].mib; |
| } |
| |
| return 0; |
| } |
| |
| QT_END_NAMESPACE |