// blob: 7a59bc598689c7cd6eeea91e4740013570c5035c [file] [log] [blame]
/****************************************************************************
**
** Copyright (C) 2019 The Qt Company Ltd.
** Contact: https://www.qt.io/licensing/
**
** This file is part of the tools applications of the Qt Toolkit.
**
** $QT_BEGIN_LICENSE:GPL-EXCEPT$
** Commercial License Usage
** Licensees holding valid commercial Qt licenses may use this file in
** accordance with the commercial license agreement provided with the
** Software or, alternatively, in accordance with the terms contained in
** a written agreement between you and The Qt Company. For licensing terms
** and conditions see https://www.qt.io/terms-conditions. For further
** information use the contact form at https://www.qt.io/contact-us.
**
** GNU General Public License Usage
** Alternatively, this file may be used under the terms of the GNU
** General Public License version 3 as published by the Free Software
** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
** included in the packaging of this file. Please review the following
** information to ensure the GNU General Public License requirements will
** be met: https://www.gnu.org/licenses/gpl-3.0.html.
**
** $QT_END_LICENSE$
**
****************************************************************************/
#include "tokenizer.h"
#include "config.h"
#include "generator.h"
#include <QtCore/qfile.h>
#include <QtCore/qhash.h>
#include <QtCore/qregexp.h>
#include <QtCore/qstring.h>
#include <QtCore/qtextcodec.h>
#include <ctype.h>
#include <string.h>
QT_BEGIN_NAMESPACE
#define LANGUAGE_CPP "Cpp"
/* qmake ignore Q_OBJECT */
/*
Keep in sync with tokenizer.h.

The table is used by position: initialize() inserts kwords[i] into the
keyword hash table under the number i + 1, and getToken() turns a hit on
number i back into the token value Tok_FirstKeyword + i - 1. The order of
the entries must therefore match the order of the keyword tokens, and the
table must have at least Tok_LastKeyword - Tok_FirstKeyword + 1 entries.
*/
static const char *kwords[] = { "char",
"class",
"const",
"double",
"enum",
"explicit",
"friend",
"inline",
"int",
"long",
"namespace",
"operator",
"private",
"protected",
"public",
"short",
"signals",
"signed",
"slots",
"static",
"struct",
"template",
"typedef",
"typename",
"union",
"unsigned",
"using",
"virtual",
"void",
"volatile",
"__int64",
"default",
"delete",
"final",
"override",
"Q_OBJECT",
"Q_OVERRIDE",
"Q_PROPERTY",
"Q_PRIVATE_PROPERTY",
"Q_DECLARE_SEQUENTIAL_ITERATOR",
"Q_DECLARE_MUTABLE_SEQUENTIAL_ITERATOR",
"Q_DECLARE_ASSOCIATIVE_ITERATOR",
"Q_DECLARE_MUTABLE_ASSOCIATIVE_ITERATOR",
"Q_DECLARE_FLAGS",
"Q_SIGNALS",
"Q_SLOTS",
"QT_COMPAT",
"QT_COMPAT_CONSTRUCTOR",
"QT_DEPRECATED",
"QT_MOC_COMPAT",
"QT_MODULE",
"QT3_SUPPORT",
"QT3_SUPPORT_CONSTRUCTOR",
"QT3_MOC_SUPPORT",
"QDOC_PROPERTY",
"QPrivateSignal" };
// Number of slots in kwordHashTable; hashKword() reduces its result modulo this value.
static const int KwordHashTableSize = 4096;
// Open-addressing hash table filled by insertKwordIntoHash(). Slot values:
//   0  - empty slot,
//   -1 - an ignored token or directive (see ignoredTokensAndDirectives),
//   n  - (n > 0) the keyword kwords[n - 1].
static int kwordHashTable[KwordHashTableSize];
// Configured tokens to ignore; the value is true for entries that came from
// the "ignore directives" list and false for plain ignored tokens.
static QHash<QByteArray, bool> *ignoredTokensAndDirectives = nullptr;
// Matches comments; removed from preprocessor conditions before evaluation.
static QRegExp *comment = nullptr;
// Matches the text following #define for the configured version symbol;
// capture 1 is the quoted version string.
static QRegExp *versionX = nullptr;
// Matches "defined(X)" style terms; capture 1 is the symbol name.
static QRegExp *definedX = nullptr;
// Alternation of the configured defines plus "qdoc".
static QRegExp *defines = nullptr;
// Alternation of the configured falsehoods (conditions treated as false).
static QRegExp *falsehoods = nullptr;
#ifndef QT_NO_TEXTCODEC
// Codec used by lexeme() and previousLexeme() to decode the lexeme buffers.
static QTextCodec *sourceCodec = nullptr;
#endif
/*
This function is a perfect hash function for the 37 keywords of C99
(with a hash table size of 512). It should perform well on our
Qt-enhanced C++ subset.
*/
static int hashKword(const char *s, int len)
{
return (((uchar)s[0]) + (((uchar)s[2]) << 5) + (((uchar)s[len - 1]) << 3)) % KwordHashTableSize;
}
/*
  Records \a number for the NUL-terminated string \a s in kwordHashTable.

  The search starts at the slot computed by hashKword() and advances one
  slot at a time, wrapping around at the end of the table, until an empty
  (zero) slot is found.
*/
static void insertKwordIntoHash(const char *s, int number)
{
    int slot = hashKword(s, int(strlen(s)));
    while (kwordHashTable[slot] != 0)
        slot = (slot + 1) % KwordHashTableSize;
    kwordHashTable[slot] = number;
}
/*
  Constructs a tokenizer over everything that can be read from \a in and
  starts tokenizing at location \a loc.
*/
Tokenizer::Tokenizer(const Location &loc, QFile &in) : yyIn(in.readAll())
{
    yyPos = 0;
    init();
    start(loc);
}
/*
  Constructs a tokenizer over the bytes in \a in and starts tokenizing at
  location \a loc.
*/
Tokenizer::Tokenizer(const Location &loc, const QByteArray &in) : yyIn(in)
{
    yyPos = 0;
    init();
    start(loc);
}
/*
  Releases the two lexeme buffers allocated by init().
*/
Tokenizer::~Tokenizer()
{
    delete[] yyLexBuf2;
    delete[] yyLexBuf1;
}
int Tokenizer::getToken()
{
char *t = yyPrevLex;
yyPrevLex = yyLex;
yyLex = t;
while (yyCh != EOF) {
yyTokLoc = yyCurLoc;
yyLexLen = 0;
if (isspace(yyCh)) {
do {
yyCh = getChar();
} while (isspace(yyCh));
} else if (isalpha(yyCh) || yyCh == '_') {
do {
yyCh = getChar();
} while (isalnum(yyCh) || yyCh == '_');
int k = hashKword(yyLex, int(yyLexLen));
for (;;) {
int i = kwordHashTable[k];
if (i == 0) {
return Tok_Ident;
} else if (i == -1) {
if (!parsingMacro && ignoredTokensAndDirectives->contains(yyLex)) {
if (ignoredTokensAndDirectives->value(yyLex)) { // it's a directive
int parenDepth = 0;
while (yyCh != EOF && (yyCh != ')' || parenDepth > 1)) {
if (yyCh == '(')
++parenDepth;
else if (yyCh == ')')
--parenDepth;
yyCh = getChar();
}
if (yyCh == ')')
yyCh = getChar();
}
break;
}
} else if (strcmp(yyLex, kwords[i - 1]) == 0) {
int ret = (int)Tok_FirstKeyword + i - 1;
if (ret != Tok_typename)
return ret;
break;
}
if (++k == KwordHashTableSize)
k = 0;
}
} else if (isdigit(yyCh)) {
do {
yyCh = getChar();
} while (isalnum(yyCh) || yyCh == '.' || yyCh == '+' || yyCh == '-');
return Tok_Number;
} else {
switch (yyCh) {
case '!':
case '%':
yyCh = getChar();
if (yyCh == '=')
yyCh = getChar();
return Tok_SomeOperator;
case '"':
yyCh = getChar();
while (yyCh != EOF && yyCh != '"') {
if (yyCh == '\\')
yyCh = getChar();
yyCh = getChar();
}
yyCh = getChar();
if (yyCh == EOF)
yyTokLoc.warning(tr("Unterminated C++ string literal"),
tr("Maybe you forgot '/*!' at the beginning of the file?"));
else
return Tok_String;
break;
case '#':
return getTokenAfterPreprocessor();
case '&':
yyCh = getChar();
/*
Removed check for '&&', only interpret '&=' as an operator.
'&&' is also used for an rvalue reference. QTBUG-32675
*/
if (yyCh == '=') {
yyCh = getChar();
return Tok_SomeOperator;
} else {
return Tok_Ampersand;
}
case '\'':
yyCh = getChar();
/*
Allow empty character literal. QTBUG-25775
*/
if (yyCh == '\'') {
yyCh = getChar();
break;
}
if (yyCh == '\\')
yyCh = getChar();
do {
yyCh = getChar();
} while (yyCh != EOF && yyCh != '\'');
if (yyCh == EOF) {
yyTokLoc.warning(tr("Unterminated C++ character literal"));
} else {
yyCh = getChar();
return Tok_Number;
}
break;
case '(':
yyCh = getChar();
if (yyNumPreprocessorSkipping == 0)
yyParenDepth++;
if (isspace(yyCh)) {
do {
yyCh = getChar();
} while (isspace(yyCh));
yyLexLen = 1;
yyLex[1] = '\0';
}
if (yyCh == '*') {
yyCh = getChar();
return Tok_LeftParenAster;
}
return Tok_LeftParen;
case ')':
yyCh = getChar();
if (yyNumPreprocessorSkipping == 0)
yyParenDepth--;
return Tok_RightParen;
case '*':
yyCh = getChar();
if (yyCh == '=') {
yyCh = getChar();
return Tok_SomeOperator;
} else {
return Tok_Aster;
}
case '^':
yyCh = getChar();
if (yyCh == '=') {
yyCh = getChar();
return Tok_SomeOperator;
} else {
return Tok_Caret;
}
case '+':
yyCh = getChar();
if (yyCh == '+' || yyCh == '=')
yyCh = getChar();
return Tok_SomeOperator;
case ',':
yyCh = getChar();
return Tok_Comma;
case '-':
yyCh = getChar();
if (yyCh == '-' || yyCh == '=') {
yyCh = getChar();
} else if (yyCh == '>') {
yyCh = getChar();
if (yyCh == '*')
yyCh = getChar();
}
return Tok_SomeOperator;
case '.':
yyCh = getChar();
if (yyCh == '*') {
yyCh = getChar();
} else if (yyCh == '.') {
do {
yyCh = getChar();
} while (yyCh == '.');
return Tok_Ellipsis;
} else if (isdigit(yyCh)) {
do {
yyCh = getChar();
} while (isalnum(yyCh) || yyCh == '.' || yyCh == '+' || yyCh == '-');
return Tok_Number;
}
return Tok_SomeOperator;
case '/':
yyCh = getChar();
if (yyCh == '/') {
do {
yyCh = getChar();
} while (yyCh != EOF && yyCh != '\n');
} else if (yyCh == '*') {
bool metDoc = false; // empty doc is no doc
bool metSlashAsterBang = false;
bool metAster = false;
bool metAsterSlash = false;
yyCh = getChar();
if (yyCh == '!')
metSlashAsterBang = true;
while (!metAsterSlash) {
if (yyCh == EOF) {
yyTokLoc.warning(tr("Unterminated C++ comment"));
break;
} else {
if (yyCh == '*') {
metAster = true;
} else if (metAster && yyCh == '/') {
metAsterSlash = true;
} else {
metAster = false;
if (isgraph(yyCh))
metDoc = true;
}
}
yyCh = getChar();
}
if (metSlashAsterBang && metDoc)
return Tok_Doc;
else if (yyParenDepth > 0)
return Tok_Comment;
} else {
if (yyCh == '=')
yyCh = getChar();
return Tok_SomeOperator;
}
break;
case ':':
yyCh = getChar();
if (yyCh == ':') {
yyCh = getChar();
return Tok_Gulbrandsen;
} else {
return Tok_Colon;
}
case ';':
yyCh = getChar();
return Tok_Semicolon;
case '<':
yyCh = getChar();
if (yyCh == '<') {
yyCh = getChar();
if (yyCh == '=')
yyCh = getChar();
return Tok_SomeOperator;
} else if (yyCh == '=') {
yyCh = getChar();
return Tok_SomeOperator;
} else {
return Tok_LeftAngle;
}
case '=':
yyCh = getChar();
if (yyCh == '=') {
yyCh = getChar();
return Tok_SomeOperator;
} else {
return Tok_Equal;
}
case '>':
yyCh = getChar();
if (yyCh == '>') {
yyCh = getChar();
if (yyCh == '=')
yyCh = getChar();
return Tok_SomeOperator;
} else if (yyCh == '=') {
yyCh = getChar();
return Tok_SomeOperator;
} else {
return Tok_RightAngle;
}
case '?':
yyCh = getChar();
return Tok_SomeOperator;
case '[':
yyCh = getChar();
if (yyNumPreprocessorSkipping == 0)
yyBracketDepth++;
return Tok_LeftBracket;
case '\\':
yyCh = getChar();
yyCh = getChar(); // skip one character
break;
case ']':
yyCh = getChar();
if (yyNumPreprocessorSkipping == 0)
yyBracketDepth--;
return Tok_RightBracket;
case '{':
yyCh = getChar();
if (yyNumPreprocessorSkipping == 0)
yyBraceDepth++;
return Tok_LeftBrace;
case '}':
yyCh = getChar();
if (yyNumPreprocessorSkipping == 0)
yyBraceDepth--;
return Tok_RightBrace;
case '|':
yyCh = getChar();
if (yyCh == '|' || yyCh == '=')
yyCh = getChar();
return Tok_SomeOperator;
case '~':
yyCh = getChar();
return Tok_Tilde;
case '@':
yyCh = getChar();
return Tok_At;
default:
// ### We should really prevent qdoc from looking at snippet files rather than
// ### suppress warnings when reading them.
if (yyNumPreprocessorSkipping == 0
&& !(yyTokLoc.fileName().endsWith(".qdoc")
|| yyTokLoc.fileName().endsWith(".js"))) {
yyTokLoc.warning(
tr("Hostile character 0x%1 in C++ source").arg((uchar)yyCh, 1, 16));
}
yyCh = getChar();
}
}
}
if (yyPreprocessorSkipping.count() > 1) {
yyTokLoc.warning(tr("Expected #endif before end of file"));
// clear it out or we get an infinite loop!
while (!yyPreprocessorSkipping.isEmpty()) {
popSkipping();
}
}
strcpy(yyLex, "end-of-input");
yyLexLen = strlen(yyLex);
return Tok_Eoi;
}
/*
  Sets up the file-level state shared by all tokenizers from the
  configuration: the source codec, the regular expressions used to
  evaluate preprocessor directives, and the keyword hash table, which also
  receives the configured ignored tokens and directives.

  The objects allocated here are freed by terminate().
*/
void Tokenizer::initialize()
{
    Config &config = Config::instance();
    QString versionSym = config.getString(CONFIG_VERSIONSYM);

    QString sourceEncoding = config.getString(CONFIG_SOURCEENCODING);
    if (sourceEncoding.isEmpty())
        sourceEncoding = QLatin1String("ISO-8859-1");
#ifndef QT_NO_TEXTCODEC
    sourceCodec = QTextCodec::codecForName(sourceEncoding.toLocal8Bit());
    // codecForName() returns nullptr for an encoding name it does not know.
    // lexeme() and previousLexeme() dereference sourceCodec unconditionally,
    // so fall back to the same default that is used for an empty setting.
    if (sourceCodec == nullptr)
        sourceCodec = QTextCodec::codecForName("ISO-8859-1");
#endif

    comment = new QRegExp("/(?:\\*.*\\*/|/.*\n|/[^\n]*$)");
    comment->setMinimal(true);

    // Without a version symbol, the pattern is one that can never match.
    versionX = new QRegExp("$cannot possibly match^");
    if (!versionSym.isEmpty())
        versionX->setPattern("[ \t]*(?:" + QRegExp::escape(versionSym)
                             + ")[ \t]+\"([^\"]*)\"[ \t]*");

    definedX = new QRegExp("defined ?\\(?([A-Z_0-9a-z]+) ?\\)?");

    // "qdoc" is always treated as defined, in addition to the configured
    // defines.
    QStringList d = config.getStringList(CONFIG_DEFINES);
    d += "qdoc";
    defines = new QRegExp(d.join('|'));
    falsehoods = new QRegExp(config.getStringList(CONFIG_FALSEHOODS).join('|'));

    /*
      The keyword hash table is always cleared before any words are inserted.
     */
    memset(kwordHashTable, 0, sizeof(kwordHashTable));
    // Keywords are stored under their 1-based index, since 0 marks an empty
    // slot.
    for (int i = 0; i < Tok_LastKeyword - Tok_FirstKeyword + 1; i++)
        insertKwordIntoHash(kwords[i], i + 1);

    // Ignored tokens and directives share the marker -1 in the hash table;
    // the QHash value tells them apart (false: token, true: directive).
    ignoredTokensAndDirectives = new QHash<QByteArray, bool>;

    const QStringList tokens =
            config.getStringList(LANGUAGE_CPP + Config::dot + CONFIG_IGNORETOKENS);
    for (const auto &token : tokens) {
        const QByteArray tb = token.toLatin1();
        ignoredTokensAndDirectives->insert(tb, false);
        insertKwordIntoHash(tb.data(), -1);
    }

    const QStringList directives =
            config.getStringList(LANGUAGE_CPP + Config::dot + CONFIG_IGNOREDIRECTIVES);
    for (const auto &directive : directives) {
        const QByteArray db = directive.toLatin1();
        ignoredTokensAndDirectives->insert(db, true);
        insertKwordIntoHash(db.data(), -1);
    }
}
/*
  Deletes the object \a ptr points to and resets \a ptr to nullptr.
*/
template<typename T>
static void deleteAndReset(T *&ptr)
{
    delete ptr;
    ptr = nullptr;
}

/*!
  Frees the heap allocated file-level objects and resets their pointers
  to nullptr. The keyword hash table is left as it is; initialize()
  clears it before inserting any keywords.
 */
void Tokenizer::terminate()
{
    deleteAndReset(comment);
    deleteAndReset(versionX);
    deleteAndReset(definedX);
    deleteAndReset(defines);
    deleteAndReset(falsehoods);
    deleteAndReset(ignoredTokensAndDirectives);
}
/*
  Allocates the two lexeme buffers and puts the tokenizer into its initial
  state: empty lexemes, all nesting depths at zero, and a skipping stack
  holding a single "not skipping" entry.
*/
void Tokenizer::init()
{
    const int bufferSize = int(yyLexBufSize);
    yyLexBuf1 = new char[bufferSize];
    yyLexBuf2 = new char[bufferSize];

    // Both lexemes start out as empty strings.
    yyPrevLex = yyLexBuf1;
    yyLex = yyLexBuf2;
    yyPrevLex[0] = '\0';
    yyLex[0] = '\0';
    yyLexLen = 0;

    yyBraceDepth = 0;
    yyParenDepth = 0;
    yyBracketDepth = 0;

    // The bottom entry of the stack stands for "outside any #if block".
    yyNumPreprocessorSkipping = 0;
    yyPreprocessorSkipping.push(false);

    yyCh = '\0';
    parsingMacro = false;
}
/*
  Positions the tokenizer at location \a loc, resets the nesting depths,
  sets both lexemes to "beginning-of-input" and reads the first character
  of the input into yyCh.
*/
void Tokenizer::start(const Location &loc)
{
    static const char beginningOfInput[] = "beginning-of-input";

    yyBraceDepth = 0;
    yyParenDepth = 0;
    yyBracketDepth = 0;

    yyTokLoc = loc;
    yyCurLoc = loc;
    yyCurLoc.start();

    strcpy(yyPrevLex, beginningOfInput);
    strcpy(yyLex, beginningOfInput);
    yyLexLen = strlen(yyLex);

    // yyCh is reset before the first getChar() call, as in init().
    yyCh = '\0';
    yyCh = getChar();
}
/*
Returns the next token, if # was met. This function interprets the
preprocessor directive, skips over any #ifdef'd out tokens, and returns the
token after all of that.
*/
int Tokenizer::getTokenAfterPreprocessor()
{
yyCh = getChar();
while (isspace(yyCh) && yyCh != '\n')
yyCh = getChar();
/*
#directive condition
*/
QString directive;
QString condition;
while (isalpha(yyCh)) {
directive += QChar(yyCh);
yyCh = getChar();
}
if (!directive.isEmpty()) {
while (yyCh != EOF && yyCh != '\n') {
if (yyCh == '\\') {
yyCh = getChar();
if (yyCh == '\r')
yyCh = getChar();
}
condition += yyCh;
yyCh = getChar();
}
condition.remove(*comment);
condition = condition.simplified();
/*
The #if, #ifdef, #ifndef, #elif, #else, and #endif
directives have an effect on the skipping stack. For
instance, if the code processed so far is
#if 1
#if 0
#if 1
// ...
#else
the skipping stack contains, from bottom to top, false true
true (assuming 0 is false and 1 is true). If at least one
entry of the stack is true, the tokens are skipped.
This mechanism is simple yet hard to understand.
*/
if (directive[0] == QChar('i')) {
if (directive == QString("if"))
pushSkipping(!isTrue(condition));
else if (directive == QString("ifdef"))
pushSkipping(!defines->exactMatch(condition));
else if (directive == QString("ifndef"))
pushSkipping(defines->exactMatch(condition));
} else if (directive[0] == QChar('e')) {
if (directive == QString("elif")) {
bool old = popSkipping();
if (old)
pushSkipping(!isTrue(condition));
else
pushSkipping(true);
} else if (directive == QString("else")) {
pushSkipping(!popSkipping());
} else if (directive == QString("endif")) {
popSkipping();
}
} else if (directive == QString("define")) {
if (versionX->exactMatch(condition))
yyVersion = versionX->cap(1);
}
}
int tok;
do {
/*
We set yyLex now, and after getToken() this will be
yyPrevLex. This way, we skip over the preprocessor
directive.
*/
qstrcpy(yyLex, yyPrevLex);
/*
If getToken() meets another #, it will call
getTokenAfterPreprocessor() once again, which could in turn
call getToken() again, etc. Unless there are 10,000 or so
preprocessor directives in a row, this shouldn't overflow
the stack.
*/
tok = getToken();
} while (yyNumPreprocessorSkipping > 0 && tok != Tok_Eoi);
return tok;
}
/*
  Enters a new #if block: pushes \a skip onto the skipping stack and, if
  it is true, counts one more block that is being skipped.
*/
void Tokenizer::pushSkipping(bool skip)
{
    if (skip)
        ++yyNumPreprocessorSkipping;
    yyPreprocessorSkipping.push(skip);
}
/*
  Leaves the innermost #if block: pops the top of the skipping stack,
  updates the count of skipped blocks and returns the popped value.

  If the stack is already empty, a warning is issued and true is returned.
*/
bool Tokenizer::popSkipping()
{
    if (!yyPreprocessorSkipping.isEmpty()) {
        const bool wasSkipping = yyPreprocessorSkipping.pop();
        if (wasSkipping)
            --yyNumPreprocessorSkipping;
        return wasSkipping;
    }

    yyTokLoc.warning(tr("Unexpected #elif, #else or #endif"));
    return true;
}
/*
Returns \c true if the condition evaluates as true, otherwise false. The
condition is represented by a string. Unsophisticated parsing techniques are
used. The preprocessing method could be named StriNg-Oriented PreProcessing,
as SNOBOL stands for StriNg-Oriented symBOlic Language.
*/
bool Tokenizer::isTrue(const QString &condition)
{
int firstOr = -1;
int firstAnd = -1;
int parenDepth = 0;
/*
Find the first logical operator at top level, but be careful
about precedence. Examples:
X || Y // the or
X || Y || Z // the leftmost or
X || Y && Z // the or
X && Y || Z // the or
(X || Y) && Z // the and
*/
for (int i = 0; i < condition.length() - 1; i++) {
QChar ch = condition[i];
if (ch == QChar('(')) {
parenDepth++;
} else if (ch == QChar(')')) {
parenDepth--;
} else if (parenDepth == 0) {
if (condition[i + 1] == ch) {
if (ch == QChar('|')) {
firstOr = i;
break;
} else if (ch == QChar('&')) {
if (firstAnd == -1)
firstAnd = i;
}
}
}
}
if (firstOr != -1)
return isTrue(condition.left(firstOr)) || isTrue(condition.mid(firstOr + 2));
if (firstAnd != -1)
return isTrue(condition.left(firstAnd)) && isTrue(condition.mid(firstAnd + 2));
QString t = condition.simplified();
if (t.isEmpty())
return true;
if (t[0] == QChar('!'))
return !isTrue(t.mid(1));
if (t[0] == QChar('(') && t.endsWith(QChar(')')))
return isTrue(t.mid(1, t.length() - 2));
if (definedX->exactMatch(t))
return defines->exactMatch(definedX->cap(1));
else
return !falsehoods->exactMatch(t);
}
/*
  Returns the text of the current lexeme, decoded with the source codec,
  or as UTF-8 if Qt was built without text codec support.
*/
QString Tokenizer::lexeme() const
{
#ifdef QT_NO_TEXTCODEC
    return QString::fromUtf8(yyLex);
#else
    return sourceCodec->toUnicode(yyLex);
#endif
}
/*
  Returns the text of the previous lexeme, decoded with the source codec,
  or as UTF-8 if Qt was built without text codec support.
*/
QString Tokenizer::previousLexeme() const
{
#ifdef QT_NO_TEXTCODEC
    return QString::fromUtf8(yyPrevLex);
#else
    return sourceCodec->toUnicode(yyPrevLex);
#endif
}
QT_END_NAMESPACE