blob: cfaf59e2588cd86619ead7a432424ff5913b001a [file] [log] [blame]
/****************************************************************************
** Copyright (C) 2017 Ford Motor Company.
** All rights reserved.
**
** Copyright (C) 2017 The Qt Company Ltd.
** Contact: https://www.qt.io/licensing/
**
** This file is part of the QtRemoteObjects module of the Qt Toolkit.
**
** $QT_BEGIN_LICENSE:LGPL$
** Commercial License Usage
** Licensees holding valid commercial Qt licenses may use this file in
** accordance with the commercial license agreement provided with the
** Software or, alternatively, in accordance with the terms contained in
** a written agreement between you and The Qt Company. For licensing terms
** and conditions see https://www.qt.io/terms-conditions. For further
** information use the contact form at https://www.qt.io/contact-us.
**
** GNU Lesser General Public License Usage
** Alternatively, this file may be used under the terms of the GNU Lesser
** General Public License version 3 as published by the Free Software
** Foundation and appearing in the file LICENSE.LGPL3 included in the
** packaging of this file. Please review the following information to
** ensure the GNU Lesser General Public License version 3 requirements
** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
**
** GNU General Public License Usage
** Alternatively, this file may be used under the terms of the GNU
** General Public License version 2.0 or (at your option) the GNU General
** Public license version 3 or any later version approved by the KDE Free
** Qt Foundation. The licenses are as published by the Free Software
** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
** included in the packaging of this file. Please review the following
** information to ensure the GNU General Public License requirements will
** be met: https://www.gnu.org/licenses/gpl-2.0.html and
** https://www.gnu.org/licenses/gpl-3.0.html.
**
** $QT_END_LICENSE$
**
****************************************************************************/
#ifndef QREGEXPARSER_H
#define QREGEXPARSER_H
#include <QtCore/qshareddata.h>
#include <QtCore/qvarlengtharray.h>
#include <QtCore/qvariant.h>
#ifdef QT_BOOTSTRAPPED
# include <QtCore/qregexp.h>
# define REGEX QRegExp
#else
# include <QtCore/qregularexpression.h>
# define REGEX QRegularExpression
#endif
#include <QtCore/qmap.h>
#include <QtCore/qfile.h>
#include <QtCore/qtextstream.h>
#include <QtCore/qdebug.h>
//Record of one regex that matched at the current scan position. nextToken()
//collects these only in debug mode, to print every candidate and mark the winner.
struct MatchCandidate {
    MatchCandidate(const QString &tokenName, const QString &text, int regexIndex)
        : name(tokenName)
        , matchText(text)
        , index(regexIndex)
    {
    }

    QString name;      //token name the regex was declared with
    QString matchText; //text the regex matched
    int index;         //index of the regex in the parser's regex list
};
QT_BEGIN_NAMESPACE
//Table-driven parser whose tokens are recognised with regular expressions.
//
//_Table supplies the generated parser tables and is inherited protected.
//_Parser is the concrete parser type: consumeRule() casts this object to
//_Parser* and forwards to its consumeRule(int), so _Parser is expected to
//derive from this class and provide that member.
template <typename _Parser, typename _Table>
class QRegexParser: protected _Table
{
public:
    //maxMatchLen limits how many buffer characters are handed to each regex
    //per match attempt in the QRegularExpression (non-bootstrapped) code path.
    QRegexParser(int maxMatchLen=4096);
    virtual ~QRegexParser();

    //Parses the current buffer; returns true once the accept state is reached.
    virtual bool parse();

    //Hook called by parse() before parsing starts. Does nothing by default.
    virtual void reset() {}

    //Parse stack slot addressed relative to the current top of stack.
    inline QVariant &sym(int index);

    void setBuffer(const QString &buffer);
    void setBufferFromDevice(QIODevice *device);

    //Turns on debug output and dumps the grammar rules via qDebug.
    void setDebug();

    QString errorString() const
    {
        return m_errorString;
    }

    //Stores the message and also reports it through qWarning().
    void setErrorString(const QString &error)
    {
        m_errorString = error;
        qWarning() << m_errorString;
    }

    //Named captures of the most recent token whose pattern had named groups.
    const QMap<QString, QString>& captured() const
    {
        return m_captured;
    }

    bool isDebug() const
    {
        return m_debug;
    }

    int lineNumber() const
    {
        return m_lineno;
    }

private:
    int nextToken();

    //Static dispatch to the derived parser's rule handler.
    bool consumeRule(int rule)
    {
        _Parser *self = static_cast<_Parser*> (this);
        return self->consumeRule(rule);
    }

    enum { DefaultStackSize = 128 };

    //State and value stacks of the parser, shared through QSharedDataPointer.
    struct Data: public QSharedData
    {
        Data()
            : stackSize (DefaultStackSize)
            , tos (0)
        {
        }

        QVarLengthArray<int, DefaultStackSize> stateStack;
        QVarLengthArray<QVariant, DefaultStackSize> parseStack;
        int stackSize;
        int tos;

        //Doubles stackSize and resizes both stacks to the new size.
        void reallocateStack()
        {
            stackSize <<= 1;
            stateStack.resize(stackSize);
            parseStack.resize(stackSize);
        }
    };

    //Makes newlines and tabs visible in debug output. Works on a copy.
    QString escapeString(QString s)
    {
        s.replace(QLatin1Char('\n'), QLatin1String("\\n"));
        s.replace(QLatin1Char('\t'), QLatin1String("\\t"));
        return s;
    }

    QSharedDataPointer<Data> d;
    QList<REGEX> m_regexes;
#ifndef QT_BOOTSTRAPPED
    //Cache: first character -> indices of the regexes that can start with it
    QMap<QChar, QList<int> > regexCandidates;
#endif
    QList<int> m_tokens;
    QString m_buffer;
    QString m_lastMatchText;
    int m_loc;
    int m_lastNewlinePosition;
    int m_lineno;
    int m_debug;
    QStringList m_tokenNames;
    QMap<QString, QString> m_captured;
    int m_maxMatchLen;
    QString m_errorString;
    QVector<QMap<int, QString> > m_names; //storage for match names
};
//Returns the parse stack entry at offset n from the top of stack, where
//n == 1 addresses the slot at the current top-of-stack index.
template <typename _Parser, typename _Table>
inline QVariant &QRegexParser<_Parser, _Table>::sym(int n)
{
    const int slot = d->tos + n - 1;
    return d->parseStack[slot];
}
//Nothing to release by hand; all members clean up after themselves.
template <typename _Parser, typename _Table>
QRegexParser<_Parser, _Table>::~QRegexParser() = default;
//Runs the shift/reduce loop described by _Table over the tokens produced by
//nextToken().
//
//Returns true when the accept state is on top of the state stack. Returns
//false when the derived parser's consumeRule() rejects a reduction, or when
//the tables yield no action for the current state/token; in the latter case
//the error string is set to "Unknown token encountered".
template <typename _Parser, typename _Table>
bool QRegexParser<_Parser, _Table>::parse()
{
    m_errorString.clear();
    reset();

    const int INITIAL_STATE = 0;
    d->tos = 0;
    //Size the stacks only once. Reallocating unconditionally doubled the
    //stack size on every parse() call, so a parser that was reused grew its
    //memory exponentially. Growth during parsing is handled in the loop below.
    if (d->stateStack.isEmpty())
        d->reallocateStack();

    int act = d->stateStack[++d->tos] = INITIAL_STATE;
    int token = -1;
    Q_FOREVER {
        //Fetch a token only when none is pending and the state needs one
        if (token == -1 && - _Table::TERMINAL_COUNT != _Table::action_index[act])
            token = nextToken();

        act = _Table::t_action(act, token);

        if (d->stateStack[d->tos] == _Table::ACCEPT_STATE)
            return true;

        if (act > 0) {
            //Shift: push the new state and consume the token
            if (++d->tos == d->stackSize)
                d->reallocateStack();
            d->parseStack[d->tos] = d->parseStack[d->tos - 1];
            d->stateStack[d->tos] = act;
            token = -1;
        } else if (act < 0) {
            //Reduce by rule r: pop its right-hand side, let the derived
            //parser handle the rule, then follow the goto for its left-hand side
            int r = - act - 1;
            d->tos -= _Table::rhs[r];
            act = d->stateStack[d->tos++];
            if (!consumeRule(r))
                return false;
            act = d->stateStack[d->tos] = _Table::nt_action(act, _Table::lhs[r] - _Table::TERMINAL_COUNT);
        } else {
            //No action available for this state/token
            break;
        }
    }

    setErrorString(QStringLiteral("Unknown token encountered"));
    return false;
}
//Builds the token regexes from the terminal spellings in _Table.
//
//Each spelling for symbols 1 .. lhs[0]-1 must look like "[name]pattern" or
//"[name, M]pattern". For every spelling whose pattern compiles, one entry is
//appended to each of m_tokenNames, m_regexes, m_names and m_tokens, so the
//four lists always share the same indices. Tokens whose name starts with
//"ignore" are stored as -1 and are skipped by nextToken(). Spellings that do
//not parse, or whose pattern is invalid, are reported through qCritical()
//and add nothing to any list.
template <typename _Parser, typename _Table>
QRegexParser<_Parser, _Table>::QRegexParser(int maxMatchLen) : d(new Data()), m_loc(0), m_lastNewlinePosition(0), m_lineno(1), m_debug(0), m_maxMatchLen(maxMatchLen)
{
    //Splits a spelling into token name, optional ", M" flag and the pattern
    REGEX re(QStringLiteral("\\[([_a-zA-Z][_0-9a-zA-Z]*)(,\\s*M)?\\](.+)$"));
#ifdef QT_BOOTSTRAPPED
    REGEX nameMatch(QStringLiteral("\\((\\?<(.*)>).+\\)"));
    nameMatch.setMinimal(true);
#else
    re.optimize();
#endif
    QMap<QString, int> token_lookup;
    QMap<int, QString> names;
    for (int i = 1; i < _Table::lhs[0]; i++) {
        const QString text = QLatin1String(_Table::spell[i]);
        names.clear();
#ifdef QT_BOOTSTRAPPED
        if (re.indexIn(text) == 0) {
            const QString token = re.cap(1);
            const bool multiline = re.cap(2).length() > 0;
            QString pattern = re.cap(3);
            //We need to identify/remove any match names in the pattern, since
            //QRegExp doesn't support that feature
            int pos = 0, counter = 1, loc = nameMatch.indexIn(pattern, pos);
            while (loc >= 0) {
                const QString res = nameMatch.cap(2);
                if (!res.isEmpty()) {
                    names.insert(counter, res);
                    pattern.remove(nameMatch.cap(1));
                }
                //indexIn() returns an absolute index, so the next search
                //position is assigned rather than accumulated; accumulating
                //overshoots from the second named group onwards.
                pos = loc + nameMatch.matchedLength() - nameMatch.cap(1).length();
                loc = nameMatch.indexIn(pattern, pos);
                ++counter;
            }
            //We need to use indexIn, but that will search past the location we
            //pass in. So prepend '^' and use QRegExp::CaretAtOffset.
            if (pattern.at(0) != QChar(QLatin1Char('^')))
                pattern.prepend(QChar(QLatin1Char('^')));
#else
        QRegularExpressionMatch match = re.match(text, 0, QRegularExpression::NormalMatch, QRegularExpression::DontCheckSubjectStringMatchOption);
        if (match.hasMatch()) {
            const QString token = match.captured(1);
            const bool multiline = match.captured(2).length() > 0;
            const QString pattern = match.captured(3);
#endif
            //Tokens declared several times under one name share the symbol
            //index of the first declaration
            int index = i;
            if (token_lookup.contains(token))
                index = token_lookup[token];
            else
                token_lookup[token] = i;
#ifdef QT_BOOTSTRAPPED
            if (multiline)
                qWarning() << "The multiline grammar option is ignored in force_bootstrap mode.";
#endif
            REGEX pat(pattern);
#ifndef QT_BOOTSTRAPPED
            if (multiline)
                pat.setPatternOptions(QRegularExpression::DotMatchesEverythingOption);
#endif
            if (!pat.isValid())
                qCritical() << "Pattern error for token #" << i << "for" << text << "pattern =" << pat << ":" << pat.errorString();
            else {
#ifndef QT_BOOTSTRAPPED
                pat.optimize();
                int counter = 0;
                const auto namedCaptureGroups = pat.namedCaptureGroups();
                for (const QString &name : namedCaptureGroups) {
                    if (!name.isEmpty())
                        names.insert(counter, name);
                    ++counter;
                }
#endif
                //The name is recorded together with the regex so that an
                //invalid pattern cannot shift m_tokenNames against m_regexes
                m_tokenNames.append(token);
                m_names.append(names);
                m_regexes.append(pat);
                if (token.startsWith(QLatin1String("ignore")))
                    m_tokens.append(-1);
                else
                    m_tokens.append(index);
            }
        } else {
            qCritical() << "Error parsing regex at token #" << i << "for" << text << "Invalid syntax";
        }
    }
}
//Installs buffer as the text to parse and restarts scanning at its beginning.
template <typename _Parser, typename _Table>
void QRegexParser<_Parser, _Table>::setBuffer(const QString &buffer)
{
    m_buffer = buffer;
    //Reset the scan state to the values the constructor uses. Without this a
    //second buffer would be scanned from the position reached in the previous
    //one, which can even lie beyond the end of a shorter buffer.
    m_loc = 0;
    m_lastNewlinePosition = 0;
    m_lineno = 1;
    m_lastMatchText.clear();
}
//Reads everything available from device as text and installs it as the
//parse buffer through setBuffer().
template <typename _Parser, typename _Table>
void QRegexParser<_Parser, _Table>::setBufferFromDevice(QIODevice *device)
{
    QTextStream stream(device);
    const QString contents = stream.readAll();
    setBuffer(contents);
}
//Enables debug mode and prints every grammar rule through qDebug: the rule
//number and left-hand side first, then one line per right-hand-side symbol.
template <typename _Parser, typename _Table>
void QRegexParser<_Parser, _Table>::setDebug()
{
    m_debug = true;
    for (int rule = 0; rule < _Table::RULE_COUNT; ++rule) {
        //rule_info holds the left-hand side at rule_index[rule], directly
        //followed by rhs[rule] right-hand-side symbols
        const int lhsPos = _Table::rule_index[rule];
        const int rhsBegin = lhsPos + 1;
        const int rhsEnd = rhsBegin + _Table::rhs[rule];
        qDebug("%3d) %s ::=", rule + 1, _Table::spell[_Table::rule_info[lhsPos]]);

        for (int pos = rhsBegin; pos < rhsEnd; ++pos) {
            const int symbol = _Table::rule_info[pos];
            const bool isRegexToken = symbol > 0 && symbol < _Table::lhs[0];
            if (isRegexToken) {
                //Symbols below lhs[0] are the tokens built by the constructor
                qDebug(" token_%s (pattern = %s)",qPrintable(m_tokenNames[symbol-1]),qPrintable(m_regexes[symbol-1].pattern()));
                continue;
            }
            const char *name = _Table::spell[symbol];
            if (name)
                qDebug(" %s", name);
            else
                qDebug(" #%d", symbol);
        }
        qDebug();
    }
}
//Scans the buffer from m_loc and returns the next token for the parser.
//
//All candidate regexes are tried at the current position and the longest
//match wins; on a tie the regex defined first is kept. Matches of ignored
//tokens (stored as -1 in m_tokens) are consumed and scanning continues.
//Returns _Table::EOF_SYMBOL at the end of the buffer. If no regex matches,
//the error string is set and -1 is returned.
//
//NOTE(review): a pattern that matches the empty string for an ignored token
//does not advance m_loc, so this loop would not terminate for such a grammar.
template <typename _Parser, typename _Table>
int QRegexParser<_Parser, _Table>::nextToken()
{
    int token = -1;
    while (token < 0)
    {
        //">=" also covers a scan position that lies beyond the buffer end
        if (m_loc >= m_buffer.size())
            return _Table::EOF_SYMBOL;

        //Check m_lastMatchText for newlines and update m_lineno
        //This isn't necessary, but being able to provide the line # and character #
        //where the match is failing sure makes building/debugging grammars easier.
        const int newlineCount = m_lastMatchText.count(QLatin1Char('\n'));
        if (newlineCount > 0) {
            m_lineno += newlineCount;
            //m_loc was already advanced past the previous match, so the match
            //started m_lastMatchText.size() characters before m_loc. Record
            //the absolute buffer position right after its last newline.
            const int matchStart = m_loc - m_lastMatchText.size();
            m_lastNewlinePosition = matchStart + m_lastMatchText.lastIndexOf(QLatin1Char('\n')) + 1;
        }
        //Each match is accounted for once, even if this function is entered
        //again after a failed match
        m_lastMatchText.clear();

        if (m_debug) {
            qDebug();
            qDebug() << "nextToken loop, line =" << m_lineno
                     << "line position =" << m_loc - m_lastNewlinePosition
                     << "next 5 characters =" << escapeString(m_buffer.mid(m_loc, 5));
        }
        int best = -1, maxLen = -1;
#ifndef QT_BOOTSTRAPPED
        QRegularExpressionMatch bestRegex;
#endif
        //Find the longest match.
        //If more than one are the same (longest) length, return the first one in
        //the order defined.
        QList<MatchCandidate> candidates;
#ifndef QT_BOOTSTRAPPED
        {
            //We use PCRE's PartialMatch to eliminate most of the regexes by the first
            //character, so we keep a regexCandidates map with the list of possible regexes
            //based on initial characters found so far.
            const QChar nextChar = m_buffer.at(m_loc);
            //Populate the list if we haven't seen this character before
            if (!regexCandidates.contains(nextChar)) {
# if (QT_VERSION >= QT_VERSION_CHECK(5, 5, 0))
                const QStringRef tmp = m_buffer.midRef(m_loc,1);
# else
                const QString tmp = m_buffer.mid(m_loc,1);
# endif
                int i = 0;
                regexCandidates[nextChar] = QList<int>();
                for (const QRegularExpression &re : qAsConst(m_regexes))
                {
                    QRegularExpressionMatch match = re.match(tmp, 0, QRegularExpression::PartialPreferFirstMatch, QRegularExpression::DontCheckSubjectStringMatchOption);
                    //qDebug() << nextChar << tmp << match.hasMatch() << match.hasPartialMatch() << re.pattern();
                    if (match.hasMatch() || match.hasPartialMatch())
                        regexCandidates[nextChar] << i;
                    i++;
                }
            }
            const auto indices = regexCandidates.value(nextChar);
            for (int i : indices)
            {
                //Seems like I should be able to run the regex on the entire string, but performance is horrible
                //unless I use a substring.
                //QRegularExpressionMatch match = m_regexes[i].match(m_buffer, m_loc, QRegularExpression::NormalMatch, QRegularExpression::AnchoredMatchOption);
# if (QT_VERSION >= QT_VERSION_CHECK(5, 5, 0))
                QRegularExpressionMatch match = m_regexes.at(i).match(m_buffer.midRef(m_loc, m_maxMatchLen), 0, QRegularExpression::NormalMatch, QRegularExpression::AnchoredMatchOption | QRegularExpression::DontCheckSubjectStringMatchOption);
# else
                QRegularExpressionMatch match = m_regexes.at(i).match(m_buffer.mid(m_loc, m_maxMatchLen), 0, QRegularExpression::NormalMatch, QRegularExpression::AnchoredMatchOption | QRegularExpression::DontCheckSubjectStringMatchOption);
# endif
                if (match.hasMatch()) {
                    if (m_debug)
                        candidates << MatchCandidate(m_tokenNames[i], match.captured(), i);
                    if (match.capturedLength() > maxLen) {
                        best = i;
                        maxLen = match.capturedLength();
                        bestRegex = match;
                    }
                }
            }
        }
#else
        {
            int i = 0;
            for (const QRegExp &r : qAsConst(m_regexes))
            {
                if (r.indexIn(m_buffer, m_loc, QRegExp::CaretAtOffset) == m_loc) {
                    if (m_debug)
                        candidates << MatchCandidate(m_tokenNames[i], r.cap(0), i);
                    if (r.matchedLength() > maxLen) {
                        best = i;
                        maxLen = r.matchedLength();
                    }
                }
                ++i;
            }
        }
#endif
        if (best < 0) {
            setErrorString(QLatin1String("Error generating tokens from file, next characters >%1<").arg(m_buffer.midRef(m_loc, 15)));
            return -1;
        } else {
            //Replace the stored captures only if the winning pattern has named groups
            const QMap<int, QString> &map = m_names.at(best);
            if (!map.isEmpty())
                m_captured.clear();
            for (auto iter = map.cbegin(), end = map.cend(); iter != end; ++iter) {
#ifdef QT_BOOTSTRAPPED
                m_captured.insert(iter.value(), m_regexes.at(best).cap(iter.key()));
#else
                m_captured.insert(iter.value(), bestRegex.captured(iter.key()));
#endif
            }
            if (m_debug) {
                qDebug() << "Match candidates:";
                for (const MatchCandidate &m : qAsConst(candidates)) {
                    QLatin1String result = m.index == best ? QLatin1String(" * ") : QLatin1String(" ");
                    qDebug() << qPrintable(result) << qPrintable(m.name) << qPrintable(escapeString(m.matchText));
                }
            }
            m_loc += maxLen;
            //Ignored tokens keep token at -1, so the loop scans on
            if (m_tokens.at(best) >= 0)
                token = m_tokens.at(best);
#ifdef QT_BOOTSTRAPPED
            m_lastMatchText = m_regexes.at(best).cap(0);
#else
            m_lastMatchText = bestRegex.captured(0);
#endif
        }
    }
    return token;
}
QT_END_NAMESPACE
#endif // QREGEXPARSER_H