// blob: 33a31a6ac69a689b7f5885011cbc84e39b45def9 [file] [log] [blame]
/****************************************************************************
**
** Copyright (C) 2016 The Qt Company Ltd.
** Contact: https://www.qt.io/licensing/
**
** This file is part of the QtXmlPatterns module of the Qt Toolkit.
**
** $QT_BEGIN_LICENSE:LGPL$
** Commercial License Usage
** Licensees holding valid commercial Qt licenses may use this file in
** accordance with the commercial license agreement provided with the
** Software or, alternatively, in accordance with the terms contained in
** a written agreement between you and The Qt Company. For licensing terms
** and conditions see https://www.qt.io/terms-conditions. For further
** information use the contact form at https://www.qt.io/contact-us.
**
** GNU Lesser General Public License Usage
** Alternatively, this file may be used under the terms of the GNU Lesser
** General Public License version 3 as published by the Free Software
** Foundation and appearing in the file LICENSE.LGPL3 included in the
** packaging of this file. Please review the following information to
** ensure the GNU Lesser General Public License version 3 requirements
** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
**
** GNU General Public License Usage
** Alternatively, this file may be used under the terms of the GNU
** General Public License version 2.0 or (at your option) the GNU General
** Public license version 3 or any later version approved by the KDE Free
** Qt Foundation. The licenses are as published by the Free Software
** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
** included in the packaging of this file. Please review the following
** information to ensure the GNU General Public License requirements will
** be met: https://www.gnu.org/licenses/gpl-2.0.html and
** https://www.gnu.org/licenses/gpl-3.0.html.
**
** $QT_END_LICENSE$
**
****************************************************************************/
//
// W A R N I N G
// -------------
//
// This file is not part of the Qt API. It exists purely as an
// implementation detail. This header file may change from version to
// version without notice, or even be removed.
//
// We mean it.
#ifndef Patternist_XQueryTokenizer_H
#define Patternist_XQueryTokenizer_H
#include <QHash>
#include <QSet>
#include <QStack>
#include <QString>
#include <QUrl>
#include <private/qtokenizer_p.h>
QT_BEGIN_NAMESPACE
namespace QPatternist
{
struct TokenMap;
/**
* @short A hand-written tokenizer which tokenizes XQuery 1.0 & XPath 2.0,
* and delivers tokens to the Bison generated parser.
*
* @author Frans Englich <frans.englich@nokia.com>
*/
class XQueryTokenizer : public Tokenizer
{
public:
/**
* Tokenizer states. Organized alphabetically.
*
* m_state holds a value of this type, and m_stateStack holds a stack of
* them (see pushState(), popState(), state() and setState()).
*/
enum State
{
AfterAxisSeparator,
AposAttributeContent,
Axis,
Default,
ElementContent,
EndTag,
ItemType,
KindTest,
KindTestForPI,
NamespaceDecl,
NamespaceKeyword,
OccurrenceIndicator,
Operator,
Pragma,
PragmaContent,
ProcessingInstructionContent,
ProcessingInstructionName,
QuotAttributeContent,
StartTag,
VarName,
XMLComment,
XMLSpaceDecl,
XQueryVersion
};
/**
* Constructs a tokenizer for the query text @p query.
*
* @param query the text to tokenize.
* @param location NOTE(review): presumably the URI the query originates
* from, used when reporting source locations -- confirm in the
* implementation.
* @param startingState going by its name, the State tokenization begins
* in. Defaults to Default.
*/
XQueryTokenizer(const QString &query,
const QUrl &location,
const State startingState = Default);
/**
* Returns the next token. @p sourceLocator is the parser's location
* record for the token.
*
* NOTE(review): presumably reimplements a virtual declared in Tokenizer
* and fills in @p sourceLocator from m_line and m_columnOffset -- the
* base class and the implementation are not visible from this header.
*/
virtual Token nextToken(YYLTYPE *const sourceLocator);
/**
* NOTE(review): presumably switches the tokenizer into scan-only mode
* (see m_scanOnly) and returns a position that can later be handed to
* resumeTokenizationFrom() -- confirm in the implementation.
*
* @see attributeAsRaw(), resumeTokenizationFrom()
*/
virtual int commenceScanOnly();
/**
* NOTE(review): presumably leaves scan-only mode and continues regular
* tokenization from @p position -- confirm in the implementation.
*
* @see attributeAsRaw(), commenceScanOnly()
*/
virtual void resumeTokenizationFrom(const int position);
/**
* Does nothing.
*/
virtual void setParserContext(const ParserContext::Ptr &parseInfo);
private:
/**
* Returns the character corresponding to the builtin reference @p
* reference. For instance, passing @c gt will give you '>' in return.
*
* If @p reference is not a valid builtin reference, a null QChar is
* returned.
*
* @see QChar::isNull()
*/
QChar charForReference(const QString &reference);
/**
* NOTE(review): undocumented helper. Judging by name and signature it
* builds a token of type @p code, switches to @p state and moves the
* position forward by @p advance characters (default 1) -- confirm.
*/
inline Token tokenAndChangeState(const TokenType code,
const State state,
const int advance = 1);
/**
* Overload that additionally takes the token's string @p value and has
* no @p advance argument.
*
* NOTE(review): whether this overload moves the position at all is not
* visible from this declaration.
*/
inline Token tokenAndChangeState(const TokenType code,
const QString &value,
const State state);
/**
* NOTE(review): undocumented helper. Judging by name and signature it
* builds a token of type @p code and moves the position forward by
* @p advance characters (default 1), leaving the state alone -- confirm.
*/
inline Token tokenAndAdvance(const TokenType code,
const int advance = 1);
/**
* NOTE(review): undocumented. Presumably tokenizes a character
* reference at the current position and returns the text it stands
* for -- confirm, including what is returned on a malformed reference.
*/
QString tokenizeCharacterReference();
/**
* NOTE(review): undocumented. Presumably tokenizes a string literal
* starting at the current position -- confirm.
*/
inline Token tokenizeStringLiteral();
/**
* NOTE(review): undocumented. Presumably tokenizes a numeric literal
* starting at the current position -- confirm.
*/
inline Token tokenizeNumberLiteral();
/**
* @returns the character @p length characters from the current
* position.
*/
inline char peekAhead(const int length = 1) const;
/**
* @returns whether the stream, starting from @p offset from the
* current position, matches @p chs. The length of @p chs is @p len.
*/
inline bool aheadEquals(const char *const chs,
const int len,
const int offset = 1) const;
/**
* NOTE(review): undocumented. Presumably tokenizes an NCName starting
* at the current position -- confirm.
*
* @see isNCNameStart(), isNCNameBody(), tokenizeNCNameOrQName()
*/
inline Token tokenizeNCName();
/**
* NOTE(review): undocumented predicate on a token type; the exact set
* of types it accepts is defined in the implementation.
*/
static inline bool isOperatorKeyword(const TokenType);
/**
* NOTE(review): undocumented. Presumably @c true when @p ch is one of
* '0'-'9' -- confirm.
*/
static inline bool isDigit(const char ch);
/**
* NOTE(review): undocumented. Presumably returns the token used to
* signal a tokenization error -- confirm.
*/
static inline Token error();
/**
* NOTE(review): undocumented. Presumably skips whitespace and comments
* and reports the outcome with the same values as consumeComment() --
* confirm.
*
* @see consumeRawWhitespace(), consumeComment()
*/
inline TokenType consumeWhitespace();
/**
* @short Returns the character at the current position, converted to
* a Latin-1 @c char.
*
* Equivalent to calling:
*
* @code
* current().toLatin1();
* @endcode
*/
inline char peekCurrent() const;
/**
* @returns the character at the current position as a QChar.
*
* Disregarding encoding conversion, equivalent to calling:
*
* @code
* peekAhead(0);
* @endcode
*/
inline const QChar current() const;
/**
* Looks ahead for "::" without modifying the tokenizer (the function
* is @c const).
*
* NOTE(review): this comment used to mention a @p hadWhitespace
* parameter; the signature below has no such parameter.
*
* @returns the length of whitespace scanned before reaching "::", or
* -1 if something else was found.
*/
int peekForColonColon() const;
/**
* NOTE(review): undocumented. Presumably whether @p ch may start an
* NCName -- confirm.
*/
static inline bool isNCNameStart(const QChar ch);
/**
* NOTE(review): undocumented. Presumably whether @p ch may appear
* after the first character of an NCName -- confirm.
*/
static inline bool isNCNameBody(const QChar ch);
/**
* NOTE(review): undocumented. Presumably returns the TokenMap entry for
* @p keyword; what is returned for a non-keyword is not visible from
* this declaration.
*/
static inline const TokenMap *lookupKeyword(const QString &keyword);
/**
* NOTE(review): undocumented. Presumably restores the current state
* from m_stateStack -- confirm, including the empty-stack case.
*/
inline void popState();
/**
* NOTE(review): undocumented. Presumably pushes @p state onto
* m_stateStack -- confirm.
*/
inline void pushState(const State state);
/**
* NOTE(review): undocumented. Presumably returns m_state -- confirm.
*/
inline State state() const;
/**
* NOTE(review): undocumented. Presumably assigns @p s to m_state --
* confirm.
*/
inline void setState(const State s);
/**
* NOTE(review): undocumented predicate on the token type @p t; the
* exact set of types it accepts is defined in the implementation.
*/
static bool isTypeToken(const TokenType t);
/**
* NOTE(review): undocumented. Presumably tokenizes either an NCName or
* a QName starting at the current position -- confirm.
*
* @see tokenizeNCName()
*/
inline Token tokenizeNCNameOrQName();
/**
* Advances m_pos until @p content is encountered.
*
* @returns the length stretching from the value m_pos had on entry up
* to where @p content is encountered. @p content itself is not included
* in the length.
*
* NOTE(review): what happens when @p content is never found is not
* visible from this declaration.
*/
int scanUntil(const char *const content);
/**
* Pushes the current state.
*
* Same as calling:
* @code
* pushState(state());
* @endcode
*
* NOTE(review): this comment originally read pushState(currentState());
* no currentState() is declared in this class, so state() is assumed to
* be what was meant -- confirm.
*/
inline void pushState();
/**
* Consumes only whitespace, in the traditional sense. The function exits
* if non-whitespace is encountered, such as the start of a comment.
*
* @returns @c true if the end was reached, otherwise @c false
*/
inline bool consumeRawWhitespace();
/**
* @short Parses comments: <tt>(: comment content :)</tt>. It recurses for
* parsing nested comments.
*
* It is assumed that the start token for the comment, "(:", has
* already been parsed.
*
* Typically, don't call this function directly, but ignoreWhitespace().
* NOTE(review): no ignoreWhitespace() is declared in this class;
* consumeWhitespace() is presumably the function meant -- confirm.
*
* @see <a href="http://www.w3.org/TR/xpath20/#comments">XML Path Language (XPath)
* 2.0, 2.6 Comments</a>
* @returns
* - SUCCESS if everything went ok
* - ERROR if there was an error in parsing one or more comments
* - END_OF_FILE if the end was reached
*/
Tokenizer::TokenType consumeComment();
/**
* Determines whether @p code is a keyword
* that is followed by a second keyword. For instance <tt>declare
* function</tt>.
*/
static inline bool isPhraseKeyword(const TokenType code);
/**
* A set of indexes into the QString passed to normalizeEOL(). The
* characters at these indexes shouldn't be normalized.
*/
typedef QSet<int> CharacterSkips;
/**
* Returns @p input, normalized according to
* <a href="http://www.w3.org/TR/xquery/#id-eol-handling">XQuery 1.0:
* An XML Query Language, A.2.3 End-of-Line Handling</a>
*
* @param input the string to normalize.
* @param characterSkips indexes into @p input whose characters are to
* be left as they are, see CharacterSkips.
*/
static QString normalizeEOL(const QString &input,
const CharacterSkips &characterSkips);
/**
* @returns @c true if the current position, m_pos, equals m_length.
*/
inline bool atEnd() const
{
return m_pos == m_length;
}
/**
* Private overload of nextToken() that takes no source locator.
*
* NOTE(review): presumably the worker that the public
* nextToken(YYLTYPE *) delegates to -- confirm in the implementation.
*/
Token nextToken();
/**
* Instead of recognizing and tokenizing embedded expressions in
* direct attribute constructors, this function is essentially a mini
* recursive-descent parser that has the necessary logic to recognize
* embedded expressions and their potentially interfering string literals, in
* order to scan to the very end of the attribute value, and return the
* whole as a string.
*
* There are of course syntax errors this function will not detect, but
* that is ok since the attributes will be parsed once more.
*
* An inelegant solution, but which gets the job done.
*
* NOTE(review): the parameters are not documented. Going by their names
* and types only: @p separator is the quote character delimiting the
* attribute value, @p stack is an in/out nesting counter, @p startPos is
* where the value starts, @p inLiteral tells whether scanning is inside
* a string literal, and @p result receives the scanned text -- confirm
* each against the implementation.
*
* @see commenceScanOnly(), resumeTokenizationFrom()
*/
Token attributeAsRaw(const QChar separator,
int &stack,
const int startPos,
const bool inLiteral,
QString &result);
/**
* NOTE(review): presumably the query text passed to the constructor,
* with m_length being its length -- the constructor body is not visible
* here. Both are @c const, so they are fixed at construction.
*/
const QString m_data;
const int m_length;
/**
* The current tokenizer state, and a stack of State values.
*
* @see pushState(), popState(), state(), setState()
*/
State m_state;
QStack<State> m_stateStack;
/**
* The current position. atEnd() compares it against m_length.
*/
int m_pos;
/**
* The current line number.
*
* The line number and column number both start at 1.
*/
int m_line;
/**
* The offset for where the current column starts. As originally
* written: "the offset into m_length", so that
* m_length - m_columnOffset is the current column.
*
* NOTE(review): m_length is @c const, so a formula based on it cannot
* follow the scan position; the intended formula is presumably
* m_pos - m_columnOffset -- confirm against the implementation.
*
* The line number and column number both start at 1.
*/
int m_columnOffset;
const NamePool::Ptr m_namePool;
/**
* NOTE(review): undocumented. A stack of Token values; how it is used
* is not visible from this header.
*/
QStack<Token> m_tokenStack;
/**
* Maps a QString to a QChar.
*
* NOTE(review): presumably the lookup table behind charForReference(),
* keyed by reference name -- confirm.
*/
QHash<QString, QChar> m_charRefs;
/**
* NOTE(review): presumably @c true between commenceScanOnly() and
* resumeTokenizationFrom() -- confirm.
*/
bool m_scanOnly;
/* Qt macro: makes the class non-copyable. */
Q_DISABLE_COPY(XQueryTokenizer)
};
}
QT_END_NAMESPACE
#endif