// Source blob: 9e0da3bb07d9d93dc5452420c8014394ac064ff5
package org.codehaus.jackson.smile;
/**
 * Shared constants of the Smile binary format: size thresholds, framing
 * markers, header bytes/bits, token type codes and a UTF-8 lead-byte
 * lookup table. Used by {@link SmileGenerator} and {@link SmileParser}.
 *
 * @author tatu
 */
public final class SmileConstants
{
    /*
    /**********************************************************
    /* Thresholds
    /**********************************************************
     */

    /**
     * Upper bound (in UTF-8 bytes) for value Strings that may use the
     * special "short" encoded forms: 64 bytes or less.
     */
    public static final int MAX_SHORT_VALUE_STRING_BYTES = 64;

    /**
     * Upper bound (in UTF-8 bytes) for field names that may use the
     * special "short" encoded forms: 64 bytes or less.
     */
    public static final int MAX_SHORT_NAME_ASCII_BYTES = 64;

    /**
     * Upper bound for short non-ASCII names. Slightly lower than the ASCII
     * limit because bytes 0xF8 and above have to stay reserved (one length
     * is gained back since lengths 0 and 1 are not valid).
     */
    public static final int MAX_SHORT_NAME_UNICODE_BYTES = 56;

    /**
     * Number of field names tracked for back references. The longest name
     * back reference is 10 bits wide, so there is no point in keeping
     * many more than this around.
     */
    public static final int MAX_SHARED_NAMES = 1024;

    /**
     * Number of short String values tracked for back references. The
     * longest such back reference is 10 bits wide, i.e. up to
     * {@code (1 << 10)} values.
     */
    public static final int MAX_SHARED_STRING_VALUES = 1024;

    /**
     * Names of any length can be referenced, but for text values only the
     * ones considered "tiny" or "short" (those encoded with a length prefix)
     * are candidates for sharing. This therefore has to be the maximum
     * length of a String that can be encoded that way.
     */
    public static final int MAX_SHARED_STRING_LENGTH_BYTES = 65;

    /**
     * Amount of free output buffer space that encoding code can require
     * up front before writing a possibly-short String, which keeps that
     * logic tight and simple. The factor 3 is there because the longest
     * UTF-8 encoding of a single Java char is 3 bytes.
     * Extra space is reserved for the token indicator and for the end
     * marker (needed in case the value turns out not to be a short String).
     * <p>
     * NOTE(review): the original comment speaks of two extra bytes while
     * the expression adds only one; value is left as-is -- confirm intent.
     */
    public static final int MIN_BUFFER_FOR_POSSIBLE_SHORT_STRING = 1 + (3 * 65);

    /*
    /**********************************************************
    /* Byte markers
    /**********************************************************
     */

    /**
     * Marker byte that terminates variable-length Strings. A null byte
     * would be the common choice but is avoided because it can not be
     * embedded in Web Sockets content (and neither can 0xFF). Of the
     * several byte values that never occur in UTF-8, 0xFC was picked to
     * get a sensible ordering: the highest values carry the most
     * significant framing function, 0xFF being end-of-content and so on.
     */
    public static final int INT_MARKER_END_OF_STRING = 0xFC;

    /**
     * Same marker as {@link #INT_MARKER_END_OF_STRING}, narrowed to byte.
     */
    public static final byte BYTE_MARKER_END_OF_STRING = (byte) INT_MARKER_END_OF_STRING;

    /**
     * Optional framing marker that allows splitting physical data (such as
     * a file) into distinct logical sections, like JSON documents. 0xFF is
     * a natural choice since Web Sockets also use it as an end marker.
     */
    public static final byte BYTE_MARKER_END_OF_CONTENT = (byte) 0xFF;

    /*
    /**********************************************************
    /* Format header: put smile on your data...
    /**********************************************************
     */

    /**
     * Header byte #1: ASCII colon.
     */
    public static final byte HEADER_BYTE_1 = (byte) ':';

    /**
     * Header byte #2: ASCII closing parenthesis.
     */
    public static final byte HEADER_BYTE_2 = (byte) ')';

    /**
     * Header byte #3: ASCII linefeed.
     */
    public static final byte HEADER_BYTE_3 = (byte) '\n';

    /**
     * Version nibble of the current format version: four zero bits.
     */
    public static final int HEADER_VERSION_0 = 0x0;

    /**
     * Header byte #4: version nibble in the upper four bits; the lower
     * bits may carry the {@code HEADER_BIT_xxx} flags.
     */
    public static final byte HEADER_BYTE_4 = (HEADER_VERSION_0 << 4);

    /**
     * Header flag telling whether content may contain shared names (back
     * references to recently encoded field names). Without a header,
     * content must be processed as if this flag was set. Only when a
     * header exists and the bit is 0 may a parser skip storing seen
     * names, since then no back references are guaranteed to exist.
     */
    public static final int HEADER_BIT_HAS_SHARED_NAMES = 0x01;

    /**
     * Header flag telling whether content may contain shared String values
     * (back references to recently encoded 'short' String values, where
     * short is defined as 64 bytes or less). Without a header this can be
     * assumed to be 0 (false). When a header exists and the bit is 1,
     * parsers have to retain up to 1024 most recently seen distinct short
     * String values.
     * <p>
     * NOTE(review): the "64 bytes or less" wording is kept from the original
     * comment, while {@link #MAX_SHARED_STRING_LENGTH_BYTES} is 65 --
     * confirm which limit actually applies.
     */
    public static final int HEADER_BIT_HAS_SHARED_STRING_VALUES = 0x02;

    /**
     * Header flag telling whether content may contain raw (unquoted)
     * binary values. Without a header this can be assumed to be 0 (false).
     * When a header exists and the bit is 1, a parser can not rely on
     * specific byte values always having their default meaning (in
     * particular, content end marker 0xFF and the header signature may
     * show up inside binary values).
     *<p>
     * The bit being set does not by itself mean such raw binary content
     * is present, only that it may be: the header is written before any
     * binary data can be written.
     */
    public static final int HEADER_BIT_HAS_RAW_BINARY = 0x04;

    /*
    /**********************************************************
    /* Type prefixes: 3 MSB of token byte
    /**********************************************************
     */

    // Short back references to recently seen short String values.
    // 0x00 itself is reserved: not used by the current version (may be used in future).
    // NOTE(review): the original comment said "last 63 short (< 64 byte) string
    // values", whereas TOKEN_MISC_SHARED_STRING_LONG documents the short
    // reference range as 0 - 30 -- confirm which is right.
    public static final int TOKEN_PREFIX_SHARED_STRING_SHORT = 0x00;

    // Range 0x20 - 0x3F holds literals (leaving room for the smiley header
    // markers) along with ints/doubles; a dedicated prefix constant for it,
    // TOKEN_PREFIX_MISC_NUMBERS = 0x20, is intentionally not defined.

    public static final int TOKEN_PREFIX_TINY_ASCII = 0x40;
    public static final int TOKEN_PREFIX_SMALL_ASCII = 0x60;
    public static final int TOKEN_PREFIX_TINY_UNICODE = 0x80;
    public static final int TOKEN_PREFIX_SHORT_UNICODE = 0xA0;

    // Small ints: integer constants in range -16 to +15
    // NOTE(review): the original comment called these "4-bit", although the
    // stated range has 32 values -- confirm.
    public static final int TOKEN_PREFIX_SMALL_INT = 0xC0;

    // Misc types leave the end of their range empty too, reserving 0xF8 - 0xFF
    public static final int TOKEN_PREFIX_MISC_OTHER = 0xE0;

    /*
    /**********************************************************
    /* Token literals, normal mode
    /**********************************************************
     */

    // Non-structured literals come first
    public static final byte TOKEN_LITERAL_EMPTY_STRING = 0x20;
    public static final byte TOKEN_LITERAL_NULL = 0x21;
    public static final byte TOKEN_LITERAL_FALSE = 0x22;
    public static final byte TOKEN_LITERAL_TRUE = 0x23;

    // Followed by structured literals (array/object start and end)
    public static final byte TOKEN_LITERAL_START_ARRAY = (byte) 0xF8;
    public static final byte TOKEN_LITERAL_END_ARRAY = (byte) 0xF9;
    public static final byte TOKEN_LITERAL_START_OBJECT = (byte) 0xFA;
    public static final byte TOKEN_LITERAL_END_OBJECT = (byte) 0xFB;

    /*
    /**********************************************************
    /* Subtype constants for misc text/binary types
    /**********************************************************
     */

    /**
     * Type code for regular integral types (byte/short/int/long);
     * the 2 LSB carry one of the {@code TOKEN_MISC_INTEGER_xxx} subtypes.
     */
    public static final int TOKEN_MISC_INTEGER = 0x24;

    /**
     * Type code for regular floating-point types (float, double);
     * the 2 LSB carry one of the {@code TOKEN_MISC_FLOAT_xxx} subtypes.
     */
    public static final int TOKEN_MISC_FP = 0x28;

    /**
     * Type code (misc, other) for variable length UTF-8 encoded text that
     * is known to consist of ASCII chars only.
     * Note: 2 LSB are reserved for future use; must be zeroes for now
     */
    public static final int TOKEN_MISC_LONG_TEXT_ASCII = 0xE0;

    /**
     * Type code (misc, other) for variable length UTF-8 encoded text that
     * is NOT known to consist of ASCII chars only (that is, it MAY
     * contain multi-byte characters).
     * Note: 2 LSB are reserved for future use; must be zeroes for now
     */
    public static final int TOKEN_MISC_LONG_TEXT_UNICODE = 0xE4;

    /**
     * Type code (misc, other) for "safe" binary data, encoded using only
     * the 7 LSB of each byte (an 8/7 expansion ratio). This is typically
     * done to guarantee certain byte values (like 0xFF) never appear in
     * encoded data.
     * Note: 2 LSB are reserved for future use; must be zeroes for now
     */
    public static final int TOKEN_MISC_BINARY_7BIT = 0xE8;

    /**
     * Type code (misc, other) for shared String value references whose
     * index does not fit in the "short" reference range (which is 0 - 30).
     * In that case the 2 LSB of this byte plus the whole following byte
     * together form a 10-bit index.
     */
    public static final int TOKEN_MISC_SHARED_STRING_LONG = 0xEC;

    /**
     * Marker for raw binary data. Deliberately kept separate from the
     * other types because raw binary can significantly affect framing
     * (or rather, fast scanning based on structure and framing markers).
     */
    public static final int TOKEN_MISC_BINARY_RAW = 0xFD;

    /*
    /**********************************************************
    /* Modifiers for numeric entries
    /**********************************************************
     */

    /**
     * Numeric subtype (2 LSB) for {@link #TOKEN_MISC_INTEGER}:
     * 32-bit integer (int)
     */
    public static final int TOKEN_MISC_INTEGER_32 = 0x00;

    /**
     * Numeric subtype (2 LSB) for {@link #TOKEN_MISC_INTEGER}:
     * 64-bit integer (long)
     */
    public static final int TOKEN_MISC_INTEGER_64 = 0x01;

    /**
     * Numeric subtype (2 LSB) for {@link #TOKEN_MISC_INTEGER}:
     * {@link java.math.BigInteger} type.
     */
    public static final int TOKEN_MISC_INTEGER_BIG = 0x02;

    // Note: integer subtype 3 (0x27 once combined with TOKEN_MISC_INTEGER)
    // is reserved for future use

    /**
     * Numeric subtype (2 LSB) for {@link #TOKEN_MISC_FP}:
     * 32-bit IEEE single precision floating point number.
     */
    public static final int TOKEN_MISC_FLOAT_32 = 0x00;

    /**
     * Numeric subtype (2 LSB) for {@link #TOKEN_MISC_FP}:
     * 64-bit IEEE double precision floating point number.
     */
    public static final int TOKEN_MISC_FLOAT_64 = 0x01;

    /**
     * Numeric subtype (2 LSB) for {@link #TOKEN_MISC_FP}:
     * {@link java.math.BigDecimal} type.
     */
    public static final int TOKEN_MISC_FLOAT_BIG = 0x02;

    // Note: floating-point subtype 3 (0x2B once combined with TOKEN_MISC_FP)
    // is reserved for future use

    /*
    /**********************************************************
    /* Token types for keys
    /**********************************************************
     */

    /**
     * Empty key shares its code with the empty String value
     * ({@link #TOKEN_LITERAL_EMPTY_STRING}).
     */
    public static final byte TOKEN_KEY_EMPTY_STRING = 0x20;

    public static final int TOKEN_PREFIX_KEY_SHARED_LONG = 0x30;

    public static final byte TOKEN_KEY_LONG_STRING = 0x34;

    public static final int TOKEN_PREFIX_KEY_SHARED_SHORT = 0x40;

    public static final int TOKEN_PREFIX_KEY_ASCII = 0x80;

    public static final int TOKEN_PREFIX_KEY_UNICODE = 0xC0;

    /*
    /**********************************************************
    /* Basic UTF-8 decode/encode table
    /**********************************************************
     */

    /**
     * Lookup table with UTF-8 decoding info, indexed by unsigned byte
     * value (0 - 255). Entries are "byte length - 1" of the character
     * the byte starts:
     *<ul>
     * <li>0 for single-byte codes (0x00 - 0x7F)</li>
     * <li>1 for lead bytes of 2-byte codes</li>
     * <li>2 for lead bytes of 3-byte codes</li>
     * <li>3 for lead bytes of 4-byte codes</li>
     * <li>-1 for bytes that can not start a character</li>
     *</ul>
     * Note that the array itself is shared and mutable; callers are
     * expected to only read from it.
     */
    public static final int[] sUtf8UnitLengths = buildUtf8UnitLengths();

    /**
     * Builds the 256-entry table exposed as {@link #sUtf8UnitLengths}.
     */
    private static int[] buildUtf8UnitLengths()
    {
        // Entries 0x00 - 0x7F keep the array default of 0 (single-byte codes)
        final int[] lengths = new int[256];
        for (int lead = 0x80; lead <= 0xFF; ++lead) {
            lengths[lead] = trailingByteCount(lead);
        }
        return lengths;
    }

    /**
     * Classifies one non-ASCII byte value by its high-order bit pattern.
     *
     * @param lead Unsigned byte value in range 0x80 - 0xFF
     *
     * @return Number of bytes that follow the lead byte (1, 2 or 3), or
     *   -1 if the bit pattern is none of 110xxxxx, 1110xxxx, 11110xxx
     */
    private static int trailingByteCount(int lead)
    {
        if ((lead & 0xE0) == 0xC0) { // 110xxxxx: 2 bytes (0x0080 - 0x07FF)
            return 1;
        }
        if ((lead & 0xF0) == 0xE0) { // 1110xxxx: 3 bytes (0x0800 - 0xFFFF)
            return 2;
        }
        if ((lead & 0xF8) == 0xF0) { // 11110xxx: 4 bytes; surrogate pair as Java chars
            return 3;
        }
        // Anything else: -1 serves as the "universal" error marker
        return -1;
    }
}