// blob: 50094b00ee68960d5307f9c1ac9981893a4b191e [file] [log] [blame]
package org.codehaus.jackson.impl;
import java.io.*;
import org.codehaus.jackson.*;
import org.codehaus.jackson.format.InputAccessor;
import org.codehaus.jackson.format.MatchStrength;
import org.codehaus.jackson.io.*;
import org.codehaus.jackson.sym.BytesToNameCanonicalizer;
import org.codehaus.jackson.sym.CharsToNameCanonicalizer;
/**
* This class is used to determine the encoding of byte stream
* that is to contain JSON content. Rules are fairly simple, and
* defined in JSON specification (RFC-4627 or newer), except
* for BOM handling, which is a property of underlying
* streams.
*/
public final class ByteSourceBootstrapper
{
/** First byte of the three-byte UTF-8 byte-order mark (EF BB BF). */
final static byte UTF8_BOM_1 = (byte) 0xEF;
/** Second byte of the UTF-8 byte-order mark. */
final static byte UTF8_BOM_2 = (byte) 0xBB;
/** Third byte of the UTF-8 byte-order mark. */
final static byte UTF8_BOM_3 = (byte) 0xBF;
/*
/**********************************************************
/* Configuration
/**********************************************************
*/
/**
* Context used to allocate the read buffer and to store / look up
* the detected encoding.
*/
protected final IOContext _context;
/**
* Stream that content is read from; null when the instance was
* constructed over a caller-supplied byte array.
*/
protected final InputStream _in;
/*
/**********************************************************
/* Input buffering
/**********************************************************
*/
/**
* Buffer that holds the bytes being inspected: either allocated from
* the context (stream input) or handed in by the caller (array input).
*/
protected final byte[] _inputBuffer;
/** Index of the next unconsumed byte within the input buffer. */
private int _inputPtr;
/** Index one past the last valid byte within the input buffer. */
private int _inputEnd;
/**
* Flag that indicates whether buffer above is to be recycled
* after being used or not.
*/
private final boolean _bufferRecyclable;
/*
/**********************************************************
/* Input location
/**********************************************************
*/
/**
* Current number of input units (bytes or chars) that were processed in
* previous blocks,
* before contents of current input buffer.
*<p>
* Note: includes possible BOMs, if those were part of the input.
*/
protected int _inputProcessed;
/*
/**********************************************************
/* Data gathered
/**********************************************************
*/
/**
* Byte order of the detected encoding; assigned by the BOM and
* content checks, and only consulted for 2- and 4-byte encodings.
*/
protected boolean _bigEndian = true;
/**
* Number of bytes per encoded unit (1, 2 or 4) once an encoding has
* been recognized.
*/
protected int _bytesPerChar = 0; // 0 means "dunno yet"
/*
/**********************************************************
/* Life-cycle
/**********************************************************
*/
/**
 * Creates a bootstrapper that pulls its content from a stream.
 * The working buffer is obtained from the given context and is flagged
 * as recyclable; no bytes are considered loaded yet.
 *
 * @param ctxt context used for buffer allocation and encoding bookkeeping
 * @param in stream to read content from
 */
public ByteSourceBootstrapper(IOContext ctxt, InputStream in)
{
    _context = ctxt;
    _in = in;
    _inputBuffer = ctxt.allocReadIOBuffer();
    _bufferRecyclable = true;
    // Buffer starts out empty
    _inputPtr = 0;
    _inputEnd = 0;
    _inputProcessed = 0;
}
/**
 * Creates a bootstrapper over a caller-supplied byte array; there is no
 * backing stream, and the buffer is flagged as not recyclable.
 *
 * @param ctxt context used for encoding bookkeeping
 * @param inputBuffer array that holds the content
 * @param inputStart offset of the first content byte within the array
 * @param inputLen number of content bytes
 */
public ByteSourceBootstrapper(IOContext ctxt, byte[] inputBuffer, int inputStart, int inputLen)
{
    _context = ctxt;
    _in = null;
    _inputBuffer = inputBuffer;
    _bufferRecyclable = false;
    // Valid content is the window [inputStart, inputStart + inputLen)
    _inputPtr = inputStart;
    _inputEnd = inputStart + inputLen;
    // Negative start offset keeps reported locations relative to the first content byte
    _inputProcessed = -inputStart;
}
/*
/**********************************************************
/* Encoding detection during bootstrapping
/**********************************************************
*/
/**
 * Method that should be called after constructing an instance.
 * It figures out which encoding the content uses, stores it in the
 * context, and returns it, so that a matching scanner can be created.
 *<p>
 * Detection order: byte-order mark first, then the zero-byte pattern
 * of the first character (UTF-32, then UTF-16). If nothing is
 * recognized, UTF-8 is assumed.
 *
 * @return Encoding detected (or defaulted to)
 */
public JsonEncoding detectEncoding()
    throws IOException, JsonParseException
{
    final boolean recognized;
    /* Four bytes can be required up front: no combination of BOM plus
     * valid JSON content is shorter than that.
     */
    if (ensureLoaded(4)) {
        final int base = _inputPtr;
        final int quad = (_inputBuffer[base] << 24)
            | ((_inputBuffer[base + 1] & 0xFF) << 16)
            | ((_inputBuffer[base + 2] & 0xFF) << 8)
            | (_inputBuffer[base + 3] & 0xFF);
        /* Without a BOM, fall back to inspecting the first character,
         * which has to be 7-bit ASCII in any encoding JSON allows.
         */
        recognized = handleBOM(quad)
            || checkUTF32(quad)
            || checkUTF16(quad >>> 16);
    } else if (ensureLoaded(2)) {
        final int base = _inputPtr;
        final int i16 = ((_inputBuffer[base] & 0xFF) << 8)
            | (_inputBuffer[base + 1] & 0xFF);
        recognized = checkUTF16(i16);
    } else {
        recognized = false;
    }

    final JsonEncoding enc;
    if (!recognized || _bytesPerChar == 1) {
        // Nothing recognized means UTF-8, as per specs
        enc = JsonEncoding.UTF8;
    } else if (_bytesPerChar == 2) {
        enc = _bigEndian ? JsonEncoding.UTF16_BE : JsonEncoding.UTF16_LE;
    } else if (_bytesPerChar == 4) {
        enc = _bigEndian ? JsonEncoding.UTF32_BE : JsonEncoding.UTF32_LE;
    } else {
        throw new RuntimeException("Internal error"); // should never get here
    }
    _context.setEncoding(enc);
    return enc;
}
/*
/**********************************************************
/* Constructing a Reader
/**********************************************************
*/
/**
 * Constructs a {@link Reader} that decodes the remaining content using
 * the encoding currently stored in the context (normally set by
 * {@link #detectEncoding}).
 *<p>
 * UTF-32 is decoded by {@link UTF32Reader}; UTF-16 and UTF-8 go through
 * a standard {@link InputStreamReader}, fed either by the original
 * stream (with already-buffered bytes merged in front of it) or, for
 * array-backed input, by a stream over the remaining bytes of the array.
 *
 * @return Reader positioned at the first unconsumed byte
 */
public Reader constructReader()
    throws IOException
{
    JsonEncoding enc = _context.getEncoding();
    switch (enc) {
    case UTF32_BE:
    case UTF32_LE:
        return new UTF32Reader(_context, _in, _inputBuffer, _inputPtr, _inputEnd,
                _context.getEncoding().isBigEndian());
    case UTF16_BE:
    case UTF16_LE:
    case UTF8: // only in non-common case where we don't want to do direct mapping
        {
            // First: do we have a Stream? If not, need to create one:
            InputStream in = _in;
            if (in == null) {
                /* Third argument of ByteArrayInputStream is a LENGTH, not an
                 * end offset: pass the number of remaining bytes so that the
                 * reader can not run past the caller's window when reading
                 * starts at a non-zero offset (BOM skipped, or input that
                 * does not start at index 0).
                 */
                in = new ByteArrayInputStream(_inputBuffer, _inputPtr, _inputEnd - _inputPtr);
            } else {
                /* Also, if we have any read but unused input (usually true),
                 * need to merge that input in:
                 */
                if (_inputPtr < _inputEnd) {
                    in = new MergedStream(_context, in, _inputBuffer, _inputPtr, _inputEnd);
                }
            }
            return new InputStreamReader(in, enc.getJavaName());
        }
    }
    throw new RuntimeException("Internal error"); // should never get here
}
/**
 * Detects the encoding and constructs the parser best suited for it.
 *<p>
 * UTF-8 content with field-name canonicalization enabled is handled by
 * the byte-based {@link Utf8StreamParser}; everything else (other
 * encodings, or UTF-8 with canonicalization disabled) goes through a
 * {@link Reader} and {@link ReaderBasedParser}.
 *
 * @param features parser feature bit set
 * @param codec codec to pass to the parser
 * @param rootByteSymbols root symbol table for the byte-based parser
 * @param rootCharSymbols root symbol table for the char-based parser
 * @return Parser constructed
 */
public JsonParser constructParser(int features, ObjectCodec codec, BytesToNameCanonicalizer rootByteSymbols, CharsToNameCanonicalizer rootCharSymbols)
    throws IOException, JsonParseException
{
    final JsonEncoding enc = detectEncoding();
    // As per [JACKSON-259], may want to fully disable canonicalization:
    final boolean canonicalize = JsonParser.Feature.CANONICALIZE_FIELD_NAMES.enabledIn(features);
    final boolean intern = JsonParser.Feature.INTERN_FIELD_NAMES.enabledIn(features);

    /* Without canonicalization the byte-based approach is not performant,
     * so the standard UTF-8 reader is used instead (which is ok for larger
     * input; not so hot for smaller; but this is not a common case).
     */
    if (enc == JsonEncoding.UTF8 && canonicalize) {
        final BytesToNameCanonicalizer byteSymbols = rootByteSymbols.makeChild(canonicalize, intern);
        return new Utf8StreamParser(_context, features, _in, codec, byteSymbols,
                _inputBuffer, _inputPtr, _inputEnd, _bufferRecyclable);
    }
    return new ReaderBasedParser(_context, features, constructReader(), codec,
            rootCharSymbols.makeChild(canonicalize, intern));
}
/*
/**********************************************************
/* Encoding detection for data format auto-detection
/**********************************************************
*/
/**
 * Heuristic check of whether the content exposed by the given accessor
 * looks like JSON.
 *<p>
 * This check is less thorough than the encoding detection done when
 * bootstrapping a parser: only an optional UTF-8 BOM is recognized, and
 * no attempt is made to detect UTF-16 or UTF-32. It should work for
 * now, and can be improved as necessary.
 *<p>
 * Results:
 *<ul>
 * <li>{@code SOLID_MATCH}: '{' followed by '"' or '}', or '[' followed
 *   by any further non-whitespace byte</li>
 * <li>{@code WEAK_MATCH}: something that looks like a scalar value
 *   (String, number, or one of the literals)</li>
 * <li>{@code INCONCLUSIVE}: input ran out before a decision</li>
 * <li>{@code NO_MATCH}: anything else</li>
 *</ul>
 *
 * @param acc accessor for the bytes to inspect
 * @return Strength of the match
 *
 * @since 1.8
 */
public static MatchStrength hasJSONFormat(InputAccessor acc) throws IOException
{
    // Ideally we should see "[" or "{"; but if not, we'll accept double-quote (String)
    // in future could also consider accepting non-standard matches?
    if (!acc.hasMoreBytes()) {
        return MatchStrength.INCONCLUSIVE;
    }
    byte b = acc.nextByte();
    // Very first thing, a UTF-8 BOM?
    if (b == UTF8_BOM_1) { // yes, looks like UTF-8 BOM
        if (!acc.hasMoreBytes()) {
            return MatchStrength.INCONCLUSIVE;
        }
        if (acc.nextByte() != UTF8_BOM_2) {
            return MatchStrength.NO_MATCH;
        }
        if (!acc.hasMoreBytes()) {
            return MatchStrength.INCONCLUSIVE;
        }
        if (acc.nextByte() != UTF8_BOM_3) {
            return MatchStrength.NO_MATCH;
        }
        if (!acc.hasMoreBytes()) {
            return MatchStrength.INCONCLUSIVE;
        }
        b = acc.nextByte();
    }
    // Then possible leading space
    int ch = skipSpace(acc, b);
    if (ch < 0) {
        return MatchStrength.INCONCLUSIVE;
    }
    // First, let's see if it looks like a structured type:
    if (ch == '{') { // JSON object?
        // Ideally we need to find either double-quote or closing bracket
        ch = skipSpace(acc);
        if (ch < 0) {
            return MatchStrength.INCONCLUSIVE;
        }
        if (ch == '"' || ch == '}') {
            return MatchStrength.SOLID_MATCH;
        }
        // ... should we allow non-standard? Let's not yet... can add if need be
        return MatchStrength.NO_MATCH;
    }
    if (ch == '[') { // JSON array?
        ch = skipSpace(acc);
        if (ch < 0) {
            return MatchStrength.INCONCLUSIVE;
        }
        /* Closing or nested opening bracket would be the easy cases, but
         * for now whatever follows the opening bracket is accepted.
         */
        return MatchStrength.SOLID_MATCH;
    }
    // plain old value is not very convincing...
    final MatchStrength strength = MatchStrength.WEAK_MATCH;
    if (ch == '"') { // string value
        return strength;
    }
    if (ch >= '0' && ch <= '9') { // number
        return strength;
    }
    if (ch == '-') { // negative number
        // note: white space between the sign and the digit is skipped here
        ch = skipSpace(acc);
        if (ch < 0) {
            return MatchStrength.INCONCLUSIVE;
        }
        return (ch >= '0' && ch <= '9') ? strength : MatchStrength.NO_MATCH;
    }
    // or one of literals
    if (ch == 'n') { // null
        return tryMatch(acc, "ull", strength);
    }
    if (ch == 't') { // true
        return tryMatch(acc, "rue", strength);
    }
    if (ch == 'f') { // false
        return tryMatch(acc, "alse", strength);
    }
    return MatchStrength.NO_MATCH;
}
/**
 * Checks that the next bytes from the accessor equal the characters of
 * the given String, in order.
 *
 * @param acc accessor to read bytes from
 * @param matchStr characters expected next
 * @param fullMatchStrength result to return if every character matches
 * @return {@code fullMatchStrength} on a full match; {@code INCONCLUSIVE}
 *   if input runs out first; {@code NO_MATCH} on the first mismatch
 */
private final static MatchStrength tryMatch(InputAccessor acc, String matchStr, MatchStrength fullMatchStrength)
    throws IOException
{
    final int end = matchStr.length();
    int pos = 0;
    while (pos < end) {
        if (!acc.hasMoreBytes()) {
            return MatchStrength.INCONCLUSIVE;
        }
        final byte actual = acc.nextByte();
        final char expected = matchStr.charAt(pos);
        if (actual != expected) {
            return MatchStrength.NO_MATCH;
        }
        ++pos;
    }
    return fullMatchStrength;
}
/**
 * Reads the next byte (if any) and skips white space starting from it.
 *
 * @param acc accessor to read bytes from
 * @return First non-white-space byte as an unsigned value, or -1 if
 *   input ran out
 */
private final static int skipSpace(InputAccessor acc) throws IOException
{
    return acc.hasMoreBytes() ? skipSpace(acc, acc.nextByte()) : -1;
}
/**
 * Skips white space (space, carriage return, line feed, tab), starting
 * with the byte passed in and continuing with bytes from the accessor.
 *
 * @param acc accessor to read further bytes from
 * @param b first byte to consider
 * @return First non-white-space byte as an unsigned value (0 - 255),
 *   or -1 if input ran out before one was found
 */
private final static int skipSpace(InputAccessor acc, byte b) throws IOException
{
    while (true) {
        // mask so that bytes above 0x7F do not turn into negative values
        int ch = (int) b & 0xFF;
        if (!(ch == ' ' || ch == '\r' || ch == '\n' || ch == '\t')) {
            return ch;
        }
        if (!acc.hasMoreBytes()) {
            return -1;
        }
        b = acc.nextByte();
    }
}
/*
/**********************************************************
/* Internal methods, parsing
/**********************************************************
*/
/**
 * Checks whether the given four bytes start with a byte-order mark;
 * if so, records the unit width and byte order it implies and advances
 * the input pointer past the mark.
 *
 * @param quad first four input bytes, first byte in the highest bits
 * @return True if a BOM was successfully found, and encoding
 *   thereby recognized.
 * @throws IOException if the BOM indicates an unsupported UCS-4 byte order
 */
private boolean handleBOM(int quad)
    throws IOException
{
    /* Handling of (usually) optional BOM (required for
     * multi-byte formats); first 32-bit charsets:
     */
    if (quad == 0x0000FEFF || quad == 0xFFFE0000) { // UCS-4, BE or LE
        _bigEndian = (quad == 0x0000FEFF);
        _inputPtr += 4;
        _bytesPerChar = 4;
        return true;
    }
    if (quad == 0x0000FFFE) { // UCS-4, unusual byte order
        reportWeirdUCS4("2143"); // throws exception
    }
    if (quad == 0xFEFF0000) { // UCS-4, unusual byte order
        reportWeirdUCS4("3412"); // throws exception
    }
    // Ok, if not, how about 16-bit encoding BOMs?
    final int msw = quad >>> 16;
    if (msw == 0xFEFF || msw == 0xFFFE) { // UTF-16, BE or LE
        _bigEndian = (msw == 0xFEFF);
        _inputPtr += 2;
        _bytesPerChar = 2;
        return true;
    }
    // And if not, then UTF-8 BOM?
    if ((quad >>> 8) == 0xEFBBBF) {
        _inputPtr += 3;
        _bytesPerChar = 1;
        _bigEndian = true; // doesn't really matter
        return true;
    }
    return false;
}
/**
 * Checks whether the first four bytes (known not to start with a BOM)
 * look like a single character encoded in UTF-32: exactly one of the
 * four bytes may be non-zero. Records byte order and unit width on
 * success; the input pointer is not moved since this is regular content.
 *
 * @param quad first four input bytes, first byte in the highest bits
 * @return True if content was recognized as UTF-32
 * @throws IOException if the byte pattern indicates an unsupported
 *   UCS-4 byte order
 */
private boolean checkUTF32(int quad)
    throws IOException
{
    final boolean bigEndian = (quad >> 8) == 0; // 0x000000?? -> UTF32-BE
    final boolean littleEndian = (quad & 0x00FFFFFF) == 0; // 0x??000000 -> UTF32-LE
    if (bigEndian || littleEndian) {
        // big-endian check takes precedence if both patterns match
        _bigEndian = bigEndian;
    } else if ((quad & ~0x00FF0000) == 0) { // 0x00??0000 -> unusual byte order
        reportWeirdUCS4("3412");
    } else if ((quad & ~0x0000FF00) == 0) { // 0x0000??00 -> unusual byte order
        reportWeirdUCS4("2143");
    } else {
        // Can not be valid UTF-32 encoded JSON...
        return false;
    }
    _bytesPerChar = 4;
    return true;
}
/**
 * Checks whether the first two bytes (known not to be a BOM) look like
 * a single character encoded in UTF-16: one of the two bytes has to be
 * zero. Records byte order and unit width on success; the input
 * pointer is not moved since this is regular content.
 *
 * @param i16 first two input bytes, first byte in bits 8 - 15
 * @return True if content was recognized as UTF-16
 */
private boolean checkUTF16(int i16)
{
    final boolean firstByteZero = (i16 & 0xFF00) == 0; // UTF-16BE
    final boolean secondByteZero = (i16 & 0x00FF) == 0; // UTF-16LE
    if (!firstByteZero && !secondByteZero) { // nope, not UTF-16
        return false;
    }
    // big-endian check takes precedence if both bytes are zero
    _bigEndian = firstByteZero;
    _bytesPerChar = 2;
    return true;
}
/*
/**********************************************************
/* Internal methods, problem reporting
/**********************************************************
*/
/**
 * Reports an unsupported UCS-4 byte order; never returns normally.
 *
 * @param type byte-order label to include in the message (such as "2143")
 * @throws IOException always (a {@link CharConversionException})
 */
private void reportWeirdUCS4(String type)
    throws IOException
{
    final String msg = "Unsupported UCS-4 endianness ("+type+") detected";
    throw new CharConversionException(msg);
}
/*
/**********************************************************
/* Internal methods, raw input access
/**********************************************************
*/
/**
 * Makes sure that at least the given number of unconsumed bytes is
 * available in the input buffer, reading more from the stream if
 * needed. The buffer is assumed to have enough room, which holds for
 * the small amounts this method is called with.
 *
 * @param minimum number of bytes that need to be available
 * @return True if that many bytes are available; false if there is no
 *   stream to read from, or a read returned fewer than one byte
 */
protected boolean ensureLoaded(int minimum)
    throws IOException
{
    int available = _inputEnd - _inputPtr;
    while (available < minimum) {
        if (_in == null) { // block source, nothing more to load
            return false;
        }
        final int room = _inputBuffer.length - _inputEnd;
        final int got = _in.read(_inputBuffer, _inputEnd, room);
        if (got < 1) {
            return false;
        }
        _inputEnd += got;
        available += got;
    }
    return true;
}
}