// blob: d8bd1b246ce334973f08200e1edaaec42b2fe99a [file] [log] [blame]
package org.codehaus.jackson.smile;
import java.io.*;
import org.codehaus.jackson.JsonLocation;
import org.codehaus.jackson.JsonParseException;
import org.codehaus.jackson.JsonParser;
import org.codehaus.jackson.ObjectCodec;
import org.codehaus.jackson.format.InputAccessor;
import org.codehaus.jackson.format.MatchStrength;
import org.codehaus.jackson.io.IOContext;
import org.codehaus.jackson.sym.BytesToNameCanonicalizer;
import static org.codehaus.jackson.smile.SmileConstants.*;
/**
 * Simple bootstrapper used with the Smile format parser.
 *<p>
 * It wraps either an {@link InputStream} (reading into a buffer obtained from the
 * {@link IOContext}) or a caller-supplied byte array, peeks at the first byte to see
 * whether a Smile header is present, and constructs the {@link SmileParser}.
 * It also offers {@link #hasSmileFormat} for data format auto-detection.
 */
public class SmileParserBootstrapper
{
    /*
    /**********************************************************
    /* Configuration
    /**********************************************************
     */

    final IOContext _context;

    /**
     * Underlying stream to read from; null when input was given as a byte array.
     */
    final InputStream _in;

    /*
    /**********************************************************
    /* Input buffering
    /**********************************************************
     */

    final byte[] _inputBuffer;

    /** Index of the next unread byte in {@link #_inputBuffer}. */
    private int _inputPtr;

    /** Index one past the last valid byte in {@link #_inputBuffer}. */
    private int _inputEnd;

    /**
     * Flag that indicates whether buffer above is to be recycled
     * after being used or not.
     */
    private final boolean _bufferRecyclable;

    /*
    /**********************************************************
    /* Input location
    /**********************************************************
     */

    /**
     * Current number of input units (bytes or chars) that were processed in
     * previous blocks,
     * before contents of current input buffer.
     *<p>
     * Note: includes possible BOMs, if those were part of the input.
     */
    protected int _inputProcessed;

    /*
    /**********************************************************
    /* Life-cycle
    /**********************************************************
     */

    /**
     * Constructor for stream-backed input: the read buffer is allocated from the
     * given context and is marked as recyclable.
     *
     * @param ctxt I/O context used for buffer allocation; also passed to the parser
     * @param in Stream to read content from
     */
    public SmileParserBootstrapper(IOContext ctxt, InputStream in)
    {
        _context = ctxt;
        _in = in;
        _inputBuffer = ctxt.allocReadIOBuffer();
        _inputEnd = _inputPtr = 0;
        _inputProcessed = 0;
        _bufferRecyclable = true;
    }

    /**
     * Constructor for block input: the caller's array is used directly as the
     * buffer and is not marked as recyclable.
     *
     * @param ctxt I/O context passed to the parser
     * @param inputBuffer Array that contains the content
     * @param inputStart Offset of the first content byte within the array
     * @param inputLen Number of content bytes
     */
    public SmileParserBootstrapper(IOContext ctxt, byte[] inputBuffer, int inputStart, int inputLen)
    {
        _context = ctxt;
        _in = null;
        _inputBuffer = inputBuffer;
        _inputPtr = inputStart;
        _inputEnd = (inputStart + inputLen);
        // Need to offset this for correct location info
        _inputProcessed = -inputStart;
        _bufferRecyclable = false;
    }

    /**
     * Constructs the parser, letting it consume the header signature if input starts
     * with the first header byte.
     *
     * @param generalParserFeatures Bit set of general {@link JsonParser.Feature}s
     * @param smileFeatures Bit set of {@link SmileParser.Feature}s
     * @param codec Codec to pass to the parser
     * @param rootByteSymbols Root symbol table; a child table is created for the parser
     *
     * @return Newly constructed parser
     *
     * @throws JsonParseException if no header was handled and
     *    {@code SmileParser.Feature.REQUIRE_HEADER} is enabled in {@code smileFeatures}
     * @throws IOException if reading from the underlying stream fails
     */
    public SmileParser constructParser(int generalParserFeatures, int smileFeatures,
            ObjectCodec codec, BytesToNameCanonicalizer rootByteSymbols)
        throws IOException, JsonParseException
    {
        boolean intern = JsonParser.Feature.INTERN_FIELD_NAMES.enabledIn(generalParserFeatures);
        BytesToNameCanonicalizer can = rootByteSymbols.makeChild(true, intern);
        // We just need a single byte, really, to know if it starts with header
        ensureLoaded(1);
        SmileParser p = new SmileParser(_context, generalParserFeatures, smileFeatures,
                codec, can,
                _in, _inputBuffer, _inputPtr, _inputEnd, _bufferRecyclable);
        final boolean hasInput = (_inputPtr < _inputEnd); // only false for empty doc
        boolean hadSig = false;
        if (hasInput && _inputBuffer[_inputPtr] == SmileConstants.HEADER_BYTE_1) {
            // need to ensure it gets properly handled so caller won't see the signature
            hadSig = p.handleSignature(true, true);
        }
        if (!hadSig && (smileFeatures & SmileParser.Feature.REQUIRE_HEADER.getMask()) != 0) {
            // Ok, first, let's see if it looks like plain JSON...
            String msg;
            // note: an empty document is reported with first byte of 0
            byte firstByte = hasInput ? _inputBuffer[_inputPtr] : 0;
            if (firstByte == '{' || firstByte == '[') {
                msg = "Input does not start with Smile format header (first byte = 0x"
                    +Integer.toHexString(firstByte & 0xFF)+") -- rather, it starts with '"+((char) firstByte)
                    +"' (plain JSON input?) -- can not parse";
            } else {
                msg = "Input does not start with Smile format header (first byte = 0x"
                    +Integer.toHexString(firstByte & 0xFF)+") and parser has REQUIRE_HEADER enabled: can not parse";
            }
            throw new JsonParseException(msg, JsonLocation.NA);
        }
        return p;
    }

    /*
    /**********************************************************
    /* Encoding detection for data format auto-detection
    /**********************************************************
     */

    /**
     * Helper method that inspects the first two or three bytes available via the
     * accessor and estimates whether the content is Smile-encoded.
     *
     * @param acc Accessor to use for reading the leading bytes
     *
     * @return {@code FULL_MATCH} if the 3-byte header is seen; {@code SOLID_MATCH} if
     *   leading bytes look like a headerless Smile value; {@code INCONCLUSIVE} if there
     *   are too few bytes to decide; {@code NO_MATCH} otherwise
     *
     * @throws IOException if the accessor fails to read input
     *
     * @since 1.8
     */
    public static MatchStrength hasSmileFormat(InputAccessor acc) throws IOException
    {
        // We always need at least two bytes to determine anything
        if (!acc.hasMoreBytes()) {
            return MatchStrength.INCONCLUSIVE;
        }
        final byte b1 = acc.nextByte();
        if (!acc.hasMoreBytes()) {
            return MatchStrength.INCONCLUSIVE;
        }
        final byte b2 = acc.nextByte();

        // First: do we see 3 "magic bytes"? If so, we are golden
        if (b1 == SmileConstants.HEADER_BYTE_1) { // yeah, looks like marker
            if (b2 != SmileConstants.HEADER_BYTE_2) {
                return MatchStrength.NO_MATCH;
            }
            if (!acc.hasMoreBytes()) {
                return MatchStrength.INCONCLUSIVE;
            }
            return (acc.nextByte() == SmileConstants.HEADER_BYTE_3) ?
                    MatchStrength.FULL_MATCH : MatchStrength.NO_MATCH;
        }

        /* Structural tokens are compared as unsigned values, masking both sides, so
         * that the checks hold regardless of whether the constants are declared
         * as (signed) bytes or as ints.
         */
        final int first = b1 & 0xFF;

        // Otherwise: ideally either Object or Array:
        if (first == (SmileConstants.TOKEN_LITERAL_START_OBJECT & 0xFF)) {
            /* Object is bit easier, because now we need to get new name; i.e. can
             * rule out name back-refs
             */
            if (b2 == SmileConstants.TOKEN_KEY_LONG_STRING) {
                return MatchStrength.SOLID_MATCH;
            }
            final int ch = b2 & 0xFF;
            return (ch >= 0x80 && ch < 0xF8) ? MatchStrength.SOLID_MATCH : MatchStrength.NO_MATCH;
        }
        // Array bit trickier
        if (first == (SmileConstants.TOKEN_LITERAL_START_ARRAY & 0xFF)) {
            if (!acc.hasMoreBytes()) {
                return MatchStrength.INCONCLUSIVE;
            }
            /* For arrays, we will actually accept much wider range of values (including
             * things that could otherwise collide)
             */
            if (likelySmileValue(b2) || possibleSmileValue(b2, true)) {
                return MatchStrength.SOLID_MATCH;
            }
            return MatchStrength.NO_MATCH;
        }
        /* Scalar values are pretty weak, albeit possible; require more certain match.
         * Both checks must look at the first byte, since that is the candidate value.
         */
        // NOTE(review): earlier comment said to "consider it weak", yet SOLID_MATCH
        // is what has been returned; kept as is -- confirm intended strength
        if (likelySmileValue(b1) || possibleSmileValue(b1, false)) {
            return MatchStrength.SOLID_MATCH;
        }
        return MatchStrength.NO_MATCH;
    }

    /**
     * Checks whether given byte is one of the value tokens considered a strong
     * indication of Smile content.
     *
     * @param b Candidate value token
     *
     * @return True for the specific tokens listed below and for bytes in range
     *   0x80 - 0x9F; false for everything else
     */
    private static boolean likelySmileValue(byte b)
    {
        final int ch = b & 0xFF;
        if (ch >= 0xE0) { // good range for known values
            // constants masked so that comparison works for byte- and int-typed ones alike;
            // others will not work (end object/array; reserved; shared strings)
            return (ch == (TOKEN_MISC_LONG_TEXT_ASCII & 0xFF)) // 0xE0
                || (ch == (TOKEN_MISC_LONG_TEXT_UNICODE & 0xFF)) // 0xE4
                || (ch == (TOKEN_MISC_BINARY_7BIT & 0xFF)) // 0xE8
                || (ch == (TOKEN_LITERAL_START_ARRAY & 0xFF)) // 0xF8
                || (ch == (TOKEN_LITERAL_START_OBJECT & 0xFF)); // 0xFA
        }
        // ASCII ctrl char range is pretty good match too
        return (ch >= 0x80 && ch <= 0x9F);
    }

    /**
     * Checks whether given byte could be a Smile value token, beyond the ones
     * accepted by {@link #likelySmileValue}.
     *
     * @param b Candidate value token
     * @param lenient Whether to consider more speculative matches or not
     *   (typically true when there is context like start-array)
     *
     * @return True if byte is in range 0x80 - 0xE0; or, in lenient mode, also if it
     *   is 0x40 or above, or in range 0x20 - 0x2B
     */
    private static boolean possibleSmileValue(byte b, boolean lenient)
    {
        final int ch = b & 0xFF;
        // note: we know that likely matches have been handled already, so...
        if (ch >= 0x80) {
            return (ch <= 0xE0);
        }
        if (lenient) {
            if (ch >= 0x40) { // tiny/short ASCII
                return true;
            }
            if (ch >= 0x20) { // various constants
                return (ch < 0x2C); // many reserved bytes that can't be seen
            }
        }
        return false;
    }

    /*
    /**********************************************************
    /* Internal methods, raw input access
    /**********************************************************
     */

    /**
     * Tries to make sure at least the given number of unread bytes are buffered,
     * reading more from the underlying stream if there is one.
     *
     * @param minimum Number of unread bytes needed in the buffer
     *
     * @return True if at least {@code minimum} bytes are available; false if not,
     *   and no more could be read (no stream, or a read produced no bytes)
     *
     * @throws IOException if reading from the underlying stream fails
     */
    protected boolean ensureLoaded(int minimum)
        throws IOException
    {
        /* Let's assume here buffer has enough room -- this will always
         * be true for the limited used this method gets
         */
        int gotten = (_inputEnd - _inputPtr);
        while (gotten < minimum) {
            if (_in == null) { // block source; nothing more to load
                return false;
            }
            int count = _in.read(_inputBuffer, _inputEnd, _inputBuffer.length - _inputEnd);
            if (count < 1) {
                return false;
            }
            _inputEnd += count;
            gotten += count;
        }
        return true;
    }
}