// blob: b0ed0f02edca19a839d5d2f75facaa2e66b3612d [file] [log] [blame]
package org.codehaus.jackson.io;
import java.lang.ref.SoftReference;
import org.codehaus.jackson.util.BufferRecycler;
import org.codehaus.jackson.util.ByteArrayBuilder;
import org.codehaus.jackson.util.CharTypes;
import org.codehaus.jackson.util.TextBuffer;
/**
* Helper class used for efficient encoding of JSON String values (including
* JSON field names) into Strings or UTF-8 byte arrays.
*<p>
* Note that methods in here are somewhat optimized, but not ridiculously so.
* Reason is that conversion method results are expected to be cached so that
* these methods will not be hot spots during normal operation.
*
* @since 1.6
*/
public final class JsonStringEncoder
{
// Hexadecimal digit lookup tables (as chars and as bytes); indexed below with 4-bit values
private final static char[] HEX_CHARS = CharTypes.copyHexChars();
private final static byte[] HEX_BYTES = CharTypes.copyHexBytes();
// Inclusive bounds of the range accepted as the first part of a surrogate pair
private final static int SURR1_FIRST = 0xD800;
private final static int SURR1_LAST = 0xDBFF;
// Inclusive bounds of the range accepted as the second part of a surrogate pair
private final static int SURR2_FIRST = 0xDC00;
private final static int SURR2_LAST = 0xDFFF;
// Characters appended (as int values) when composing escape sequences for byte output
private final static int INT_BACKSLASH = '\\';
private final static int INT_U = 'u';
private final static int INT_0 = '0';
/**
* This <code>ThreadLocal</code> contains a {@link java.lang.ref.SoftReference}
* to a {@link JsonStringEncoder}; it is read and populated by
* {@link #getInstance} so that each thread can reuse its own encoder instance.
*/
final protected static ThreadLocal<SoftReference<JsonStringEncoder>> _threadEncoder
= new ThreadLocal<SoftReference<JsonStringEncoder>>();
/**
* Lazily constructed text buffer used to produce JSON encoded Strings
* as characters (without UTF-8 encoding)
*/
protected TextBuffer _textBuffer;
/**
* Lazily-constructed builder used for UTF-8 encoding of text values
* (quoted and unquoted)
*/
protected ByteArrayBuilder _byteBuilder;
/**
* Temporary buffer used for composing quote/escape sequences;
* always 6 chars long, with the leading backslash and the two
* leading zero digits of a 6-char escape filled in at construction.
*/
protected final char[] _quoteBuffer;
/*
/**********************************************************
/* Construction, instance access
/**********************************************************
*/
public JsonStringEncoder()
{
_quoteBuffer = new char[6];
_quoteBuffer[0] = '\\';
_quoteBuffer[2] = '0';
_quoteBuffer[3] = '0';
}
/**
 * Returns the encoder associated with the calling thread: the instance
 * still reachable through the thread-local soft reference if there is one,
 * otherwise a newly constructed encoder that is registered for later calls.
 *
 * @return encoder instance for the current thread; never null
 */
public static JsonStringEncoder getInstance()
{
    final SoftReference<JsonStringEncoder> cached = _threadEncoder.get();
    if (cached != null) {
        final JsonStringEncoder existing = cached.get();
        if (existing != null) {
            return existing;
        }
    }
    // Nothing cached (or the referent was cleared): create and remember a new one
    final JsonStringEncoder fresh = new JsonStringEncoder();
    _threadEncoder.set(new SoftReference<JsonStringEncoder>(fresh));
    return fresh;
}
/*
/**********************************************************
/* Public API
/**********************************************************
*/
/**
 * Quotes the contents of given text using JSON standard quoting and
 * returns the result as a character array. Characters that have an entry
 * in the 7-bit output escape table are replaced by their 2-char or 6-char
 * escape sequence; all other characters are copied through unchanged.
 *
 * @param input text to quote
 * @return quoted contents, as collected by the internal text buffer
 */
public char[] quoteAsString(String input)
{
    TextBuffer buffer = _textBuffer;
    if (buffer == null) {
        // no allocator; can add if we must, shouldn't need to
        buffer = new TextBuffer(null);
        _textBuffer = buffer;
    }
    char[] segment = buffer.emptyAndGetCurrentSegment();
    final int[] escapeTable = CharTypes.get7BitOutputEscapes();
    final int escapeTableSize = escapeTable.length;
    final int total = input.length();
    int src = 0;
    int dst = 0;

    while (src < total) {
        final char ch = input.charAt(src++);
        final int escCode = (ch < escapeTableSize) ? escapeTable[ch] : 0;

        if (escCode == 0) {
            // Plain character: copy as-is, moving to a new segment when full
            if (dst >= segment.length) {
                segment = buffer.finishCurrentSegment();
                dst = 0;
            }
            segment[dst++] = ch;
            continue;
        }

        // Something to escape; compose the 2 or 6-char variant in the scratch buffer
        final int escLength = _appendSingleEscape(escCode, _quoteBuffer);
        final int room = segment.length - dst;
        if (escLength <= room) {
            System.arraycopy(_quoteBuffer, 0, segment, dst, escLength);
            dst += escLength;
        } else {
            // Escape straddles a segment boundary: split it across the two segments
            if (room > 0) {
                System.arraycopy(_quoteBuffer, 0, segment, dst, room);
            }
            segment = buffer.finishCurrentSegment();
            final int remainder = escLength - room;
            System.arraycopy(_quoteBuffer, room, segment, 0, remainder);
            dst = remainder;
        }
    }
    buffer.setCurrentLength(dst);
    return buffer.contentsAsArray();
}
/**
 * Will quote given JSON String value using standard quoting, encode
 * results as UTF-8, and return result as a byte array.
 *<p>
 * 7-bit characters with an entry in the output escape table are written
 * as escape sequences; other 7-bit characters are written as single bytes;
 * everything else is written as a 2, 3 or 4 byte UTF-8 sequence.
 *
 * @param text value to quote and encode
 * @return quoted, UTF-8 encoded contents
 * @throws IllegalArgumentException if text contains a surrogate character
 *   that is not part of a valid (first part, second part) pair
 */
public byte[] quoteAsUTF8(String text)
{
    ByteArrayBuilder byteBuilder = _byteBuilder;
    if (byteBuilder == null) {
        // no allocator; can add if we must, shouldn't need to
        _byteBuilder = byteBuilder = new ByteArrayBuilder(null);
    }
    int inputPtr = 0;
    final int inputEnd = text.length();
    int outputPtr = 0;
    byte[] outputBuffer = byteBuilder.resetAndGetFirstSegment();
    // Escape table does not depend on loop state: look it up once, as quoteAsString does
    final int[] escCodes = CharTypes.get7BitOutputEscapes();

    main_loop:
    while (inputPtr < inputEnd) {
        inner_loop: // ascii that needs no escaping
        while (true) {
            int ch = text.charAt(inputPtr);
            if (ch > 0x7F || escCodes[ch] != 0) {
                break inner_loop;
            }
            if (outputPtr >= outputBuffer.length) {
                outputBuffer = byteBuilder.finishCurrentSegment();
                outputPtr = 0;
            }
            outputBuffer[outputPtr++] = (byte) ch;
            if (++inputPtr >= inputEnd) {
                break main_loop;
            }
        }
        // Make sure there is room for at least one more byte
        if (outputPtr >= outputBuffer.length) {
            outputBuffer = byteBuilder.finishCurrentSegment();
            outputPtr = 0;
        }
        // Ok, so what did we hit?
        int ch = text.charAt(inputPtr++);
        if (ch <= 0x7F) { // needs quoting
            int escape = escCodes[ch];
            // Escape is appended through the builder, which may switch segments;
            // re-read both the write position and the current segment afterwards
            outputPtr = _appendByteEscape(ch, escape, byteBuilder, outputPtr);
            outputBuffer = byteBuilder.getCurrentSegment();
            continue main_loop;
        } else if (ch <= 0x7FF) { // fine, just needs 2 byte output
            outputBuffer[outputPtr++] = (byte) (0xc0 | (ch >> 6));
            ch = (0x80 | (ch & 0x3f));
        } else { // 3 or 4 bytes
            // Surrogates?
            if (ch < SURR1_FIRST || ch > SURR2_LAST) { // nope
                outputBuffer[outputPtr++] = (byte) (0xe0 | (ch >> 12));
                if (outputPtr >= outputBuffer.length) {
                    outputBuffer = byteBuilder.finishCurrentSegment();
                    outputPtr = 0;
                }
                outputBuffer[outputPtr++] = (byte) (0x80 | ((ch >> 6) & 0x3f));
                ch = (0x80 | (ch & 0x3f));
            } else { // yes, surrogate pair
                if (ch > SURR1_LAST) { // must be from first range
                    _throwIllegalSurrogate(ch);
                }
                // and if so, followed by another from next range
                if (inputPtr >= inputEnd) {
                    _throwIllegalSurrogate(ch);
                }
                ch = _convertSurrogate(ch, text.charAt(inputPtr++));
                if (ch > 0x10FFFF) { // illegal, as per RFC 4627
                    _throwIllegalSurrogate(ch);
                }
                outputBuffer[outputPtr++] = (byte) (0xf0 | (ch >> 18));
                if (outputPtr >= outputBuffer.length) {
                    outputBuffer = byteBuilder.finishCurrentSegment();
                    outputPtr = 0;
                }
                outputBuffer[outputPtr++] = (byte) (0x80 | ((ch >> 12) & 0x3f));
                if (outputPtr >= outputBuffer.length) {
                    outputBuffer = byteBuilder.finishCurrentSegment();
                    outputPtr = 0;
                }
                outputBuffer[outputPtr++] = (byte) (0x80 | ((ch >> 6) & 0x3f));
                ch = (0x80 | (ch & 0x3f));
            }
        }
        // In all multi-byte cases 'ch' now holds the final continuation byte
        if (outputPtr >= outputBuffer.length) {
            outputBuffer = byteBuilder.finishCurrentSegment();
            outputPtr = 0;
        }
        outputBuffer[outputPtr++] = (byte) ch;
    }
    return byteBuilder.completeAndCoalesce(outputPtr);
}
/**
 * Encodes given String as UTF-8, without applying any quoting, and
 * returns the resulting bytes.
 *
 * @param text value to encode
 * @return UTF-8 encoded contents
 * @throws IllegalArgumentException if text contains a surrogate character
 *   that is not part of a valid (first part, second part) pair
 */
public byte[] encodeAsUTF8(String text)
{
    ByteArrayBuilder builder = _byteBuilder;
    if (builder == null) {
        // no allocator; can add if we must, shouldn't need to
        builder = new ByteArrayBuilder(null);
        _byteBuilder = builder;
    }
    int src = 0;
    final int srcEnd = text.length();
    int dst = 0;
    byte[] segment = builder.resetAndGetFirstSegment();
    int segmentEnd = segment.length;

    while (src < srcEnd) {
        int code = text.charAt(src++);

        // Room for the first (or only) byte of this character
        if (dst >= segmentEnd) {
            segment = builder.finishCurrentSegment();
            segmentEnd = segment.length;
            dst = 0;
        }
        if (code <= 0x7F) { // ascii: single byte, done
            segment[dst++] = (byte) code;
            continue;
        }

        // Multi-byte: write everything except the last continuation byte
        if (code < 0x800) { // 2-byte
            segment[dst++] = (byte) (0xc0 | (code >> 6));
        } else if (code < SURR1_FIRST || code > SURR2_LAST) { // 3-byte, no surrogates
            segment[dst++] = (byte) (0xe0 | (code >> 12));
            if (dst >= segmentEnd) {
                segment = builder.finishCurrentSegment();
                segmentEnd = segment.length;
                dst = 0;
            }
            segment[dst++] = (byte) (0x80 | ((code >> 6) & 0x3f));
        } else { // surrogate pair, 4-byte
            // Must start with a char from the first range, and have a successor
            if (code > SURR1_LAST || src >= srcEnd) {
                _throwIllegalSurrogate(code);
            }
            code = _convertSurrogate(code, text.charAt(src++));
            if (code > 0x10FFFF) { // illegal, as per RFC 4627
                _throwIllegalSurrogate(code);
            }
            segment[dst++] = (byte) (0xf0 | (code >> 18));
            if (dst >= segmentEnd) {
                segment = builder.finishCurrentSegment();
                segmentEnd = segment.length;
                dst = 0;
            }
            segment[dst++] = (byte) (0x80 | ((code >> 12) & 0x3f));
            if (dst >= segmentEnd) {
                segment = builder.finishCurrentSegment();
                segmentEnd = segment.length;
                dst = 0;
            }
            segment[dst++] = (byte) (0x80 | ((code >> 6) & 0x3f));
        }

        // Last continuation byte, shared by all multi-byte forms
        if (dst >= segmentEnd) {
            segment = builder.finishCurrentSegment();
            segmentEnd = segment.length;
            dst = 0;
        }
        segment[dst++] = (byte) (0x80 | (code & 0x3f));
    }
    return builder.completeAndCoalesce(dst);
}
/*
/**********************************************************
/* Internal methods
/**********************************************************
*/
/**
 * Composes a single escape sequence into given scratch buffer, whose
 * first char is expected to already hold the backslash (and, for the long
 * form, positions 2 and 3 the leading zero digits).
 *
 * @param escCode non-negative: the char to put after the backslash;
 *   negative: marker for a control char, encoded as -(char + 1)
 * @param quoteBuffer scratch buffer to fill in
 * @return number of chars of the buffer that make up the escape (2 or 6)
 */
private int _appendSingleEscape(int escCode, char[] quoteBuffer)
{
    if (escCode >= 0) {
        // short form: backslash followed by a single char
        quoteBuffer[1] = (char) escCode;
        return 2;
    }
    // control char: recover its value from the negative marker
    final int charValue = -(escCode + 1);
    quoteBuffer[1] = 'u';
    // We know it's a control char, so only the last 2 hex digits are non-0
    final int highNibble = charValue >> 4;
    final int lowNibble = charValue & 0xF;
    quoteBuffer[4] = HEX_CHARS[highNibble];
    quoteBuffer[5] = HEX_CHARS[lowNibble];
    return 6;
}
/**
 * Appends the escape sequence for given character to the builder, after
 * first telling the builder how much of its current segment is in use.
 *
 * @param ch character being escaped
 * @param escCode negative for the 6-byte backslash-u form; otherwise
 *   the byte to append right after the backslash
 * @param byteBuilder builder to append to
 * @param ptr number of bytes already written into the builder's current segment
 * @return the builder's current segment length after the escape was appended
 */
private int _appendByteEscape(int ch, int escCode, ByteArrayBuilder byteBuilder, int ptr)
{
    byteBuilder.setCurrentSegmentLength(ptr);
    byteBuilder.append(INT_BACKSLASH);

    if (escCode >= 0) { // 2-char simple escape
        byteBuilder.append((byte) escCode);
        return byteBuilder.getCurrentSegmentLength();
    }

    // standard escape: 'u' plus four hex digits
    byteBuilder.append(INT_U);
    int lowByte = ch;
    if (ch > 0xFF) {
        final int highByte = ch >> 8;
        byteBuilder.append(HEX_BYTES[highByte >> 4]);
        byteBuilder.append(HEX_BYTES[highByte & 0xF]);
        lowByte = ch & 0xFF;
    } else {
        // upper two digits are zeroes
        byteBuilder.append(INT_0);
        byteBuilder.append(INT_0);
    }
    byteBuilder.append(HEX_BYTES[lowByte >> 4]);
    byteBuilder.append(HEX_BYTES[lowByte & 0xF]);
    return byteBuilder.getCurrentSegmentLength();
}
/**
 * Combines the two halves of a surrogate pair into a single code point.
 *
 * @param firstPart first char of the pair; not validated here
 * @param secondPart second char of the pair; must be within the second surrogate range
 * @return code point the pair represents
 * @throws IllegalArgumentException if secondPart is outside the second surrogate range
 */
private int _convertSurrogate(int firstPart, int secondPart)
{
    final boolean secondIsValid = (secondPart >= SURR2_FIRST) && (secondPart <= SURR2_LAST);
    if (!secondIsValid) {
        throw new IllegalArgumentException("Broken surrogate pair: first char 0x"+Integer.toHexString(firstPart)+", second 0x"+Integer.toHexString(secondPart)+"; illegal combination");
    }
    // Each half contributes 10 bits, on top of the 0x10000 base
    final int upperBits = (firstPart - SURR1_FIRST) << 10;
    final int lowerBits = secondPart - SURR2_FIRST;
    return 0x10000 + upperBits + lowerBits;
}
/**
 * Reports a character that can not be output. Always throws; the message
 * depends on whether the value is above the maximum code point, an unmatched
 * first or second surrogate, or something else.
 *
 * @param code offending character or code point
 * @throws IllegalArgumentException always
 */
private void _throwIllegalSurrogate(int code)
{
    final String hex = Integer.toHexString(code);
    final String message;
    if (code > 0x10FFFF) { // over max?
        message = "Illegal character point (0x"+hex+") to output; max is 0x10FFFF as per RFC 4627";
    } else if (code < SURR1_FIRST) {
        // should we ever get this?
        message = "Illegal character point (0x"+hex+") to output";
    } else if (code <= SURR1_LAST) { // Unmatched first part (closing without second part?)
        message = "Unmatched first part of surrogate pair (0x"+hex+")";
    } else {
        message = "Unmatched second part of surrogate pair (0x"+hex+")";
    }
    throw new IllegalArgumentException(message);
}
}