More cleanup, refactoring, and additions to unit test coverage.
diff --git a/src/perf/BaseReader.java b/src/perf/BaseReader.java new file mode 100644 index 0000000..7831789 --- /dev/null +++ b/src/perf/BaseReader.java
@@ -0,0 +1,117 @@ +//package org.codehaus.jackson.io; + +import java.io.*; + +import org.codehaus.jackson.io.IOContext; + +/** + * Simple basic class for optimized readers in this package; implements + * "cookie-cutter" methods that are used by all actual implementations. + */ +abstract class BaseReader + extends Reader +{ + /** + * JSON actually limits available Unicode range in the high end + * to the same as xml (to basically limit UTF-8 max byte sequence + * length to 4) + */ + final protected static int LAST_VALID_UNICODE_CHAR = 0x10FFFF; + + final protected static char NULL_CHAR = (char) 0; + final protected static char NULL_BYTE = (byte) 0; + + final protected IOContext mContext; + + protected InputStream mIn; + + protected byte[] mBuffer; + + protected int mPtr; + protected int mLength; + + /* + //////////////////////////////////////// + // Life-cycle + //////////////////////////////////////// + */ + + protected BaseReader(IOContext context, + InputStream in, byte[] buf, int ptr, int len) + { + mContext = context; + mIn = in; + mBuffer = buf; + mPtr = ptr; + mLength = len; + } + + /* + //////////////////////////////////////// + // Reader API + //////////////////////////////////////// + */ + + public void close() + throws IOException + { + InputStream in = mIn; + + if (in != null) { + mIn = null; + freeBuffers(); + in.close(); + } + } + + char[] mTmpBuf = null; + + /** + * Although this method is implemented by the base class, AND it should + * never be called by main code, let's still implement it bit more + * efficiently just in case + */ + public int read() + throws IOException + { + if (mTmpBuf == null) { + mTmpBuf = new char[1]; + } + if (read(mTmpBuf, 0, 1) < 1) { + return -1; + } + return mTmpBuf[0]; + } + + /* + //////////////////////////////////////// + // Internal/package methods: + //////////////////////////////////////// + */ + + /** + * This method should be called along with (or instead of) normal + * close. 
After calling this method, no further reads should be tried. + * Method will try to recycle read buffers (if any). + */ + public final void freeBuffers() + { + byte[] buf = mBuffer; + if (buf != null) { + mBuffer = null; + mContext.releaseReadIOBuffer(buf); + } + } + + protected void reportBounds(char[] cbuf, int start, int len) + throws IOException + { + throw new ArrayIndexOutOfBoundsException("read(buf,"+start+","+len+"), cbuf["+cbuf.length+"]"); + } + + protected void reportStrangeStream() + throws IOException + { + throw new IOException("Strange I/O stream, returned 0 bytes on read"); + } +}
diff --git a/src/perf/TestJsonPerf.java b/src/perf/TestJsonPerf.java index b28aeda..22e2e2f 100644 --- a/src/perf/TestJsonPerf.java +++ b/src/perf/TestJsonPerf.java
@@ -2,7 +2,6 @@ import org.codehaus.jackson.*; import org.codehaus.jackson.io.IOContext; -import org.codehaus.jackson.io.UTF8Reader; import org.codehaus.jackson.map.JavaTypeMapper; import org.codehaus.jackson.map.JsonTypeMapper; import org.codehaus.jackson.util.BufferRecycler; @@ -175,7 +174,7 @@ char[] cbuf = new char[mData.length]; - IOContext ctxt = new IOContext(new BufferRecycler(), this); + IOContext ctxt = new IOContext(new BufferRecycler(), this, false); int sum = 0; for (int i = 0; i < reps; ++i) {
diff --git a/src/perf/UTF8Reader.java b/src/perf/UTF8Reader.java new file mode 100644 index 0000000..c69f937 --- /dev/null +++ b/src/perf/UTF8Reader.java
//package org.codehaus.jackson.io;

import java.io.*;

import org.codehaus.jackson.io.IOContext;

/**
 * Optimized Reader that reads UTF-8 encoded content from an input stream.
 * In addition to doing (hopefully) optimal conversion, it can also take
 * array of "pre-read" (leftover) bytes; this is necessary when preliminary
 * stream/reader is trying to figure out character encoding.
 */
public final class UTF8Reader
    extends BaseReader
{
    /**
     * Second (low) half of a decoded surrogate pair that did not fit in
     * the caller's buffer on the previous read; emitted first on the
     * next call. NULL_CHAR means "none pending".
     */
    char mSurrogate = NULL_CHAR;

    /**
     * Total read character count; used for error reporting purposes
     */
    int mCharCount = 0;

    /**
     * Total read byte count; used for error reporting purposes
     */
    int mByteCount = 0;

    /*
    ////////////////////////////////////////
    // Life-cycle
    ////////////////////////////////////////
    */

    public UTF8Reader(IOContext ctxt,
                      InputStream in, byte[] buf, int ptr, int len)
    {
        super(ctxt, in, buf, ptr, len);
    }

    /*
    ////////////////////////////////////////
    // Public API
    ////////////////////////////////////////
    */

    /**
     * Decodes up to {@code len} chars of UTF-8 content into {@code cbuf},
     * blocking at most once (and only when not even a single complete
     * character is already buffered).
     *
     * @return Number of chars decoded, or -1 at end of input
     */
    public int read(char[] cbuf, int start, int len)
        throws IOException
    {
        // Already EOF? (buffer is released on EOF/close)
        if (mBuffer == null) {
            return -1;
        }
        if (len < 1) {
            return len;
        }
        // Let's then ensure there's enough room...
        if (start < 0 || (start+len) > cbuf.length) {
            reportBounds(cbuf, start, len);
        }

        // From here on, 'len' is the end offset (exclusive), not a count:
        len += start;
        int outPtr = start;

        // Ok, first; do we have a surrogate from last round?
        if (mSurrogate != NULL_CHAR) {
            cbuf[outPtr++] = mSurrogate;
            mSurrogate = NULL_CHAR;
            // No need to load more, already got one char
            /* NOTE(review): this path falls through to the main loop
             * without verifying that mPtr < mLength; if the input buffer
             * was fully consumed when the surrogate was stashed, the loop
             * below may read past the valid bytes — confirm the invariant.
             */
        } else {
            /* To prevent unnecessary blocking (esp. with network streams),
             * we'll only require decoding of a single char
             */
            int left = (mLength - mPtr);

            /* So; only need to load more if we can't provide at least
             * one more character. We need not do thorough check here,
             * but let's check the common cases here: either completely
             * empty buffer (left == 0), or one with less than max. byte
             * count for a single char, and starting of a multi-byte
             * encoding (this leaves possibility of a 2/3-byte char
             * that is still fully accessible... but that can be checked
             * by the load method)
             */
            if (left < 4) {
                // Need to load more?
                if (left < 1 || mBuffer[mPtr] < 0) {
                    if (!loadMore(left)) { // (legal) EOF?
                        return -1;
                    }
                }
            }
        }

        /* This may look silly, but using a local var is indeed faster
         * (if and when HotSpot properly gets things running) than
         * member variable...
         */
        byte[] buf = mBuffer;
        int inPtr = mPtr;
        int inBufLen = mLength;

        main_loop:
        while (outPtr < len) {
            // At this point we have at least one byte available
            int c = (int) buf[inPtr++];

            /* Let's first do the quickie loop for common case; 7-bit
             * ascii:
             */
            if (c >= 0) { // ascii? can probably loop, then
                cbuf[outPtr++] = (char) c; // ok since MSB is never on

                /* Ok, how many such chars could we safely process
                 * without overruns? (will combine 2 in-loop comparisons
                 * into just one)
                 */
                int outMax = (len - outPtr); // max output
                int inMax = (inBufLen - inPtr); // max input
                int inEnd = inPtr + ((inMax < outMax) ? inMax : outMax);

                ascii_loop:
                while (true) {
                    if (inPtr >= inEnd) {
                        break main_loop;
                    }
                    c = (int) buf[inPtr++];
                    if (c < 0) { // multi-byte
                        break ascii_loop;
                    }
                    cbuf[outPtr++] = (char) c;
                }
            }

            int needed; // number of continuation bytes still required

            // Ok; if we end here, we got multi-byte combination
            if ((c & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF)
                c = (c & 0x1F);
                needed = 1;
            } else if ((c & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF)
                c = (c & 0x0F);
                needed = 2;
            } else if ((c & 0xF8) == 0xF0) {
                // 4 bytes; double-char BS, with surrogates and all...
                // (0x0F mask is equivalent to 0x07 here, since the
                // 0xF8-match guarantees bit 3 is clear)
                c = (c & 0x0F);
                needed = 3;
            } else {
                reportInvalidInitial(c & 0xFF, outPtr-start);
                // never gets here... but compiler needs the assignment:
                needed = 1;
            }
            /* Do we have enough bytes? If not, let's just push back the
             * byte and leave, since we have already gotten at least one
             * char decoded. This way we will only block (with read from
             * input stream) when absolutely necessary.
             */
            if ((inBufLen - inPtr) < needed) {
                --inPtr;
                break main_loop;
            }

            int d = (int) buf[inPtr++]; // 2nd byte
            if ((d & 0xC0) != 0x080) { // must be a 10xxxxxx continuation
                reportInvalidOther(d & 0xFF, outPtr-start);
            }
            c = (c << 6) | (d & 0x3F);

            if (needed > 1) { // needed == 1 means 2 bytes total
                d = buf[inPtr++]; // 3rd byte
                if ((d & 0xC0) != 0x080) {
                    reportInvalidOther(d & 0xFF, outPtr-start);
                }
                c = (c << 6) | (d & 0x3F);
                if (needed > 2) { // 4 bytes? (need surrogates)
                    d = buf[inPtr++];
                    if ((d & 0xC0) != 0x080) {
                        reportInvalidOther(d & 0xFF, outPtr-start);
                    }
                    c = (c << 6) | (d & 0x3F);
                    if (c > LAST_VALID_UNICODE_CHAR) {
                        reportInvalid(c, outPtr-start,
                                      "(above "+Integer.toHexString(LAST_VALID_UNICODE_CHAR));
                    }
                    /* Ugh. Need to mess with surrogates. Ok; let's inline them
                     * there, then, if there's room: if only room for one,
                     * need to save the surrogate for the rainy day...
                     */
                    c -= 0x10000; // to normalize it starting with 0x0
                    cbuf[outPtr++] = (char) (0xD800 + (c >> 10));
                    // hmmh. can this ever be 0? (not legal, at least?)
                    c = (0xDC00 | (c & 0x03FF));

                    // Room for second part?
                    if (outPtr >= len) { // nope
                        mSurrogate = (char) c;
                        break main_loop;
                    }
                    // sure, let's fall back to normal processing:
                }

                /* 08-Jun-2007, TSa: Not sure if it's really legal
                 * to get surrogate chars here: JSON specs do not
                 * prevent them, which is different from xml. So
                 * for now let's not worry about them. If checks
                 * are needed, can uncomment following:
                 */

                /*
                else {
                    // Otherwise, need to check that 3-byte chars are
                    // legal ones (should not expand to surrogates)
                    if (c >= 0xD800) {
                        // But first, let's check max chars:
                        if (c < 0xE000) {
                            reportInvalid(c, outPtr-start, "(a surrogate character) ");
                        }
                    }
                }
                */
            }
            cbuf[outPtr++] = (char) c;
            if (inPtr >= inBufLen) {
                break main_loop;
            }
        }

        // Write back the local pointer, compute the char count returned:
        mPtr = inPtr;
        len = outPtr - start;
        mCharCount += len;
        return len;
    }

    /*
    ////////////////////////////////////////
    // Internal methods
    ////////////////////////////////////////
    */

    /**
     * Reports an illegal UTF-8 start byte (one that matches no valid
     * 1/2/3/4-byte lead pattern).
     */
    private void reportInvalidInitial(int mask, int offset)
        throws IOException
    {
        // input (byte) ptr has been advanced by one, by now:
        int bytePos = mByteCount + mPtr - 1;
        int charPos = mCharCount + offset + 1;

        throw new CharConversionException("Invalid UTF-8 start byte 0x"
                                          +Integer.toHexString(mask)
                                          +" (at char #"+charPos+", byte #"+bytePos+")");
    }

    /**
     * Reports a byte that should have been a 10xxxxxx continuation
     * byte of a multi-byte sequence but was not.
     */
    private void reportInvalidOther(int mask, int offset)
        throws IOException
    {
        int bytePos = mByteCount + mPtr - 1;
        int charPos = mCharCount + offset;

        throw new CharConversionException("Invalid UTF-8 middle byte 0x"
                                          +Integer.toHexString(mask)
                                          +" (at char #"+charPos+", byte #"+bytePos+")");
    }

    /**
     * Reports EOF encountered in the middle of a multi-byte character.
     */
    private void reportUnexpectedEOF(int gotBytes, int needed)
        throws IOException
    {
        int bytePos = mByteCount + gotBytes;
        int charPos = mCharCount;

        throw new CharConversionException("Unexpected EOF in the middle of a multi-byte char: got "
                                          +gotBytes+", needed "+needed
                                          +", at char #"+charPos+", byte #"+bytePos+")");
    }

    /**
     * Reports a decoded code point that is outside the legal Unicode
     * range (or otherwise invalid, as described by {@code msg}).
     */
    private void reportInvalid(int value, int offset, String msg)
        throws IOException
    {
        int bytePos = mByteCount + mPtr - 1;
        int charPos = mCharCount + offset;

        throw new CharConversionException("Invalid UTF-8 character 0x"
                                          +Integer.toHexString(value)+msg
                                          +" at char #"+charPos+", byte #"+bytePos+")");
    }

    /**
     * Loads enough bytes into the buffer (compacting any leftovers to
     * the start first) that at least one complete character can be
     * decoded; blocks on the underlying stream as needed.
     *
     * @param available Number of "unused" bytes in the input buffer
     *
     * @return True, if enough bytes were read to allow decoding of at least
     *   one full character; false if EOF was encountered instead.
     */
    private boolean loadMore(int available)
        throws IOException
    {
        // Everything before the leftover bytes has been consumed:
        mByteCount += (mLength - available);

        // Bytes that need to be moved to the beginning of buffer?
        if (available > 0) {
            if (mPtr > 0) {
                for (int i = 0; i < available; ++i) {
                    mBuffer[i] = mBuffer[mPtr+i];
                }
                mPtr = 0;
            }
            mLength = available;
        } else {
            /* Ok; here we can actually reasonably expect an EOF,
             * so let's do a separate read right away:
             */
            mPtr = 0;
            int count = mIn.read(mBuffer);
            if (count < 1) {
                mLength = 0;
                if (count < 0) { // -1
                    freeBuffers(); // to help GC?
                    return false;
                }
                // 0 count is no good; let's err out
                reportStrangeStream();
            }
            mLength = count;
        }

        /* We now have at least one byte... and that allows us to
         * calculate exactly how many bytes we need!
         */
        int c = (int) mBuffer[0];
        if (c >= 0) { // single byte (ascii) char... cool, can return
            return true;
        }

        // Ok, a multi-byte char, let's check how many bytes we'll need:
        int needed;
        if ((c & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF)
            needed = 2;
        } else if ((c & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF)
            needed = 3;
        } else if ((c & 0xF8) == 0xF0) {
            // 4 bytes; double-char BS, with surrogates and all...
            needed = 4;
        } else {
            reportInvalidInitial(c & 0xFF, 0);
            // never gets here... but compiler whines without this:
            needed = 1;
        }

        /* And then we'll just need to load up to that many bytes;
         * if an EOF is hit, that'll be an error. But we need not do
         * actual decoding here, just load enough bytes.
         */
        while (mLength < needed) {
            int count = mIn.read(mBuffer, mLength, mBuffer.length - mLength);
            if (count < 1) {
                if (count < 0) { // -1, EOF... no good!
                    freeBuffers();
                    reportUnexpectedEOF(mLength, needed);
                }
                // 0 count is no good; let's err out
                reportStrangeStream();
            }
            mLength += count;
        }
        return true;
    }
}