Smile: add symbol table handling for field names (similar to the standard UTF-8 JSON parser); Smile decoding is now about 25% faster than textual JSON (roughly the same as the size reduction)
diff --git a/profile.sh b/profile.sh index 16f590b..9c014c1 100755 --- a/profile.sh +++ b/profile.sh
@@ -2,6 +2,7 @@ java -Xmx16m -server \ -XX:CompileThreshold=2000 \ - -cp build/classes/core:build/classes/mapper:build/classes/extra:build/classes/perf\ + -cp build/classes/core:build/classes/mapper:build/classes/extra\ +:build/classes/smile:build/classes/perf\ -Xrunhprof:cpu=samples,depth=10,verbose=n,interval=2 \ $*
diff --git a/src/java/org/codehaus/jackson/sym/BytesToNameCanonicalizer.java b/src/java/org/codehaus/jackson/sym/BytesToNameCanonicalizer.java index dbbad75..2e8e215 100644 --- a/src/java/org/codehaus/jackson/sym/BytesToNameCanonicalizer.java +++ b/src/java/org/codehaus/jackson/sym/BytesToNameCanonicalizer.java
@@ -477,6 +477,20 @@ /********************************************************** */ + /** + * @since 1.6.0 + */ + public Name addName(String symbolStr, int q1, int q2) + { + if (_intern) { + symbolStr = InternCache.instance.intern(symbolStr); + } + int hash = (q2 == 0) ? calcHash(q1) : calcHash(q1, q2); + Name symbol = constructName(hash, symbolStr, q1, q2); + _addSymbol(hash, symbol); + return symbol; + } + public Name addName(String symbolStr, int[] quads, int qlen) { if (_intern) { @@ -487,7 +501,7 @@ _addSymbol(hash, symbol); return symbol; } - + /* /********************************************************** /* Helper methods @@ -732,9 +746,9 @@ Bucket[] oldBuckets = _collList; _collList = new Bucket[oldBuckets.length]; for (int i = 0; i < oldEnd; ++i) { - for (Bucket curr = oldBuckets[i]; curr != null; curr = curr.mNext) { + for (Bucket curr = oldBuckets[i]; curr != null; curr = curr._next) { ++symbolsSeen; - Name symbol = curr.mName; + Name symbol = curr._name; int hash = symbol.hashCode(); int ix = (hash & _mainHashMask); int val = _mainHash[ix]; @@ -862,7 +876,6 @@ /********************************************************** */ - /* private static Name constructName(int hash, String name, int q1, int q2) { if (q2 == 0) { // one quad only? 
@@ -870,7 +883,6 @@ } return new Name2(name, hash, q1, q2); } - */ private static Name constructName(int hash, String name, int[] quads, int qlen) { @@ -901,19 +913,19 @@ final static class Bucket { - final Name mName; - final Bucket mNext; + protected final Name _name; + protected final Bucket _next; Bucket(Name name, Bucket next) { - mName = name; - mNext = next; + _name = name; + _next = next; } public int length() { int len = 1; - for (Bucket curr = mNext; curr != null; curr = curr.mNext) { + for (Bucket curr = _next; curr != null; curr = curr._next) { ++len; } return len; @@ -921,13 +933,13 @@ public Name find(int hash, int firstQuad, int secondQuad) { - if (mName.hashCode() == hash) { - if (mName.equals(firstQuad, secondQuad)) { - return mName; + if (_name.hashCode() == hash) { + if (_name.equals(firstQuad, secondQuad)) { + return _name; } } - for (Bucket curr = mNext; curr != null; curr = curr.mNext) { - Name currName = curr.mName; + for (Bucket curr = _next; curr != null; curr = curr._next) { + Name currName = curr._name; if (currName.hashCode() == hash) { if (currName.equals(firstQuad, secondQuad)) { return currName; @@ -939,13 +951,13 @@ public Name find(int hash, int[] quads, int qlen) { - if (mName.hashCode() == hash) { - if (mName.equals(quads, qlen)) { - return mName; + if (_name.hashCode() == hash) { + if (_name.equals(quads, qlen)) { + return _name; } } - for (Bucket curr = mNext; curr != null; curr = curr.mNext) { - Name currName = curr.mName; + for (Bucket curr = _next; curr != null; curr = curr._next) { + Name currName = curr._name; if (currName.hashCode() == hash) { if (currName.equals(quads, qlen)) { return currName;
diff --git a/src/java/org/codehaus/jackson/sym/Name.java b/src/java/org/codehaus/jackson/sym/Name.java index b677876..4127ac8 100644 --- a/src/java/org/codehaus/jackson/sym/Name.java +++ b/src/java/org/codehaus/jackson/sym/Name.java
@@ -21,9 +21,9 @@ public String getName() { return mName; } /* - ////////////////////////////////////////////////////////// - // Methods for package/core parser - ////////////////////////////////////////////////////////// + /********************************************************** + /* Methods for package/core parser + /********************************************************** */ public abstract boolean equals(int quad1); @@ -33,9 +33,9 @@ public abstract boolean equals(int[] quads, int qlen); /* - ////////////////////////////////////////////////////////// - // Overridden standard methods - ////////////////////////////////////////////////////////// + /********************************************************** + /* Overridden standard methods + /********************************************************** */ @Override
diff --git a/src/perf/TestJsonPerf.java b/src/perf/TestJsonPerf.java index f93d3b7..166c5a0 100644 --- a/src/perf/TestJsonPerf.java +++ b/src/perf/TestJsonPerf.java
@@ -55,7 +55,7 @@ while (true) { try { Thread.sleep(100L); } catch (InterruptedException ie) { } // Use 9 to test all... - int round = (i++ % 5); + int round = (i++ % 2); long curr = System.currentTimeMillis(); String msg; @@ -64,31 +64,33 @@ switch (round) { case 0: - msg = "Jackson, stream/byte"; - sum += testJacksonStream(REPS, _jsonFactory, _jsonData, true); - break; - case 1: - msg = "Jackson, stream/char"; - sum += testJacksonStream(REPS, _jsonFactory, _jsonData, false); - break; - case 2: msg = "Jackson/smile, stream"; sum += testJacksonStream(REPS, _smileFactory, _smileData, true); break; - case 3: - msg = "Noggit"; - sum += testNoggit(REPS); + case 1: + msg = "Jackson, stream/byte"; + sum += testJacksonStream(REPS, _jsonFactory, _jsonData, true); + break; + case 2: + msg = "Jackson, stream/char"; + sum += testJacksonStream(REPS, _jsonFactory, _jsonData, false); break; - case 4: + case 3: msg = "Jackson, Java types"; sum += testJacksonJavaTypes(_mapper, REPS); break; - case 5: + case 4: msg = "Jackson, JSON types"; sum += testJacksonJsonTypes(_mapper, REPS); break; + + case 5: + msg = "Noggit"; + sum += testNoggit(REPS); + break; + case 6: msg = "Json.org"; sum += testJsonOrg(REPS);
diff --git a/src/smile/java/org/codehaus/jackson/smile/SmileParser.java b/src/smile/java/org/codehaus/jackson/smile/SmileParser.java index 37b89cb..11d78ec 100644 --- a/src/smile/java/org/codehaus/jackson/smile/SmileParser.java +++ b/src/smile/java/org/codehaus/jackson/smile/SmileParser.java
@@ -11,6 +11,7 @@ import org.codehaus.jackson.impl.StreamBasedParserBase; import org.codehaus.jackson.io.IOContext; import org.codehaus.jackson.sym.BytesToNameCanonicalizer; +import org.codehaus.jackson.sym.Name; public class SmileParser extends StreamBasedParserBase @@ -49,6 +50,8 @@ public int getMask() { return _mask; } } + private static int[] NO_INTS = new int[0]; + /* /********************************************************** /* Configuration @@ -60,6 +63,11 @@ */ protected ObjectCodec _objectCodec; + /** + * Symbol table that contains field names encountered so far + */ + final protected BytesToNameCanonicalizer _symbols; + /* /********************************************************** /* Additional parsing state @@ -78,6 +86,13 @@ * format does, and we want to retain that separation. */ protected boolean _got32BitFloat; + + /** + * Temporary buffer used for name parsing. + */ + protected int[] _quadBuffer = NO_INTS; + + protected int _quad1, _quad2; /* /********************************************************** @@ -93,6 +108,7 @@ { super(ctxt, parserFeatures, in, inputBuffer, start, end, bufferRecyclable); _objectCodec = codec; + _symbols = sym; _tokenInputRow = -1; _tokenInputCol = -1; } @@ -147,7 +163,28 @@ } return true; } - + + /* + /********************************************************** + /* Overridden methods + /********************************************************** + */ + + @Override + protected void _finishString() throws IOException, JsonParseException + { + // should never be called; but must be defined for superclass + _throwInternal(); + } + + @Override + public void close() throws IOException + { + super.close(); + // Merge found symbols, if any: + _symbols.release(); + } + /* /********************************************************** /* JsonParser impl @@ -418,7 +455,7 @@ /* /********************************************************** - /* Internal methods, parsing + /* Internal methods, field name parsing 
/********************************************************** */ @@ -468,45 +505,77 @@ return JsonToken.FIELD_NAME; case 2: // short ASCII { - int len = 1 + (ch & 0x3F); - _loadToHaveAtLeast(len); - int outPtr = 0; - char[] outBuf = _textBuffer.emptyAndGetCurrentSegment(); - int inPtr = _inputPtr; - _inputPtr += len; - for (int end = inPtr + len; inPtr < end; ) { - outBuf[outPtr++] = (char) _inputBuffer[inPtr++]; - } - _textBuffer.setCurrentLength(len); - _parsingContext.setCurrentName(_textBuffer.contentsAsString()); + int len = (ch & 0x3f) + 1; + String name; + Name n = _findDecodedFromSymbols(len); + if (n != null) { + name = n.getName(); + _inputPtr += len; + } else { + name = _decodeShortAsciiName(len); + name = _addDecodedToSymbols(len, name); + } + _parsingContext.setCurrentName(name); } return JsonToken.FIELD_NAME; case 3: // short Unicode // all valid, except for 0xBF and 0xFF if ((ch & 0x3F) != 0x3F) { - int len = 1 + (ch & 0x3F); - _decodeShortUnicode(len); - _parsingContext.setCurrentName(_textBuffer.contentsAsString()); - return JsonToken.FIELD_NAME; + int len = (ch & 0x3f) + 1; + String name; + Name n = _findDecodedFromSymbols(len); + if (n != null) { + name = n.getName(); + _inputPtr += len; + } else { + name = _decodeShortUnicodeName(len); + name = _addDecodedToSymbols(len, name); + } + _parsingContext.setCurrentName(name); + return JsonToken.FIELD_NAME; } break; } // Other byte values are illegal _reportError("Invalid type marker byte 0x"+Integer.toHexString(ch)+" for expected field name (or END_OBJECT marker)"); return null; - } + } + private final String _addDecodedToSymbols(int len, String name) + { + if (len < 5) { + return _symbols.addName(name, _quad1, 0).getName(); + } + if (len < 9) { + return _symbols.addName(name, _quad1, _quad2).getName(); + } + int qlen = (len + 3) >> 2; + return _symbols.addName(name, _quadBuffer, qlen).getName(); + } + + private final String _decodeShortAsciiName(int len) + throws IOException, JsonParseException + { + 
int outPtr = 0; + char[] outBuf = _textBuffer.emptyAndGetCurrentSegment(); + int inPtr = _inputPtr; + _inputPtr += len; + for (int end = inPtr + len; inPtr < end; ) { + outBuf[outPtr++] = (char) _inputBuffer[inPtr++]; + } + _textBuffer.setCurrentLength(len); + return _textBuffer.contentsAsString(); + } + /** * Helper method used to decode short Unicode string, length for which actual * length (in bytes) is known * * @param len Length between 1 and 64 */ - protected final void _decodeShortUnicode(int len) + private final String _decodeShortUnicodeName(int len) throws IOException, JsonParseException { - _loadToHaveAtLeast(len); - int outPtr = 0; char[] outBuf = _textBuffer.emptyAndGetCurrentSegment(); int inPtr = _inputPtr; @@ -543,8 +612,111 @@ outBuf[outPtr++] = (char) i; } _textBuffer.setCurrentLength(outPtr); + return _textBuffer.contentsAsString(); } + /** + * Helper method for trying to find specified encoded UTF-8 byte sequence + * from symbol table; if succesfull avoids actual decoding to String + */ + private final Name _findDecodedFromSymbols(int len) + throws IOException, JsonParseException + { + if ((_inputEnd - _inputPtr) < len) { + _loadToHaveAtLeast(len); + } + // First: maybe we already have this name decoded? 
+ if (len < 5) { + int inPtr = _inputPtr; + final byte[] inBuf = _inputBuffer; + int q = inBuf[inPtr]; + if (--len > 0) { + q = (q << 8) + inBuf[++inPtr]; + if (--len > 0) { + q = (q << 8) + inBuf[++inPtr]; + if (--len > 0) { + q = (q << 8) + inBuf[++inPtr]; + } + } + } + _quad1 = q; + return _symbols.findName(q); + } + if (len < 9) { + int inPtr = _inputPtr; + final byte[] inBuf = _inputBuffer; + // First quadbyte is easy + int q1 = inBuf[inPtr++] << 8; + q1 += inBuf[inPtr++]; + q1 <<= 8; + q1 += inBuf[inPtr++]; + q1 <<= 8; + q1 += inBuf[inPtr++]; + int q2 = inBuf[inPtr++]; + len -= 5; + if (len > 0) { + q2 = (q2 << 8) + inBuf[inPtr++]; + if (--len >= 0) { + q2 = (q2 << 8) + inBuf[inPtr++]; + if (--len >= 0) { + q2 = (q2 << 8) + inBuf[inPtr++]; + } + } + } + _quad1 = q1; + _quad2 = q2; + return _symbols.findName(q1, q2); + } + return _findDecodedLong(len); + } + + private final Name _findDecodedLong(int len) + throws IOException, JsonParseException + { + // first, need enough buffer to store bytes as ints: + { + int bufLen = (len + 3) >> 2; + if (bufLen > _quadBuffer.length) { + _quadBuffer = _growArrayTo(_quadBuffer, bufLen); + } + } + // then decode, full quads first + int offset = 0; + int inPtr = _inputPtr; + final byte[] inBuf = _inputBuffer; + do { + int q = inBuf[inPtr++] << 8; + q |= inBuf[inPtr++]; + q <<= 8; + q |= inBuf[inPtr++]; + q <<= 8; + q |= inBuf[inPtr++]; + _quadBuffer[offset++] = q; + } while ((len -= 4) > 3); + // and then leftovers + if (len > 0) { + int q = inBuf[inPtr++]; + if (--len >= 0) { + q = (q << 8) + inBuf[inPtr++]; + if (--len >= 0) { + q = (q << 8) + inBuf[inPtr++]; + } + } + _quadBuffer[offset++] = q; + } + return _symbols.findName(_quadBuffer, offset); + } + + public static int[] _growArrayTo(int[] arr, int minSize) + { + int[] newArray = new int[minSize + 4]; + if (arr != null) { + // !!! 
TODO: JDK 1.6, Arrays.copyOf + System.arraycopy(arr, 0, newArray, 0, arr.length); + } + return newArray; + } + /* /********************************************************** /* Internal methods, secondary parsing @@ -574,24 +746,13 @@ case 2: // tiny ascii // fall through case 3: // short ascii - { - int len = (tb - 0x3F); - _loadToHaveAtLeast(len); - int outPtr = 0; - char[] outBuf = _textBuffer.emptyAndGetCurrentSegment(); - int inPtr = _inputPtr; - _inputPtr += len; - for (int end = inPtr + len; inPtr < end; ) { - outBuf[outPtr++] = (char) _inputBuffer[inPtr++]; - } - _textBuffer.setCurrentLength(len); - } + _decodeShortAsciiValue(tb - 0x3F); return; case 4: // tiny unicode // fall through case 5: // short unicode - _decodeShortUnicode(tb - 0x7F); + _decodeShortUnicodeValue(tb - 0x7F); return; case 7: @@ -682,6 +843,67 @@ _throwInternal(); } + protected final void _decodeShortAsciiValue(int len) + throws IOException, JsonParseException + { + if ((_inputEnd - _inputPtr) < len) { + _loadToHaveAtLeast(len); + } + int outPtr = 0; + char[] outBuf = _textBuffer.emptyAndGetCurrentSegment(); + int inPtr = _inputPtr; + _inputPtr += len; + for (int end = inPtr + len; inPtr < end; ) { + outBuf[outPtr++] = (char) _inputBuffer[inPtr++]; + } + _textBuffer.setCurrentLength(len); + } + + protected final void _decodeShortUnicodeValue(int len) + throws IOException, JsonParseException + { + if ((_inputEnd - _inputPtr) < len) { + _loadToHaveAtLeast(len); + } + + int outPtr = 0; + char[] outBuf = _textBuffer.emptyAndGetCurrentSegment(); + int inPtr = _inputPtr; + _inputPtr += len; + final int[] codes = SmileConstants.sUtf8UnitLengths; + for (int end = inPtr + len; inPtr < end; ) { + int i = _inputBuffer[inPtr++] & 0xFF; + int code = codes[i]; + if (code != 0) { + // trickiest one, need surrogate handling + switch (code) { + case 1: + i = ((i & 0x1F) << 6) | (_inputBuffer[inPtr++] & 0x3F); + break; + case 2: + i = ((i & 0x0F) << 12) + | ((_inputBuffer[inPtr++] & 0x3F) << 6) + | 
(_inputBuffer[inPtr++] & 0x3F); + break; + case 3: + i = ((i & 0x07) << 18) + | ((_inputBuffer[inPtr++] & 0x3F) << 12) + | ((_inputBuffer[inPtr++] & 0x3F) << 6) + | (_inputBuffer[inPtr++] & 0x3F); + // note: this is the codepoint value; need to split, too + i -= 0x10000; + outBuf[outPtr++] = (char) (0xD800 | (i >> 10)); + i = 0xDC00 | (i & 0x3FF); + break; + default: // invalid + _reportError("Invalid byte "+Integer.toHexString(i)+" in short Unicode text block"); + } + } + outBuf[outPtr++] = (char) i; + } + _textBuffer.setCurrentLength(outPtr); + } + /** * Helper method that will update state for an int value */ @@ -888,13 +1110,6 @@ } } - @Override - protected void _finishString() throws IOException, JsonParseException - { - // should never be called; but must be defined for superclass - _throwInternal(); - } - /* /********************************************************** /* Internal methods, other
diff --git a/src/smile/test/org/codehaus/jackson/smile/SmileTestBase.java b/src/smile/test/org/codehaus/jackson/smile/SmileTestBase.java index bdc866e..19c3af6 100644 --- a/src/smile/test/org/codehaus/jackson/smile/SmileTestBase.java +++ b/src/smile/test/org/codehaus/jackson/smile/SmileTestBase.java
@@ -13,10 +13,15 @@ protected SmileParser _parser(byte[] input) throws IOException { - SmileFactory f = new SmileFactory(); - return f.createJsonParser(input); + return _parser(new SmileFactory(), input); } + protected SmileParser _parser(SmileFactory f, byte[] input) + throws IOException + { + return f.createJsonParser(input); + } + protected byte[] _smileDoc(String json) throws IOException { return _smileDoc(json, true);
diff --git a/src/smile/test/org/codehaus/jackson/smile/TestSmileParser.java b/src/smile/test/org/codehaus/jackson/smile/TestSmileParser.java index 9d6ff5c..6d7d64b 100644 --- a/src/smile/test/org/codehaus/jackson/smile/TestSmileParser.java +++ b/src/smile/test/org/codehaus/jackson/smile/TestSmileParser.java
@@ -34,6 +34,35 @@ assertToken(JsonToken.END_ARRAY, p.nextToken()); p.close(); } + + public void testLongAsciiString() throws IOException + { + final String DIGITS = "1234567890"; + String LONG = DIGITS + DIGITS + DIGITS + DIGITS; + LONG = LONG + LONG + LONG + LONG; + byte[] data = _smileDoc(quote(LONG)); + + SmileParser p = _parser(data); + assertNull(p.getCurrentToken()); + assertToken(JsonToken.VALUE_STRING, p.nextToken()); + assertEquals(LONG, p.getText()); + assertNull(p.nextToken()); + } + + public void testTrivialObject() throws IOException + { + byte[] data = _smileDoc("{\"abc\":13}"); + SmileParser p = _parser(data); + assertNull(p.getCurrentToken()); + + assertToken(JsonToken.START_OBJECT, p.nextToken()); + assertToken(JsonToken.FIELD_NAME, p.nextToken()); + assertEquals("abc", p.getCurrentName()); + assertEquals("abc", p.getText()); + assertToken(JsonToken.VALUE_NUMBER_INT, p.nextToken()); + assertEquals(13, p.getIntValue()); + assertToken(JsonToken.END_OBJECT, p.nextToken()); + } public void testSimpleObject() throws IOException {
diff --git a/src/smile/test/org/codehaus/jackson/smile/TestSmileParserSymbolHandling.java b/src/smile/test/org/codehaus/jackson/smile/TestSmileParserSymbolHandling.java new file mode 100644 index 0000000..58ae00a --- /dev/null +++ b/src/smile/test/org/codehaus/jackson/smile/TestSmileParserSymbolHandling.java
@@ -0,0 +1,75 @@ +package org.codehaus.jackson.smile; + +import java.io.*; + +import org.codehaus.jackson.JsonToken; +import org.codehaus.jackson.sym.BytesToNameCanonicalizer; + +/** + * Unit tests for verifying that symbol handling works as planned, including + * efficient reuse of names encountered during parsing. + */ +public class TestSmileParserSymbolHandling + extends SmileTestBase +{ + public void testSimple() throws IOException + { + final String STR1 = "a"; + + byte[] data = _smileDoc("{ "+quote(STR1)+":1, \"foobar\":2, \"longername\":3 }"); + SmileFactory f = new SmileFactory(); + SmileParser p = _parser(f, data); + final BytesToNameCanonicalizer symbols1 = p._symbols; + assertEquals(0, symbols1.size()); + + + + assertEquals(JsonToken.START_OBJECT, p.nextToken()); + assertEquals(JsonToken.FIELD_NAME, p.nextToken()); + // field names are interned: + assertSame(STR1, p.getCurrentName()); + assertEquals(1, symbols1.size()); + assertEquals(JsonToken.VALUE_NUMBER_INT, p.nextToken()); + assertEquals(JsonToken.FIELD_NAME, p.nextToken()); + assertSame("foobar", p.getCurrentName()); + assertEquals(2, symbols1.size()); + assertEquals(JsonToken.VALUE_NUMBER_INT, p.nextToken()); + assertEquals(JsonToken.FIELD_NAME, p.nextToken()); + assertSame("longername", p.getCurrentName()); + assertEquals(3, symbols1.size()); + assertEquals(JsonToken.VALUE_NUMBER_INT, p.nextToken()); + assertEquals(JsonToken.END_OBJECT, p.nextToken()); + assertNull(p.nextToken()); + assertEquals(3, symbols1.size()); + p.close(); + + // but let's verify that symbol table gets reused properly + p = _parser(f, data); + BytesToNameCanonicalizer symbols2 = p._symbols; + // symbol tables are not reused, but contents are: + assertNotSame(symbols1, symbols2); + assertEquals(3, symbols2.size()); + + assertEquals(JsonToken.START_OBJECT, p.nextToken()); + assertEquals(JsonToken.FIELD_NAME, p.nextToken()); + // field names are interned: + assertSame(STR1, p.getCurrentName()); + assertEquals(3, 
symbols2.size()); + assertEquals(JsonToken.VALUE_NUMBER_INT, p.nextToken()); + assertEquals(JsonToken.FIELD_NAME, p.nextToken()); + assertSame("foobar", p.getCurrentName()); + assertEquals(3, symbols2.size()); + assertEquals(JsonToken.VALUE_NUMBER_INT, p.nextToken()); + assertEquals(JsonToken.FIELD_NAME, p.nextToken()); + assertSame("longername", p.getCurrentName()); + assertEquals(3, symbols2.size()); + assertEquals(JsonToken.VALUE_NUMBER_INT, p.nextToken()); + assertEquals(JsonToken.END_OBJECT, p.nextToken()); + assertNull(p.nextToken()); + assertEquals(3, symbols2.size()); + p.close(); + + assertEquals(3, symbols2.size()); + p.close(); + } +}