Fix [Issue-21] for 1.9.9
diff --git a/release-notes/VERSION b/release-notes/VERSION
index be10e23..77bbdb2 100644
--- a/release-notes/VERSION
+++ b/release-notes/VERSION
@@ -8,6 +8,10 @@
Fixes:
+ * [Issue-21]: Improve handling of String hash code collisions for
+ symbol tables; exception for degenerate cases (attacks), improvements
+ for calculation otherwise
+
------------------------------------------------------------------------
=== History: ===
------------------------------------------------------------------------
diff --git a/src/java/org/codehaus/jackson/sym/BytesToNameCanonicalizer.java b/src/java/org/codehaus/jackson/sym/BytesToNameCanonicalizer.java
index 9eb505c..fc50806 100644
--- a/src/java/org/codehaus/jackson/sym/BytesToNameCanonicalizer.java
+++ b/src/java/org/codehaus/jackson/sym/BytesToNameCanonicalizer.java
@@ -18,9 +18,7 @@
/**
* Let's not expand symbol tables past some maximum size;
* this should protected against OOMEs caused by large documents
- * with uniquer (~= random) names.
- *
- * @since 1.5
+ * with unique (~= random) names.
*/
protected static final int MAX_TABLE_SIZE = 0x10000; // 64k entries == 256k mem
@@ -31,6 +29,29 @@
*/
final static int MAX_ENTRIES_FOR_REUSE = 6000;
+ /**
+ * Also: to thwart attacks based on hash collisions (which may or may not
+ * be cheap to calculate), we will need to detect "too long"
+ * collision chains. Let's start with static value of 255 entries
+ * for the longest legal chain.
+ *<p>
+ * Note: longest chain we have been able to produce without malicious
+ * intent has been 60 (with "com.fasterxml.jackson.core.main.TestWithTonsaSymbols");
+ * our setting should be reasonable here.
+ *
+ * @since 1.9.9
+ */
+ final static int MAX_COLL_CHAIN_LENGTH = 255;
+
+ /**
+ * And to support reduce likelihood of accidental collisions causing
+ * exceptions, let's prevent reuse of tables with long collision
+ * overflow lists as well.
+ *
+ * @since 1.9.9
+ */
+ final static int MAX_COLL_CHAIN_FOR_REUSE = 63;
+
final static int MIN_HASH_SIZE = 16;
final static int INITIAL_COLLISION_LEN = 32;
@@ -47,8 +68,19 @@
/**********************************************************
*/
- final BytesToNameCanonicalizer _parent;
+ final protected BytesToNameCanonicalizer _parent;
+ /**
+ * Seed value we use as the base to make hash codes non-static between
+ * different runs, but still stable for lifetime of a single symbol table
+ * instance.
+ * This is done for security reasons, to avoid potential DoS attack via
+ * hash collisions.
+ *
+ * @since 1.9.9
+ */
+ final private int _hashSeed;
+
/*
/**********************************************************
/* Main table state
@@ -59,7 +91,7 @@
* Whether canonial symbol Strings are to be intern()ed before added
* to the table or not
*/
- final boolean _intern;
+ private final boolean _intern;
// // // First, global information
@@ -68,6 +100,15 @@
*/
private int _count;
+ /**
+ * We need to keep track of the longest collision list; this is needed
+ * both to indicate problems with attacks and to allow flushing for
+ * other cases.
+ *
+ * @since 1.9.9
+ */
+ protected int _longestCollisionList;
+
// // // Then information regarding primary hash array and its
// // // matching Name array
@@ -162,19 +203,37 @@
/**********************************************************
*/
+ /**
+ * Factory method to call to create a symbol table instance with a
+ * randomized seed value.
+ */
public static BytesToNameCanonicalizer createRoot()
{
- return new BytesToNameCanonicalizer(DEFAULT_TABLE_SIZE, true);
+ /* [Issue-21]: Need to use a variable seed, to thwart hash-collision
+ * based attacks.
+ */
+ long now = System.currentTimeMillis();
+ // ensure it's not 0; and might as well require to be odd so:
+ int seed = (((int) now) + ((int) now >>> 32)) | 1;
+ return createRoot(seed);
}
/**
+ * Factory method that should only be called from unit tests, where seed
+ * value should remain the same.
+ */
+ protected static BytesToNameCanonicalizer createRoot(int hashSeed) {
+ return new BytesToNameCanonicalizer(DEFAULT_TABLE_SIZE, true, hashSeed);
+ }
+
+ /**
* @param intern Whether canonical symbol Strings should be interned
* or not
*/
public synchronized BytesToNameCanonicalizer makeChild(boolean canonicalize,
boolean intern)
{
- return new BytesToNameCanonicalizer(this, intern);
+ return new BytesToNameCanonicalizer(this, intern, _hashSeed);
}
/**
@@ -196,13 +255,12 @@
}
}
- private BytesToNameCanonicalizer(int hashSize, boolean intern)
+ private BytesToNameCanonicalizer(int hashSize, boolean intern, int seed)
{
_parent = null;
+ _hashSeed = seed;
_intern = intern;
- /* Sanity check: let's now allow hash sizes below certain
- * min. value
- */
+ // Sanity check: let's now allow hash sizes below certain minimum value
if (hashSize < MIN_HASH_SIZE) {
hashSize = MIN_HASH_SIZE;
} else {
@@ -223,9 +281,11 @@
/**
* Constructor used when creating a child instance
*/
- private BytesToNameCanonicalizer(BytesToNameCanonicalizer parent, boolean intern)
+ private BytesToNameCanonicalizer(BytesToNameCanonicalizer parent, boolean intern,
+ int seed)
{
_parent = parent;
+ _hashSeed = seed;
_intern = intern;
// First, let's copy the state as is:
@@ -236,6 +296,7 @@
_collList = parent._collList;
_collCount = parent._collCount;
_collEnd = parent._collEnd;
+ _longestCollisionList = parent._longestCollisionList;
_needRehash = false;
// And consider all shared, so far:
_mainHashShared = true;
@@ -246,6 +307,7 @@
private void initTables(int hashSize)
{
_count = 0;
+ _longestCollisionList = 0;
_mainHash = new int[hashSize];
_mainNames = new Name[hashSize];
_mainHashShared = false;
@@ -273,7 +335,8 @@
* One way to do this is to just purge tables if they grow
* too large, and that's what we'll do here.
*/
- if (child.size() > MAX_ENTRIES_FOR_REUSE) {
+ if (child.size() > MAX_ENTRIES_FOR_REUSE
+ || child._longestCollisionList > MAX_COLL_CHAIN_FOR_REUSE) {
/* Should there be a way to get notified about this
* event, to log it or such? (as it's somewhat abnormal
* thing to happen)
@@ -282,6 +345,7 @@
initTables(DEFAULT_TABLE_SIZE);
} else {
_count = child._count;
+ _longestCollisionList = child._longestCollisionList;
_mainHash = child._mainHash;
_mainNames = child._mainNames;
_mainHashShared = true; // shouldn't matter for parent
@@ -309,15 +373,52 @@
public int size() { return _count; }
/**
+ * @since 1.9.9
+ */
+ public int bucketCount() { return _mainHash.length; }
+
+ /**
* Method called to check to quickly see if a child symbol table
* may have gotten additional entries. Used for checking to see
* if a child table should be merged into shared table.
*/
- public boolean maybeDirty()
- {
+ public boolean maybeDirty() {
return !_mainHashShared;
}
+ /**
+ * @since 1.9.9
+ */
+ public int hashSeed() { return _hashSeed; }
+
+ /**
+ * Method mostly needed by unit tests; calculates number of
+ * entries that are in collision list. Value can be at most
+ * ({@link #size} - 1), but should usually be much lower, ideally 0.
+ *
+ * @since 1.9.9
+ */
+ public int collisionCount() {
+ return _collCount;
+ }
+
+ /**
+ * Method mostly needed by unit tests; calculates length of the
+ * longest collision chain. This should typically be a low number,
+ * but may be up to {@link #size} - 1 in the pathological case
+ *
+ * @since 1.9.9
+ */
+ public int maxCollisionLength() {
+ return _longestCollisionList;
+ }
+
+ /*
+ /**********************************************************
+ /* Public API, accessing symbols:
+ /**********************************************************
+ */
+
public static Name getEmptyName()
{
return Name1.getEmptyName();
@@ -441,11 +542,9 @@
*/
public Name findName(int[] quads, int qlen)
{
- /* // Not needed, never gets called
if (qlen < 3) { // another sanity check
return findName(quads[0], (qlen < 2) ? 0 : quads[1]);
}
- */
int hash = calcHash(quads, qlen);
// (for rest of comments regarding logic, see method above)
int ix = (hash & _mainHashMask);
@@ -476,9 +575,6 @@
/**********************************************************
*/
- /**
- * @since 1.6.0
- */
public Name addName(String symbolStr, int q1, int q2)
{
if (_intern) {
@@ -495,7 +591,12 @@
if (_intern) {
symbolStr = InternCache.instance.intern(symbolStr);
}
- int hash = calcHash(quads, qlen);
+ int hash;
+ if (qlen < 3) {
+ hash = (qlen == 1) ? calcHash(quads[0]) : calcHash(quads[0], quads[1]);
+ } else {
+ hash = calcHash(quads, qlen);
+ }
Name symbol = constructName(hash, symbolStr, quads, qlen);
_addSymbol(hash, symbol);
return symbol;
@@ -507,45 +608,72 @@
/**********************************************************
*/
- public final static int calcHash(int firstQuad)
+ /* Note on hash calculation: we try to make it more difficult to
+ * generate collisions automatically; part of this is to avoid
+ * simple "multiply-add" algorithm (like JDK String.hashCode()),
+ * and add bit of shifting. And other part is to make this
+ * non-linear, at least for shorter symbols.
+ */
+
+ // JDK uses 31; other fine choices are 33 and 65599, let's use 33
+ // as it seems to give fewest collisions for us
+ // (see [http://www.cse.yorku.ca/~oz/hash.html] for details)
+ private final static int MULT = 33;
+
+ public final int calcHash(int firstQuad)
{
+ int hash = firstQuad ^ _hashSeed;
+ hash += (hash >>> 15); // to xor hi- and low- 16-bits
+ hash ^= (hash >>> 9); // as well as lowest 2 bytes
+ return hash;
+ }
+
+ public final int calcHash(int firstQuad, int secondQuad)
+ {
+ /* For two quads, let's change algorithm a bit, to spice
+ * things up (can do bit more processing anyway)
+ */
+
int hash = firstQuad;
- hash ^= (hash >>> 16); // to xor hi- and low- 16-bits
- hash ^= (hash >>> 8); // as well as lowest 2 bytes
+ hash += (hash >>> 15); // try mixing first and second byte pairs first
+ hash ^= ((secondQuad + _hashSeed) * MULT); // then add second quad
+ hash += (hash >>> 9); // and shuffle some more
return hash;
}
- public final static int calcHash(int firstQuad, int secondQuad)
+ public final int calcHash(int[] quads, int qlen)
{
- int hash = (firstQuad * 31) + secondQuad;
-
- // If this was called for single-quad instance:
- //int hash = (secondQuad == 0) ? firstQuad : ((firstQuad * 31) + secondQuad);
-
- hash ^= (hash >>> 16); // to xor hi- and low- 16-bits
- hash ^= (hash >>> 8); // as well as lowest 2 bytes
- return hash;
- }
-
- public final static int calcHash(int[] quads, int qlen)
- {
- // Note: may be called for qlen < 3
- int hash = quads[0];
- for (int i = 1; i < qlen; ++i) {
- hash = (hash * 31) + quads[i];
+ // Note: may be called for qlen < 3; but has at least one int
+ if (qlen < 3) {
+ throw new IllegalArgumentException();
}
- hash ^= (hash >>> 16); // to xor hi- and low- 16-bits
- hash ^= (hash >>> 8); // as well as lowest 2 bytes
+ /* And then change handling again for "multi-quad" case; mostly
+ * to make calculation of collisions less fun. For example,
+ * add seed bit later in the game, and switch plus/xor around,
+ * use different shift lengths.
+ */
+ int hash = quads[0];
+ hash ^= (hash >>> 9);
+ hash += ((quads[1] + _hashSeed) * MULT);
+ hash ^= (hash >>> 15);
+ hash ^= (quads[2] * MULT);
+ hash += (hash >>> 17);
+
+ for (int i = 3; i < qlen; ++i) {
+ hash = (hash * MULT) + (quads[i] + 1);
+ // for longer entries, mess a bit in-between too
+ hash ^= (hash >>> 3);
+ }
+ // and finally shuffle some more once done
+ hash += (hash >>> 15); // to get high-order bits to mix more
+ hash ^= (hash >>> 9); // as well as lowest 2 bytes
return hash;
}
- /* 26-Nov-2008, tatu: not used currently; if not used in near future,
- * let's just delete it.
- */
- /*
- public static int[] calcQuads(byte[] wordBytes)
+ // Method only used by unit tests
+ protected static int[] calcQuads(byte[] wordBytes)
{
int blen = wordBytes.length;
int[] result = new int[(blen + 3) / 4];
@@ -565,7 +693,6 @@
}
return result;
}
- */
/*
/**********************************************************
@@ -647,7 +774,6 @@
if (_collListShared) {
unshareCollision(); // also allocates if list was null
}
-
++_collCount;
int entryValue = _mainHash[ix];
int bucket = entryValue & 0xFF;
@@ -669,7 +795,13 @@
}
// And then just need to link the new bucket entry in
- _collList[bucket] = new Bucket(symbol, _collList[bucket]);
+ Bucket newB = new Bucket(symbol, _collList[bucket]);
+ _collList[bucket] = newB;
+ // but, be careful wrt attacks
+ _longestCollisionList = Math.max(newB.length(), _longestCollisionList);
+ if (_longestCollisionList > MAX_COLL_CHAIN_LENGTH) {
+ reportTooManyCollisions(MAX_COLL_CHAIN_LENGTH);
+ }
}
/* Ok. Now, do we need a rehash next time? Need to have at least
@@ -735,6 +867,7 @@
*/
int oldEnd = _collEnd;
if (oldEnd == 0) { // no prior collisions...
+ _longestCollisionList = 0;
return;
}
@@ -742,6 +875,8 @@
_collEnd = 0;
_collListShared = false;
+ int maxColl = 0;
+
Bucket[] oldBuckets = _collList;
_collList = new Bucket[oldBuckets.length];
for (int i = 0; i < oldEnd; ++i) {
@@ -774,11 +909,15 @@
--bucket; // 1-based index in value
}
// And then just need to link the new bucket entry in
- _collList[bucket] = new Bucket(symbol, _collList[bucket]);
+ Bucket newB = new Bucket(symbol, _collList[bucket]);
+ _collList[bucket] = newB;
+ maxColl = Math.max(maxColl, newB.length());
}
} // for (... buckets in the chain ...)
} // for (... list of bucket heads ... )
+ _longestCollisionList = maxColl;
+
if (symbolsSeen != _count) { // sanity check
throw new RuntimeException("Internal error: count after rehash "+symbolsSeen+"; should be "+_count);
}
@@ -791,6 +930,7 @@
private void nukeSymbols()
{
_count = 0;
+ _longestCollisionList = 0;
Arrays.fill(_mainHash, 0);
Arrays.fill(_mainNames, null);
Arrays.fill(_collList, null);
@@ -906,6 +1046,21 @@
/*
/**********************************************************
+ /* Other helper methods
+ /**********************************************************
+ */
+
+ /**
+ * @since 1.9.9
+ */
+ protected void reportTooManyCollisions(int maxLen)
+ {
+ throw new IllegalStateException("Longest collision chain in symbol table (of size "+_count
+ +") now exceeds maximum, "+maxLen+" -- suspect a DoS attack based on hash collisions");
+ }
+
+ /*
+ /**********************************************************
/* Helper classes
/**********************************************************
*/
@@ -914,21 +1069,16 @@
{
protected final Name _name;
protected final Bucket _next;
+ private final int _length;
Bucket(Name name, Bucket next)
{
_name = name;
_next = next;
+ _length = (next == null) ? 1 : next._length+1;
}
- public int length()
- {
- int len = 1;
- for (Bucket curr = _next; curr != null; curr = curr._next) {
- ++len;
- }
- return len;
- }
+ public int length() { return _length; }
public Name find(int hash, int firstQuad, int secondQuad)
{
diff --git a/src/java/org/codehaus/jackson/sym/CharsToNameCanonicalizer.java b/src/java/org/codehaus/jackson/sym/CharsToNameCanonicalizer.java
index 76badc3..11c0650 100644
--- a/src/java/org/codehaus/jackson/sym/CharsToNameCanonicalizer.java
+++ b/src/java/org/codehaus/jackson/sym/CharsToNameCanonicalizer.java
@@ -346,9 +346,11 @@
*/
public int size() { return _size; }
-
+
public boolean maybeDirty() { return _dirty; }
+ public int bucketCount() { return _symbols.length; }
+
/**
* Method mostly needed by unit tests; calculates number of
* entries that are in collision list. Value can be at most
@@ -392,8 +394,7 @@
return new String(buffer, start, len);
}
- hash &= _indexMask;
-
+ hash = _hashToIndex(hash);
String sym = _symbols[hash];
// Optimal case; checking existing primary symbol for hash index:
@@ -429,7 +430,7 @@
/* Need to recalc hash; rare occurence (index mask has been
* recalculated as part of rehash)
*/
- hash = calcHash(buffer, start, len) & _indexMask;
+ hash = _hashToIndex(calcHash(buffer, start, len));
}
String newSymbol = new String(buffer, start, len);
@@ -453,6 +454,12 @@
return newSymbol;
}
+ /*
+ /**********************************************************
+ /* Hash calculation
+ /**********************************************************
+ */
+
/**
* Implementation of a hashing method for variable length
* Strings. Most of the time intention is that this calculation
@@ -479,6 +486,16 @@
return hash;
}
+ /**
+ * Helper method that takes in a "raw" hash value, shuffles it as necessary,
+ * and truncates to be used as the index.
+ */
+ private final int _hashToIndex(int rawHash)
+ {
+ rawHash += (rawHash >>> 15); // this seems to help quite a bit, at least for our tests
+ return (rawHash & _indexMask);
+ }
+
/*
/**********************************************************
/* Internal methods
@@ -546,7 +563,7 @@
String symbol = oldSyms[i];
if (symbol != null) {
++count;
- int index = calcHash(symbol) & _indexMask;
+ int index = _hashToIndex(calcHash(symbol));
if (_symbols[index] == null) {
_symbols[index] = symbol;
} else {
@@ -564,7 +581,7 @@
while (b != null) {
++count;
String symbol = b.getSymbol();
- int index = calcHash(symbol) & _indexMask;
+ int index = _hashToIndex(calcHash(symbol));
if (_symbols[index] == null) {
_symbols[index] = symbol;
} else {
diff --git a/src/mapper/java/org/codehaus/jackson/map/ser/BasicSerializerFactory.java b/src/mapper/java/org/codehaus/jackson/map/ser/BasicSerializerFactory.java
index 5f6f418..76de697 100644
--- a/src/mapper/java/org/codehaus/jackson/map/ser/BasicSerializerFactory.java
+++ b/src/mapper/java/org/codehaus/jackson/map/ser/BasicSerializerFactory.java
@@ -7,7 +7,6 @@
import java.util.*;
import org.codehaus.jackson.map.*;
-import org.codehaus.jackson.map.annotate.JacksonStdImpl;
import org.codehaus.jackson.map.annotate.JsonSerialize;
import org.codehaus.jackson.map.ext.OptionalHandlerFactory;
import org.codehaus.jackson.map.introspect.*;
diff --git a/src/test/org/codehaus/jackson/map/ser/TestNullSerialization.java b/src/test/org/codehaus/jackson/map/ser/TestNullSerialization.java
index 37b0947..4a3ca98 100644
--- a/src/test/org/codehaus/jackson/map/ser/TestNullSerialization.java
+++ b/src/test/org/codehaus/jackson/map/ser/TestNullSerialization.java
@@ -51,7 +51,7 @@
ObjectMapper mapper = new ObjectMapper();
mapper.setAnnotationIntrospector(new JacksonAnnotationIntrospector());
mapper.setSerializationInclusion(Inclusion.NON_NULL);
- assertEquals("{\"value\":\"abc\"}", mapper.writeValueAsString(new NullBean("abc")));
- assertEquals("{}", mapper.writeValueAsString(new NullBean(null)));
+ assertEquals("{\"value\":\"abc\"}", mapper.writeValueAsString(new NullBean<String>("abc")));
+ assertEquals("{}", mapper.writeValueAsString(new NullBean<String>(null)));
}
}
diff --git a/src/test/org/codehaus/jackson/sym/TestSymbolTables.java b/src/test/org/codehaus/jackson/sym/TestSymbolTables.java
index c6d6664..023ff71 100644
--- a/src/test/org/codehaus/jackson/sym/TestSymbolTables.java
+++ b/src/test/org/codehaus/jackson/sym/TestSymbolTables.java
@@ -1,5 +1,7 @@
package org.codehaus.jackson.sym;
+import java.io.IOException;
+
public class TestSymbolTables extends main.BaseTest
{
// 11 3-char snippets that hash to 0xFFFF (with default JDK hashCode() calc),
@@ -51,4 +53,80 @@
assertEquals(CharsToNameCanonicalizer.MAX_COLL_CHAIN_LENGTH+2, sym.size());
}
}
+
+ // Test for verifying stability of hashCode, wrt collisions, using
+ // synthetic field name generation and character-based input
+ public void testSyntheticWithChars()
+ {
+ // pass seed, to keep results consistent:
+ CharsToNameCanonicalizer symbols = CharsToNameCanonicalizer.createRoot();
+ final int COUNT = 6000;
+ for (int i = 0; i < COUNT; ++i) {
+ String id = fieldNameFor(i);
+ char[] ch = id.toCharArray();
+ symbols.findSymbol(ch, 0, ch.length, CharsToNameCanonicalizer.calcHash(id));
+ }
+
+ assertEquals(8192, symbols.bucketCount());
+ assertEquals(COUNT, symbols.size());
+
+//System.out.printf("Char stuff: collisions %d, max-coll %d\n", symbols.collisionCount(), symbols.maxCollisionLength());
+
+ // original hashCode calc gave very high rate (3567), but with shuffling comes down a bit
+ // (changing mult to 33 would help more)
+ assertEquals(1905, symbols.collisionCount());
+ // esp. with collisions; first got about 30
+ assertEquals(6, symbols.maxCollisionLength());
+ }
+
+ // Test for verifying stability of hashCode, wrt collisions, using
+ // synthetic field name generation and byte-based input (UTF-8)
+ public void testSyntheticWithBytes() throws IOException
+ {
+ // pass seed, to keep results consistent:
+ BytesToNameCanonicalizer symbols = BytesToNameCanonicalizer.createRoot(1);
+ final int COUNT = 6000;
+ for (int i = 0; i < COUNT; ++i) {
+ String id = fieldNameFor(i);
+ int[] quads = BytesToNameCanonicalizer.calcQuads(id.getBytes("UTF-8"));
+ symbols.addName(id, quads, quads.length);
+ }
+ assertEquals(COUNT, symbols.size());
+ assertEquals(8192, symbols.bucketCount());
+
+//System.out.printf("Byte stuff: collisions %d, max-coll %d\n", symbols.collisionCount(), symbols.maxCollisionLength());
+
+ // Fewer collisions than with chars, but still quite a few
+ assertEquals(1770, symbols.collisionCount());
+ // but not super long collision chains:
+ assertEquals(9, symbols.maxCollisionLength());
+ }
+
+ protected void fieldNameFor(StringBuilder sb, int index)
+ {
+ /* let's do something like "f1.1" to exercise different
+ * field names (important for byte-based codec)
+ * Other name shuffling done mostly just for fun... :)
+ */
+ sb.append("f");
+ sb.append(index);
+ if (index > 50) {
+ sb.append('.');
+ if (index > 200) {
+ sb.append(index);
+ if (index > 4000) { // and some even longer symbols...
+ sb.append(".").append(index);
+ }
+ } else {
+ sb.append(index >> 3); // divide by 8
+ }
+ }
+ }
+
+ protected String fieldNameFor(int index)
+ {
+ StringBuilder sb = new StringBuilder(16);
+ fieldNameFor(sb, index);
+ return sb.toString();
+ }
}