Skip to content

Commit 408af58

Browse files
committed
AVRO-4060: Use JDK to Hash Byte Array in UTF8
1 parent 1a2d200 commit 408af58

File tree

2 files changed

+34
-3
lines changed

2 files changed

+34
-3
lines changed

lang/java/avro/src/main/java/org/apache/avro/util/Utf8.java

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,11 @@ public Utf8(byte[] bytes) {
6868
this.length = length;
6969
}
7070

71+
Utf8(String string, int length) {
72+
this(string);
73+
this.length = length;
74+
}
75+
7176
/**
7277
* Return UTF-8 encoded bytes. Only valid through {@link #getByteLength()}.
7378
*/
@@ -175,9 +180,15 @@ public int hashCode() {
175180
if (h == 0) {
176181
byte[] bytes = this.bytes;
177182
int length = this.length;
178-
h = 1;
179-
for (int i = 0; i < length; i++) {
180-
h = h * 31 + bytes[i];
183+
// If the array is filled, use the underlying JDK hash functionality.
184+
// Starting with JDK 21, the underlying implementation is vectorized.
185+
if (bytes.length == length) {
186+
h = Arrays.hashCode(bytes);
187+
} else {
188+
h = 1;
189+
for (int i = 0; i < length; i++) {
190+
h = h * 31 + bytes[i];
191+
}
181192
}
182193
this.hash = h;
183194
}

lang/java/avro/src/test/java/org/apache/avro/util/TestUtf8.java

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,26 @@ void hashCodeReused() {
9999
assertEquals(4122302, u.hashCode());
100100
}
101101

102+
/**
103+
* There are two different code paths that hashcode() can call depending on the
104+
* state of the internal buffer. If the buffer is full (string length eq. buffer
105+
* length) then the JDK hashcode function can be used. This function is
106+
* vectorized in JDK 21+ and therefore should be preferable. However, if the buffer
107+
* is not full (string length lt. buffer length), then the JDK does not support
108+
* this and a scalar implementation is the only option as of today. This
109+
* difference can be resolved with JDK 23 as it supports both cases.
110+
*/
111+
@Test
112+
void hashCodeBasedOnCapacity() {
113+
// string = 3; buffer = 3
114+
Utf8 fullCapacity = new Utf8("abc", 3);
115+
116+
// string = 3; buffer = 4
117+
Utf8 partialCapacity = new Utf8("abcX", 3);
118+
119+
assertEquals(fullCapacity.hashCode(), partialCapacity.hashCode());
120+
}
121+
102122
@Test
103123
void oversizeUtf8() {
104124
Utf8 u = new Utf8();

0 commit comments

Comments
 (0)