Skip to content

Commit a816934

Browse files
committed
Binary search now works for non-full arrays.
1 parent 84ff5d3 commit a816934

File tree

6 files changed

+125
-101
lines changed

6 files changed

+125
-101
lines changed

src/main/java/com/dannemann/stringcompressor/FiveBitBinarySearch.java

Lines changed: 34 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,18 @@
33
import static com.dannemann.stringcompressor.FiveBitAsciiCompressor.DEFAULT_5BIT_CHARSET;
44

55
/**
6-
* <p>Performs binary search (including prefix search) on data compressed by {@link FiveBitAsciiCompressor}.
7-
* Particularly useful when searching large amounts of compressed data stored in memory.</p>
8-
* <p>The data must have been sorted prior to compression.</p>
9-
* <p>If {@code prefixSearch} is set to {@code true}, the method searches for an element whose prefix matches the
10-
* specified key. Otherwise, it searches for an exact match. If there are multiple elements with the same prefix, the
11-
* first matching element is returned.</p>
12-
* <p>Note that character ordering depends on the sequence defined in your custom charset (via {@code supportedCharset}),
13-
* which is passed to the compressor constructor (see {@link FiveBitAsciiCompressor#FiveBitAsciiCompressor(byte[])}).
14-
* If no custom charset is provided, compressors use a default charset ordered by ASCII.</p>
6+
* <p>Performs binary search on data compressed by {@link FiveBitAsciiCompressor}. The data must have been sorted prior
7+
* to compression.</p>
8+
* <p>If {@code prefixSearch} is {@code true}, the method returns the first element whose prefix matches the specified
9+
* key. Otherwise, it looks for an exact match. When multiple elements share the same prefix, the first matching element
10+
* is returned (just as in the exact-match search).</p>
11+
* <p>Null elements are considered to come after any character, in the same way that Z comes after A. This is because
12+
* the {@code compressedData} array typically has extra space to accommodate new entries, so unused slots (nulls) are
13+
* placed at the end.</p>
14+
* <p>Note that character ordering depends on the sequence defined in your custom charset (via
15+
* {@code supportedCharset}), which is passed to the compressor constructor (see
16+
* {@link FiveBitAsciiCompressor#FiveBitAsciiCompressor(byte[])}). If no custom charset is provided, compressors use a
17+
* default charset ordered by ASCII.</p>
1518
* @author Jean Dannemann Carone
1619
* @see FiveBitAsciiCompressor#DEFAULT_5BIT_CHARSET
1720
*/
@@ -64,29 +67,34 @@ public int search(final byte[] key) {
6467
while (low <= high) {
6568
final int mid = low + high >>> 1;
6669
final byte[] compStr = compressedData[mid];
67-
final int cLenMinus = compStr.length - 1;
68-
int buffer = 0;
69-
int bits = 0;
7070
int cmp = 0;
7171

72-
for (int i = 0, j = 0; i < cLenMinus && j < keyLen; i++) {
73-
buffer = buffer << 8 | compStr[i] & 0xFF;
74-
bits += 8;
72+
if (compStr == null)
73+
cmp = 1;
74+
else {
75+
final int cLenMinus = compStr.length - 1;
76+
int buffer = 0;
77+
int bits = 0;
7578

76-
if (bits >= 5 &&
77-
(cmp = charset[buffer >>> (bits -= 5) & 0x1F] - key[j++]) != 0 ||
78-
bits >= 5 && j < keyLen &&
79-
(cmp = charset[buffer >>> (bits -= 5) & 0x1F] - key[j++]) != 0)
80-
break;
81-
}
79+
for (int i = 0, j = 0; i < cLenMinus && j < keyLen; i++) {
80+
buffer = buffer << 8 | compStr[i] & 0xFF;
81+
bits += 8;
82+
83+
if (bits >= 5 &&
84+
(cmp = charset[buffer >>> (bits -= 5) & 0x1F] - key[j++]) != 0 ||
85+
bits >= 5 && j < keyLen &&
86+
(cmp = charset[buffer >>> (bits -= 5) & 0x1F] - key[j++]) != 0)
87+
break;
88+
}
8289

83-
if (cmp == 0) {
84-
final int dLen = cLenMinus >= 0 ? cLenMinus * 8 / 5 - (compStr[cLenMinus] & 1) : 0;
90+
if (cmp == 0) {
91+
final int dLen = cLenMinus >= 0 ? cLenMinus * 8 / 5 - (compStr[cLenMinus] & 1) : 0;
8592

86-
if (prefixSearch && keyLen <= dLen)
87-
return mid;
93+
if (prefixSearch && keyLen <= dLen)
94+
return mid;
8895

89-
cmp = dLen - keyLen;
96+
cmp = dLen - keyLen;
97+
}
9098
}
9199

92100
if (cmp < 0)

src/main/java/com/dannemann/stringcompressor/FourBitBinarySearch.java

Lines changed: 47 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,18 @@
33
import static com.dannemann.stringcompressor.FourBitAsciiCompressor.DEFAULT_4BIT_CHARSET;
44

55
/**
6-
* <p>Performs binary search (including prefix search) on data compressed by {@link FourBitAsciiCompressor}.
7-
* Particularly useful when searching large amounts of compressed data stored in memory.</p>
8-
* <p>The data must have been sorted prior to compression.</p>
9-
* <p>If {@code prefixSearch} is set to {@code true}, the method searches for an element whose prefix matches the
10-
* specified key. Otherwise, it searches for an exact match. If there are multiple elements with the same prefix, the
11-
* first matching element is returned.</p>
12-
* <p>Note that character ordering depends on the sequence defined in your custom charset (via {@code supportedCharset}),
13-
* which is passed to the compressor constructor (see {@link FourBitAsciiCompressor#FourBitAsciiCompressor(byte[])}).
14-
* If no custom charset is provided, compressors use a default charset ordered by ASCII.</p>
6+
* <p>Performs binary search on data compressed by {@link FourBitAsciiCompressor}. The data must have been sorted prior
7+
* to compression.</p>
8+
* <p>If {@code prefixSearch} is {@code true}, the method returns the first element whose prefix matches the specified
9+
* key. Otherwise, it looks for an exact match. When multiple elements share the same prefix, the first matching element
10+
* is returned (just as in the exact-match search).</p>
11+
* <p>Null elements are considered to come after any character, in the same way that Z comes after A. This is because
12+
* the {@code compressedData} array typically has extra space to accommodate new entries, so unused slots (nulls) are
13+
* placed at the end.</p>
14+
* <p>Note that character ordering depends on the sequence defined in your custom charset (via
15+
* {@code supportedCharset}), which is passed to the compressor constructor (see
16+
* {@link FourBitAsciiCompressor#FourBitAsciiCompressor(byte[])}). If no custom charset is provided, compressors use a
17+
* default charset ordered by ASCII.</p>
1518
* @author Jean Dannemann Carone
1619
* @see FourBitAsciiCompressor#DEFAULT_4BIT_CHARSET
1720
*/
@@ -64,38 +67,43 @@ public int search(final byte[] key) {
6467
while (low <= high) {
6568
final int mid = low + high >>> 1;
6669
final byte[] compStr = compressedData[mid];
67-
final int odd;
68-
final int dLen;
69-
int cLenMinus = compStr.length - 1;
70-
71-
if (cLenMinus >= 0) {
72-
odd = compStr[cLenMinus];
73-
dLen = odd == 1 ? (--cLenMinus << 1) + 1 : cLenMinus << 1;
74-
} else {
75-
odd = 0;
76-
dLen = 0;
77-
}
78-
79-
int j = 0;
8070
int cmp = 0;
8171

82-
for (int i = 0; i < cLenMinus && j < keyLen; i++) {
83-
final byte bite = compStr[i];
84-
85-
if ((cmp = DEFAULT_4BIT_CHARSET[(bite & 0xF0) >> 4] - key[j++]) != 0 ||
86-
j < keyLen &&
87-
(cmp = DEFAULT_4BIT_CHARSET[bite & 0x0F] - key[j++]) != 0)
88-
break;
89-
}
90-
91-
if (cmp == 0 && odd == 1 && j < keyLen)
92-
cmp = DEFAULT_4BIT_CHARSET[compStr[cLenMinus]] - key[j];
93-
94-
if (cmp == 0) {
95-
if (prefixSearch && keyLen <= dLen)
96-
return mid;
97-
98-
cmp = dLen - keyLen;
72+
if (compStr == null)
73+
cmp = 1;
74+
else {
75+
final int odd;
76+
final int dLen;
77+
int cLenMinus = compStr.length - 1;
78+
79+
if (cLenMinus >= 0) {
80+
odd = compStr[cLenMinus];
81+
dLen = odd == 1 ? (--cLenMinus << 1) + 1 : cLenMinus << 1;
82+
} else {
83+
odd = 0;
84+
dLen = 0;
85+
}
86+
87+
int j = 0;
88+
89+
for (int i = 0; i < cLenMinus && j < keyLen; i++) {
90+
final byte bite = compStr[i];
91+
92+
if ((cmp = charset[(bite & 0xF0) >> 4] - key[j++]) != 0 ||
93+
j < keyLen &&
94+
(cmp = charset[bite & 0x0F] - key[j++]) != 0)
95+
break;
96+
}
97+
98+
if (cmp == 0 && odd == 1 && j < keyLen)
99+
cmp = charset[compStr[cLenMinus]] - key[j];
100+
101+
if (cmp == 0) {
102+
if (prefixSearch && keyLen <= dLen)
103+
return mid;
104+
105+
cmp = dLen - keyLen;
106+
}
99107
}
100108

101109
if (cmp < 0)

src/main/java/com/dannemann/stringcompressor/SixBitBinarySearch.java

Lines changed: 34 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,18 @@
33
import static com.dannemann.stringcompressor.SixBitAsciiCompressor.DEFAULT_6BIT_CHARSET;
44

55
/**
6-
* <p>Performs binary search (including prefix search) on data compressed by {@link SixBitAsciiCompressor}.
7-
* Particularly useful when searching large amounts of compressed data stored in memory.</p>
8-
* <p>The data must have been sorted prior to compression.</p>
9-
* <p>If {@code prefixSearch} is set to {@code true}, the method searches for an element whose prefix matches the
10-
* specified key. Otherwise, it searches for an exact match. If there are multiple elements with the same prefix, the
11-
* first matching element is returned.</p>
12-
* <p>Note that character ordering depends on the sequence defined in your custom charset (via {@code supportedCharset}),
13-
* which is passed to the compressor constructor (see {@link SixBitAsciiCompressor#SixBitAsciiCompressor(byte[])}).
14-
* If no custom charset is provided, compressors use a default charset ordered by ASCII.</p>
6+
* <p>Performs binary search on data compressed by {@link SixBitAsciiCompressor}. The data must have been sorted prior
7+
* to compression.</p>
8+
* <p>If {@code prefixSearch} is {@code true}, the method returns the first element whose prefix matches the specified
9+
* key. Otherwise, it looks for an exact match. When multiple elements share the same prefix, the first matching element
10+
* is returned (just as in the exact-match search).</p>
11+
* <p>Null elements are considered to come after any character, in the same way that Z comes after A. This is because
12+
* the {@code compressedData} array typically has extra space to accommodate new entries, so unused slots (nulls) are
13+
* placed at the end.</p>
14+
* <p>Note that character ordering depends on the sequence defined in your custom charset (via
15+
* {@code supportedCharset}), which is passed to the compressor constructor (see
16+
* {@link SixBitAsciiCompressor#SixBitAsciiCompressor(byte[])}). If no custom charset is provided, compressors use a
17+
* default charset ordered by ASCII.</p>
1518
* @author Jean Dannemann Carone
1619
* @see SixBitAsciiCompressor#DEFAULT_6BIT_CHARSET
1720
*/
@@ -64,29 +67,34 @@ public int search(final byte[] key) {
6467
while (low <= high) {
6568
final int mid = low + high >>> 1;
6669
final byte[] compStr = compressedData[mid];
67-
final int cLenMinus = compStr.length - 1;
68-
int buffer = 0;
69-
int bits = 0;
7070
int cmp = 0;
7171

72-
for (int i = 0, j = 0; i < cLenMinus && j < keyLen; i++) {
73-
buffer = buffer << 8 | compStr[i] & 0xFF;
74-
bits += 8;
72+
if (compStr == null)
73+
cmp = 1;
74+
else {
75+
final int cLenMinus = compStr.length - 1;
76+
int buffer = 0;
77+
int bits = 0;
7578

76-
if (bits >= 6 &&
77-
(cmp = DEFAULT_6BIT_CHARSET[buffer >>> (bits -= 6) & 0x3F] - key[j++]) != 0 ||
78-
bits >= 6 && j < keyLen &&
79-
(cmp = DEFAULT_6BIT_CHARSET[buffer >>> (bits -= 6) & 0x3F] - key[j++]) != 0)
80-
break;
81-
}
79+
for (int i = 0, j = 0; i < cLenMinus && j < keyLen; i++) {
80+
buffer = buffer << 8 | compStr[i] & 0xFF;
81+
bits += 8;
82+
83+
if (bits >= 6 &&
84+
(cmp = charset[buffer >>> (bits -= 6) & 0x3F] - key[j++]) != 0 ||
85+
bits >= 6 && j < keyLen &&
86+
(cmp = charset[buffer >>> (bits -= 6) & 0x3F] - key[j++]) != 0)
87+
break;
88+
}
8289

83-
if (cmp == 0) {
84-
final int dLen = cLenMinus >= 0 ? cLenMinus * 8 / 6 - (compStr[cLenMinus] & 1) : 0;
90+
if (cmp == 0) {
91+
final int dLen = cLenMinus >= 0 ? cLenMinus * 8 / 6 - (compStr[cLenMinus] & 1) : 0;
8592

86-
if (prefixSearch && keyLen <= dLen)
87-
return mid;
93+
if (prefixSearch && keyLen <= dLen)
94+
return mid;
8895

89-
cmp = dLen - keyLen;
96+
cmp = dLen - keyLen;
97+
}
9098
}
9199

92100
if (cmp < 0)

src/test/java/com/dannemann/stringcompressor/FiveBitBinarySearchTest.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ void searchSmallStringsTest() {
2626
for (int length = 0; length <= 50; length++)
2727
for (int i = 0; i <= 30_000; i++) {
2828
final List<String> source = generateRandomUniqueOrderedStringList(500, length, length + 1, DEFAULT_5BIT_CHARSET);
29-
final byte[][] destination = new byte[source.size()][];
29+
final byte[][] destination = new byte[700][];
3030
ManagedBulkCompressor.compressAndAddAll(COMPRESSOR, destination, source);
3131
final FiveBitBinarySearch bs = new FiveBitBinarySearch(destination, false);
3232
for (int j = 0, len = source.size(); j < len; j++)
@@ -37,7 +37,7 @@ void searchSmallStringsTest() {
3737
@RepeatedTest(100)
3838
void searchBigArrayTest() {
3939
final List<String> source = generateRandomUniqueOrderedStringList(2_000_000, 0, 100, DEFAULT_5BIT_CHARSET);
40-
final byte[][] destination = new byte[source.size()][];
40+
final byte[][] destination = new byte[4_000_000][];
4141
ManagedBulkCompressor.compressAndAddAll(COMPRESSOR, destination, source);
4242
final FiveBitBinarySearch bs = new FiveBitBinarySearch(destination, false);
4343
for (int i = 0, len = source.size(); i < len; i++)
@@ -47,7 +47,7 @@ void searchBigArrayTest() {
4747
@RepeatedTest(100)
4848
void searchBigStringsTest() {
4949
final List<String> source = generateRandomUniqueOrderedStringList(50_000, 4500, 5000, DEFAULT_5BIT_CHARSET);
50-
final byte[][] destination = new byte[source.size()][];
50+
final byte[][] destination = new byte[70_000][];
5151
ManagedBulkCompressor.compressAndAddAll(COMPRESSOR, destination, source);
5252
final FiveBitBinarySearch bs = new FiveBitBinarySearch(destination, false);
5353
for (int i = 0, len = source.size(); i < len; i++)

src/test/java/com/dannemann/stringcompressor/FourBitBinarySearchTest.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,10 @@ class FourBitBinarySearchTest extends BaseTest {
2323

2424
@Test
2525
void searchSmallStringsTest() {
26-
for (int length = 0; length <= 50; length++)
26+
for (int length = 10; length <= 50; length++)
2727
for (int i = 0; i <= 30_000; i++) {
2828
final List<String> source = generateRandomUniqueOrderedStringList(500, length, length + 1, DEFAULT_4BIT_CHARSET);
29-
final byte[][] destination = new byte[source.size()][];
29+
final byte[][] destination = new byte[700][];
3030
ManagedBulkCompressor.compressAndAddAll(COMPRESSOR, destination, source);
3131
final FourBitBinarySearch bs = new FourBitBinarySearch(destination, false);
3232
for (int j = 0, len = source.size(); j < len; j++)
@@ -37,7 +37,7 @@ void searchSmallStringsTest() {
3737
@RepeatedTest(100)
3838
void searchBigArrayTest() {
3939
final List<String> source = generateRandomUniqueOrderedStringList(2_000_000, 0, 100, DEFAULT_4BIT_CHARSET);
40-
final byte[][] destination = new byte[source.size()][];
40+
final byte[][] destination = new byte[4_000_000][];
4141
ManagedBulkCompressor.compressAndAddAll(COMPRESSOR, destination, source);
4242
final FourBitBinarySearch bs = new FourBitBinarySearch(destination, false);
4343
for (int i = 0, len = source.size(); i < len; i++)
@@ -47,7 +47,7 @@ void searchBigArrayTest() {
4747
@RepeatedTest(100)
4848
void searchBigStringsTest() {
4949
final List<String> source = generateRandomUniqueOrderedStringList(50_000, 4500, 5000, DEFAULT_4BIT_CHARSET);
50-
final byte[][] destination = new byte[source.size()][];
50+
final byte[][] destination = new byte[70_000][];
5151
ManagedBulkCompressor.compressAndAddAll(COMPRESSOR, destination, source);
5252
final FourBitBinarySearch bs = new FourBitBinarySearch(destination, false);
5353
for (int i = 0, len = source.size(); i < len; i++)

src/test/java/com/dannemann/stringcompressor/SixBitBinarySearchTest.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ void searchSmallStringsTest() {
2626
for (int length = 0; length <= 50; length++)
2727
for (int i = 0; i <= 30_000; i++) {
2828
final List<String> source = generateRandomUniqueOrderedStringList(500, length, length + 1, DEFAULT_6BIT_CHARSET);
29-
final byte[][] destination = new byte[source.size()][];
29+
final byte[][] destination = new byte[700][];
3030
ManagedBulkCompressor.compressAndAddAll(COMPRESSOR, destination, source);
3131
final SixBitBinarySearch bs = new SixBitBinarySearch(destination, false);
3232
for (int j = 0, len = source.size(); j < len; j++)
@@ -37,7 +37,7 @@ void searchSmallStringsTest() {
3737
@RepeatedTest(100)
3838
void searchBigArrayTest() {
3939
final List<String> source = generateRandomUniqueOrderedStringList(2_000_000, 0, 100, DEFAULT_6BIT_CHARSET);
40-
final byte[][] destination = new byte[source.size()][];
40+
final byte[][] destination = new byte[4_000_000][];
4141
ManagedBulkCompressor.compressAndAddAll(COMPRESSOR, destination, source);
4242
final SixBitBinarySearch bs = new SixBitBinarySearch(destination, false);
4343
for (int i = 0, len = source.size(); i < len; i++)
@@ -47,7 +47,7 @@ void searchBigArrayTest() {
4747
@RepeatedTest(100)
4848
void searchBigStringsTest() {
4949
final List<String> source = generateRandomUniqueOrderedStringList(50_000, 4500, 5000, DEFAULT_6BIT_CHARSET);
50-
final byte[][] destination = new byte[source.size()][];
50+
final byte[][] destination = new byte[70_000][];
5151
ManagedBulkCompressor.compressAndAddAll(COMPRESSOR, destination, source);
5252
final SixBitBinarySearch bs = new SixBitBinarySearch(destination, false);
5353
for (int i = 0, len = source.size(); i < len; i++)

0 commit comments

Comments
 (0)