From 19d5afd7234eca91f26584a9f406ed1e12101f26 Mon Sep 17 00:00:00 2001 From: tison Date: Fri, 19 Dec 2025 09:27:20 +0800 Subject: [PATCH] Accept TDigest bytes with buffered values Signed-off-by: tison --- .editorconfig | 28 +++++++ .../datasketches/tdigest/TDigestDouble.java | 8 +- .../tdigest/TDigestCrossLanguageTest.java | 76 +++++++++++-------- 3 files changed, 79 insertions(+), 33 deletions(-) create mode 100644 .editorconfig diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 000000000..4ebca26dd --- /dev/null +++ b/.editorconfig @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +root = true + +[*] +end_of_line = lf +indent_style = space +insert_final_newline = true +trim_trailing_whitespace = true + +[*.java] +indent_size = tab +tab_width = 2 diff --git a/src/main/java/org/apache/datasketches/tdigest/TDigestDouble.java b/src/main/java/org/apache/datasketches/tdigest/TDigestDouble.java index 6e43e0e92..0f059937d 100644 --- a/src/main/java/org/apache/datasketches/tdigest/TDigestDouble.java +++ b/src/main/java/org/apache/datasketches/tdigest/TDigestDouble.java @@ -426,7 +426,7 @@ public static TDigestDouble heapify(final MemorySegment seg, final boolean isFlo return new TDigestDouble(reverseMerge, k, value, value, new double[] {value}, new long[] {1}, 1, null); } final int numCentroids = posSeg.getInt(); - posSeg.getInt(); // unused + final int numBuffered = posSeg.getInt(); final double min; final double max; if (isFloat) { @@ -444,7 +444,11 @@ public static TDigestDouble heapify(final MemorySegment seg, final boolean isFlo weights[i] = isFloat ? posSeg.getInt() : posSeg.getLong(); totalWeight += weights[i]; } - return new TDigestDouble(reverseMerge, k, min, max, means, weights, totalWeight, null); + final double[] buffered = new double[numBuffered]; + for (int i = 0; i < numBuffered; i++) { + buffered[i] = isFloat ? posSeg.getFloat() : posSeg.getDouble(); + } + return new TDigestDouble(reverseMerge, k, min, max, means, weights, totalWeight, buffered); } // compatibility with the format of the reference implementation diff --git a/src/test/java/org/apache/datasketches/tdigest/TDigestCrossLanguageTest.java b/src/test/java/org/apache/datasketches/tdigest/TDigestCrossLanguageTest.java index 8475b5bd4..1ce3f7555 100644 --- a/src/test/java/org/apache/datasketches/tdigest/TDigestCrossLanguageTest.java +++ b/src/test/java/org/apache/datasketches/tdigest/TDigestCrossLanguageTest.java @@ -25,32 +25,38 @@ import static org.apache.datasketches.common.TestUtil.javaPath; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertTrue; - -import java.lang.foreign.MemorySegment; import java.io.IOException; +import java.lang.foreign.MemorySegment; import java.nio.file.Files; - import org.testng.annotations.Test; public class TDigestCrossLanguageTest { @Test(groups = {CHECK_CPP_FILES}) public void deserializeFromCppDouble() throws IOException { - final int[] nArr = {0, 1, 10, 100, 1000, 10_000, 100_000, 1_000_000}; - for (final int n: nArr) { - final byte[] bytes = Files.readAllBytes(cppPath.resolve("tdigest_double_n" + n + "_cpp.sk")); - final TDigestDouble td = TDigestDouble.heapify(MemorySegment.ofArray(bytes)); - assertTrue(n == 0 ? td.isEmpty() : !td.isEmpty()); - assertEquals(td.getTotalWeight(), n); - if (n > 0) { - assertEquals(td.getMinValue(), 1); - assertEquals(td.getMaxValue(), n); - assertEquals(td.getRank(0), 0); - assertEquals(td.getRank(n + 1), 1); - if (n == 1) { - assertEquals(td.getRank(n), 0.5); + final boolean[] with_buffer = {false, true}; + for (final boolean buffered : with_buffer) { + final int[] nArr = {0, 1, 10, 100, 1000, 10_000, 100_000, 1_000_000}; + for (final int n : nArr) { + final byte[] bytes; + if (buffered) { + bytes = Files.readAllBytes(cppPath.resolve("tdigest_double_buf_n" + n + "_cpp.sk")); } else { - assertEquals(td.getRank(n / 2), 0.5, 0.05); + bytes = Files.readAllBytes(cppPath.resolve("tdigest_double_n" + n + "_cpp.sk")); + } + final TDigestDouble td = TDigestDouble.heapify(MemorySegment.ofArray(bytes)); + assertTrue(n == 0 ? td.isEmpty() : !td.isEmpty()); + assertEquals(td.getTotalWeight(), n); + if (n > 0) { + assertEquals(td.getMinValue(), 1); + assertEquals(td.getMaxValue(), n); + assertEquals(td.getRank(0), 0); + assertEquals(td.getRank(n + 1), 1); + if (n == 1) { + assertEquals(td.getRank(n), 0.5); + } else { + assertEquals(td.getRank(n / 2), 0.5, 0.05); + } } } } @@ -58,21 +64,29 @@ public void deserializeFromCppDouble() throws IOException { @Test(groups = {CHECK_CPP_FILES}) public void deserializeFromCppFloat() throws IOException { + final boolean[] with_buffer = {false, true}; final int[] nArr = {0, 1, 10, 100, 1000, 10_000, 100_000, 1_000_000}; - for (final int n: nArr) { - final byte[] bytes = Files.readAllBytes(cppPath.resolve("tdigest_float_n" + n + "_cpp.sk")); - final TDigestDouble td = TDigestDouble.heapify(MemorySegment.ofArray(bytes), true); - assertTrue(n == 0 ? td.isEmpty() : !td.isEmpty()); - assertEquals(td.getTotalWeight(), n); - if (n > 0) { - assertEquals(td.getMinValue(), 1); - assertEquals(td.getMaxValue(), n); - assertEquals(td.getRank(0), 0); - assertEquals(td.getRank(n + 1), 1); - if (n == 1) { - assertEquals(td.getRank(n), 0.5); + for (final boolean buffered : with_buffer) { + for (final int n : nArr) { + final byte[] bytes; + if (buffered) { + bytes = Files.readAllBytes(cppPath.resolve("tdigest_float_buf_n" + n + "_cpp.sk")); } else { - assertEquals(td.getRank(n / 2), 0.5, 0.05); + bytes = Files.readAllBytes(cppPath.resolve("tdigest_float_n" + n + "_cpp.sk")); + } + final TDigestDouble td = TDigestDouble.heapify(MemorySegment.ofArray(bytes), true); + assertTrue(n == 0 ? td.isEmpty() : !td.isEmpty()); + assertEquals(td.getTotalWeight(), n); + if (n > 0) { + assertEquals(td.getMinValue(), 1); + assertEquals(td.getMaxValue(), n); + assertEquals(td.getRank(0), 0); + assertEquals(td.getRank(n + 1), 1); + if (n == 1) { + assertEquals(td.getRank(n), 0.5); + } else { + assertEquals(td.getRank(n / 2), 0.5, 0.05); + } } } } @@ -81,7 +95,7 @@ public void deserializeFromCppFloat() throws IOException { @Test(groups = {GENERATE_JAVA_FILES}) public void generateForCppDouble() throws IOException { final int[] nArr = {0, 1, 10, 100, 1000, 10_000, 100_000, 1_000_000}; - for (final int n: nArr) { + for (final int n : nArr) { final TDigestDouble td = new TDigestDouble((short) 100); for (int i = 1; i <= n; i++) { td.update(i);