From 0f592d69f841f60094833f036c1d5199b9061721 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Sun, 16 Nov 2025 12:39:34 -0800 Subject: [PATCH 01/12] Fixed Hll Union to HllUnion. Updated README, POM. --- README.md | 39 +++++-- .../datasketches/hll/BaseHllSketch.java | 24 ++-- .../datasketches/hll/DirectHll4Array.java | 2 +- .../datasketches/hll/DirectHll6Array.java | 2 +- .../datasketches/hll/DirectHll8Array.java | 2 +- .../apache/datasketches/hll/Hll4Array.java | 2 +- .../apache/datasketches/hll/Hll6Array.java | 2 +- .../apache/datasketches/hll/Hll8Array.java | 2 +- .../apache/datasketches/hll/HllSketch.java | 14 +-- .../hll/{Union.java => HllUnion.java} | 106 +++++++++--------- .../apache/datasketches/hll/PreambleUtil.java | 2 +- .../apache/datasketches/hll/package-info.java | 22 ++-- .../datasketches/hll/BaseHllSketchTest.java | 4 +- .../datasketches/hll/DirectUnionTest.java | 42 +++---- .../apache/datasketches/hll/HllArrayTest.java | 4 +- .../hll/HllSketchMergeOrderTest.java | 2 +- .../datasketches/hll/HllSketchTest.java | 8 +- .../datasketches/hll/IsomorphicTest.java | 43 ++++--- .../datasketches/hll/UnionCaseTest.java | 44 ++++---- .../apache/datasketches/hll/UnionTest.java | 56 +++++---- 20 files changed, 212 insertions(+), 210 deletions(-) rename src/main/java/org/apache/datasketches/hll/{Union.java => HllUnion.java} (88%) diff --git a/README.md b/README.md index 0358c6865..3a7cec913 100644 --- a/README.md +++ b/README.md @@ -27,34 +27,53 @@ This is the core Java component of the DataSketches library. It contains all of This component is also a dependency of other components of the library that create adaptors for target systems, such as the [Apache Pig adaptor](https://github.com/apache/datasketches-pig), the [Apache Hive adaptor](https://github.com/apache/datasketches-hive), and others. 
-Note that we have a parallel core component for C++, Python and GO implementations of many of the same sketch algorithms, -[datasketches-cpp](https://github.com/apache/datasketches-cpp), [datasketches-python](https://github.com/apache/datasketches-python), and -[datasketches-go](https://github.com/apache/datasketches-go). +Note that we have parallel core components for C++, Python and GO implementations of many of the same sketch algorithms: + +- [datasketches-cpp](https://github.com/apache/datasketches-cpp), +- [datasketches-python](https://github.com/apache/datasketches-python), +- [datasketches-go](https://github.com/apache/datasketches-go). Please visit the main [DataSketches website](https://datasketches.apache.org) for more information. If you are interested in making contributions to this site please see our [Community](https://datasketches.apache.org/docs/Community/) page for how to contact us. --- +## Major Changes with this Release +This release is a major release where we took the opportunity to do some significant refactoring that will constitute incompatible changes from previous releases. Any incompatibility with prior releases is always an inconvenience to users who wish to just upgrade to the latest release and run. However, some of the code in this library was written in 2013, and the Java language has evolved enormously since then. We chose to use this major release as the opportunity to modernize some of the code to achieve the following goals: + +### Eliminate the dependency on the DataSketches-Memory component. +The DataSketches-Memory component was originally developed in 2014 to address the need for fast access to off-heap memory data structures and used Unsafe and other JVM internals as there were no satisfactory Java language features to do this at the time. + +The FFM capabilities, introduced into the language in Java 22, are now part of the Java 25 LTS release, which we support. 
Since the capabilities of FFM are a superset of the original DataSketches-Memory component, it made sense to rewrite the code to eliminate the dependency on DataSketches-Memory and use FFM instead. This impacted code across the entire library. + +This provided several advantages to the code base. By removing this dependency on DataSketches-Memory, there are now no runtime dependencies! This should make integrating this library into other Java systems much simpler. Since FFM is tightly integrated into the Java language, it has improved performance, especially with bulk operations. + +- As an added note: There are numerous other improvements to the Java language that we could perhaps take advantage of in a rewrite, e.g., Records, text blocks, switch expressions, sealed, var, modules, patterns, etc. However, faced with the risk of accidentally creating bugs due to too many changes at one time, we focused on FFM, which actually improves performance as opposed to just syntactic sugar. + +### Align public sketch class names so that the sketch family name is part of the class name. +For example, the Theta sketch was the first sketch written for the library and its base class was called *Sketch*. Obviously, because it was the only sketch! The Tuple sketch evolved soon after and its base class was also called *Sketch*. Oops, bad idea. If a user wanted to use both the Theta and Tuple sketches in the same class one of them had to be fully qualified every time it was referenced. Ugh! + +Unfortunately, this habit propagated to some of the other early sketches, where we ended up with two different sketches with the class name *ItemsSketch*, for example. For the more recent additions to the library we started including the sketch family name in all the relevant sketch-like public classes of a sketch family. + +In this release we have refactored these older sketches with new names that now include the sketch family name. 
Yes, this is an incompatible change for user code moving from earlier releases, but this can usually be fixed with search-and-replace tools. This release is not perfect, but hopefully more consistent across all the different sketch families. + ## Build & Runtime Dependencies ### Installation Directory Path **NOTE:** This component accesses resource files for testing. As a result, the directory elements of the full absolute path of the target installation directory must qualify as Java identifiers. In other words, the directory elements must not have any space characters (or non-Java identifier characters) in any of the path elements. This is required by the Oracle Java Specification in order to ensure location-independent access to resources: [See Oracle Location-Independent Access to Resources](https://docs.oracle.com/javase/8/docs/technotes/guides/lang/resources.html) -### OpenJDK Version 24 -An OpenJDK-compatible build of Java 24, provided by one of the Open-Source JVM providers, such as Azul Systems, Red Hat, SAP, Eclipse Temurin, etc, is required. -All of the testing of this release has been performed with an Eclipse Temurin build. - -This release uses the new Java Foreign Function & Memory (FFM) features that were made part of the Java Language in in Java 22. +### OpenJDK Version 25 +At minimum, an OpenJDK-compatible build of Java 25, provided by one of the Open-Source JVM providers, such as *Azul Systems*, *Red Hat*, *SAP*, *Eclipse Temurin*, etc, is required. +All of the testing of this release has been performed with the *Eclipse Temurin* build. ## Compilation and Test using Maven This DataSketches component is structured as a Maven project and Maven is the recommended tool for compile and test. #### A Toolchain is required -* You must have a JDK type toolchain defined in location *~/.m2/toolchains.xml* that specifies where to find a locally installed OpenJDK-compatible version 24. 
-* Your default \$JAVA\_HOME compiler must be OpenJDK compatible, specified in the toolchain, and may be a version greater than 24. Note that if your \$JAVA\_HOME is set to a Java version greater than 24, Maven will automatically use the Java 24 version specified in the toolchain instead. The included pom.xml specifies the necessary JVM flags, so no further action should be required. +* You must have a JDK type toolchain defined in location *~/.m2/toolchains.xml* that specifies where to find a locally installed OpenJDK-compatible version 25. +* Your default \$JAVA\_HOME compiler must be OpenJDK compatible, specified in the toolchain, and may be a version greater than 25. Note that if your \$JAVA\_HOME is set to a Java version greater than 25, Maven will automatically use the Java 25 version specified in the toolchain instead. The included pom.xml specifies the necessary JVM flags, if required, so no further action is needed. * Note that the paths specified in the toolchain must be fully qualified direct paths to the OpenJDK version locations. Using environment variables will not work. #### To run normal unit tests: diff --git a/src/main/java/org/apache/datasketches/hll/BaseHllSketch.java b/src/main/java/org/apache/datasketches/hll/BaseHllSketch.java index 082318ee2..99107220e 100644 --- a/src/main/java/org/apache/datasketches/hll/BaseHllSketch.java +++ b/src/main/java/org/apache/datasketches/hll/BaseHllSketch.java @@ -35,7 +35,7 @@ /** * Although this class is package-private, it provides a single place to define and document - * the common public API for both HllSketch and Union. + * the common public API for both HllSketch and HllUnion. * @author Lee Rhodes * @author Kevin Lang */ @@ -115,7 +115,7 @@ public static final int getSerializationVersion(final MemorySegment seg) { * Gets the current (approximate) Relative Error (RE) asymptotic values given several * parameters. This is used primarily for testing. 
* @param upperBound return the RE for the Upper Bound, otherwise for the Lower Bound. - * @param oooFlag set true if the sketch is the result of a non qualifying union operation. + * @param oooFlag set true if the sketch is the result of a non qualifying HllUnion operation. * @param lgConfigK the configured value for the sketch. * @param numStdDev the given number of Standard Deviations. This must be an integer between * 1 and 3, inclusive. @@ -206,8 +206,8 @@ public boolean isEstimationMode() { * inquire of the sketch if it has, in fact, moved itself. * * @param seg the given MemorySegment - * @return true if the given MemorySegment refers to the same underlying resource as this sketch or - * union. + * @return true if the given MemorySegment refers to the same underlying resource as this HllSketch or + * HllUnion. */ @Override public abstract boolean isSameResource(MemorySegment seg); @@ -219,17 +219,17 @@ public boolean isEstimationMode() { /** * Serializes this sketch as a byte array in compact form. The compact form is smaller in size - * than the updatable form and read-only. It can be used in union operations as follows: + * than the updatable form and read-only. It can be used in HllUnion operations as follows: *
{@code
-   *     Union union; HllSketch sk, sk2;
+   *     HllUnion union; HllSketch sk, sk2;
    *     int lgK = 12;
    *     sk = new HllSketch(lgK, TgtHllType.HLL_4); //can be 4, 6, or 8
    *     for (int i = 0; i < (2 << lgK); i++) { sk.update(i); }
    *     byte[] arr = HllSketch.toCompactByteArray();
    *     //...
-   *     union = Union.heapify(arr); //initializes the union using data from the array.
+   *     union = HllUnion.heapify(arr); //initializes the HllUnion using data from the array.
    *     //OR, if used in an off-heap environment:
-   *     union = Union.heapify(MemorySegment.ofArray(arr)); //same as above, except from MemorySegment object.
+   *     union = HllUnion.heapify(MemorySegment.ofArray(arr)); //same as above, except from MemorySegment object.
    *
    *     //To recover an updatable heap sketch:
    *     sk2 = HllSketch.heapify(arr);
@@ -250,17 +250,17 @@ public boolean isEstimationMode() {
   /**
    * Serializes this sketch as a byte array in an updatable form. The updatable form is larger than
    * the compact form. The use of this form is primarily in environments that support updating
-   * sketches in off-heap MemorySegment. If the sketch is constructed using HLL_8, sketch updating and
-   * union updating operations can actually occur in MemorySegment, which can be off-heap:
+   * sketches in off-heap MemorySegment. If the sketch is constructed using HLL_8, HllSketch updating and
+   * HllUnion updating operations can actually occur in MemorySegment, which can be off-heap:
    * 
{@code
-   *     Union union; HllSketch sk;
+   *     HllUnion union; HllSketch sk;
    *     int lgK = 12;
    *     sk = new HllSketch(lgK, TgtHllType.HLL_8) //must be 8
    *     for (int i = 0; i < (2 << lgK); i++) { sk.update(i); }
    *     byte[] arr = sk.toUpdatableByteArray();
    *     MemorySegment wseg = MemorySegment.wrap(arr);
    *     //...
-   *     union = Union.writableWrap(wseg); //no deserialization!
+   *     union = HllUnion.writableWrap(wseg); //no deserialization!
    * }
* @return this sketch as an updatable byte array. */ diff --git a/src/main/java/org/apache/datasketches/hll/DirectHll4Array.java b/src/main/java/org/apache/datasketches/hll/DirectHll4Array.java index 99e9450bb..03eefdc01 100644 --- a/src/main/java/org/apache/datasketches/hll/DirectHll4Array.java +++ b/src/main/java/org/apache/datasketches/hll/DirectHll4Array.java @@ -136,7 +136,7 @@ void putNibble(final int slotNo, final int nibValue) { } @Override - //Would be used by Union, but not used because the gadget is always HLL8 type + //Would be used by HllUnion, but not used because the gadget is always HLL8 type void updateSlotNoKxQ(final int slotNo, final int newValue) { throw new SketchesStateException("Improper access."); } diff --git a/src/main/java/org/apache/datasketches/hll/DirectHll6Array.java b/src/main/java/org/apache/datasketches/hll/DirectHll6Array.java index c9a8eb7c7..4d35be674 100644 --- a/src/main/java/org/apache/datasketches/hll/DirectHll6Array.java +++ b/src/main/java/org/apache/datasketches/hll/DirectHll6Array.java @@ -83,7 +83,7 @@ void putNibble(final int slotNo, final int nibValue) { } @Override - //Would be used by Union, but not used because the gadget is always HLL8 type + //Would be used by HllUnion, but not used because the gadget is always HLL8 type void updateSlotNoKxQ(final int slotNo, final int newValue) { throw new SketchesStateException("Improper access."); } diff --git a/src/main/java/org/apache/datasketches/hll/DirectHll8Array.java b/src/main/java/org/apache/datasketches/hll/DirectHll8Array.java index 7267d2f57..34714fb75 100644 --- a/src/main/java/org/apache/datasketches/hll/DirectHll8Array.java +++ b/src/main/java/org/apache/datasketches/hll/DirectHll8Array.java @@ -86,7 +86,7 @@ void putNibble(final int slotNo, final int nibValue) { } @Override - //Used by Union when source is not HLL8 + //Used by HllUnion when source is not HLL8 void updateSlotNoKxQ(final int slotNo, final int newValue) { final int oldValue = getSlotValue(slotNo); 
if (newValue > oldValue) { diff --git a/src/main/java/org/apache/datasketches/hll/Hll4Array.java b/src/main/java/org/apache/datasketches/hll/Hll4Array.java index 759174bea..f6295fa78 100644 --- a/src/main/java/org/apache/datasketches/hll/Hll4Array.java +++ b/src/main/java/org/apache/datasketches/hll/Hll4Array.java @@ -136,7 +136,7 @@ void putNibble(final int slotNo, final int nibValue) { } @Override - //Would be used by Union, but not used because the gadget is always HLL8 type + //Would be used by HllUnion, but not used because the gadget is always HLL8 type void updateSlotNoKxQ(final int slotNo, final int newValue) { throw new SketchesStateException("Improper access."); } diff --git a/src/main/java/org/apache/datasketches/hll/Hll6Array.java b/src/main/java/org/apache/datasketches/hll/Hll6Array.java index 70a41090e..a0ddcbf59 100644 --- a/src/main/java/org/apache/datasketches/hll/Hll6Array.java +++ b/src/main/java/org/apache/datasketches/hll/Hll6Array.java @@ -93,7 +93,7 @@ void putNibble(final int slotNo, final int nibValue) { } @Override - //Would be used by Union, but not used because the gadget is always HLL8 type + //Would be used by HllUnion, but not used because the gadget is always HLL8 type void updateSlotNoKxQ(final int slotNo, final int newValue) { throw new SketchesStateException("Improper access."); } diff --git a/src/main/java/org/apache/datasketches/hll/Hll8Array.java b/src/main/java/org/apache/datasketches/hll/Hll8Array.java index 97ebac9dc..423cebfee 100644 --- a/src/main/java/org/apache/datasketches/hll/Hll8Array.java +++ b/src/main/java/org/apache/datasketches/hll/Hll8Array.java @@ -92,7 +92,7 @@ void putNibble(final int slotNo, final int nibValue) { } @Override - //Used by Union when source is not HLL8 + //Used by HllUnion when source is not HLL8 void updateSlotNoKxQ(final int slotNo, final int newValue) { final int oldValue = getSlotValue(slotNo); hllByteArr[slotNo] = (byte) Math.max(newValue, oldValue); diff --git 
a/src/main/java/org/apache/datasketches/hll/HllSketch.java b/src/main/java/org/apache/datasketches/hll/HllSketch.java index 35d782a27..0ff0c1e97 100644 --- a/src/main/java/org/apache/datasketches/hll/HllSketch.java +++ b/src/main/java/org/apache/datasketches/hll/HllSketch.java @@ -203,7 +203,7 @@ public static final HllSketch heapify(final MemorySegment srcSeg) { return heapify(srcSeg, true); } - //used by union and above + //used by HllUnion and above static final HllSketch heapify(final MemorySegment srcSeg, final boolean checkRebuild) { Objects.requireNonNull(srcSeg, "Source MemorySegment must not be null"); checkBounds(0, 8, srcSeg.byteSize()); //need min 8 bytes @@ -218,7 +218,7 @@ static final HllSketch heapify(final MemorySegment srcSeg, final boolean checkRe } else { //Hll_8 heapSketch = new HllSketch(Hll8Array.heapify(srcSeg)); if (checkRebuild) { - Union.checkRebuildCurMinNumKxQ(heapSketch); + HllUnion.checkRebuildCurMinNumKxQ(heapSketch); } } } else if (curMode == CurMode.LIST) { @@ -245,7 +245,7 @@ public static final HllSketch writableWrap(final MemorySegment srcWseg) { return writableWrap(srcWseg, true); } - //used by union and above + //used by HllUnion and above static final HllSketch writableWrap( final MemorySegment srcWseg, final boolean checkRebuild) { Objects.requireNonNull(srcWseg, "Source MemorySegment must not be null"); checkBounds(0, 8, srcWseg.byteSize()); //need min 8 bytes @@ -268,8 +268,8 @@ static final HllSketch writableWrap( final MemorySegment srcWseg, final boolean directSketch = new HllSketch(new DirectHll6Array(lgConfigK, srcWseg)); } else { //Hll_8 directSketch = new HllSketch(new DirectHll8Array(lgConfigK, srcWseg)); - if (checkRebuild) { //union only uses HLL_8, we allow non-finalized from a union call. - Union.checkRebuildCurMinNumKxQ(directSketch); + if (checkRebuild) { //HllUnion only uses HLL_8, we allow non-finalized from a HllUnion call. 
+ HllUnion.checkRebuildCurMinNumKxQ(directSketch); } } } else if (curMode == CurMode.LIST) { @@ -305,8 +305,8 @@ public static final HllSketch wrap(final MemorySegment srcSeg) { //read only directSketch = new HllSketch(new DirectHll6Array(lgConfigK, srcSeg, true)); } else { //Hll_8 directSketch = new HllSketch(new DirectHll8Array(lgConfigK, srcSeg, true)); - //rebuild if srcSeg came from a union and was not finalized, rather than throw exception. - Union.checkRebuildCurMinNumKxQ(directSketch); + //rebuild if srcSeg came from a HllUnion and was not finalized, rather than throw exception. + HllUnion.checkRebuildCurMinNumKxQ(directSketch); } } else if (curMode == CurMode.LIST) { directSketch = diff --git a/src/main/java/org/apache/datasketches/hll/Union.java b/src/main/java/org/apache/datasketches/hll/HllUnion.java similarity index 88% rename from src/main/java/org/apache/datasketches/hll/Union.java rename to src/main/java/org/apache/datasketches/hll/HllUnion.java index ead23b9ff..dca2cd7da 100644 --- a/src/main/java/org/apache/datasketches/hll/Union.java +++ b/src/main/java/org/apache/datasketches/hll/HllUnion.java @@ -34,64 +34,64 @@ import org.apache.datasketches.common.SketchesArgumentException; /** - * This performs union operations for all HllSketches. This union operator can be configured to be - * on or off heap. The source sketch given to this union using the {@link #update(HllSketch)} can + * This performs union operations for all HllSketches. This HllUnion operator can be configured to be + * on or off heap. The source sketch given to this HllUnion using the {@link #update(HllSketch)} can * be configured with any precision value lgConfigK (from 4 to 21), any TgtHllType * (HLL_4, HLL_6, HLL_8), and either on or off-heap; and it can be in either of the sparse modes * (LIST or SET), or the dense mode (HLL). * - *

Although the API for this union operator parallels many of the methods of the - * HllSketch, the behavior of the union operator has some fundamental differences.

+ *

Although the API for this HllUnion operator parallels many of the methods of the + * HllSketch, the behavior of the HllUnion operator has some fundamental differences.

* - *

First, this union operator is configured with a lgMaxK instead of the normal - * lgConfigK. Generally, this union operator will inherit the lowest lgConfigK + *

First, this HllUnion operator is configured with a lgMaxK instead of the normal + * lgConfigK. Generally, this HllUnion operator will inherit the lowest lgConfigK * less than lgMaxK that it has seen. However, the lgConfigK of incoming sketches that * are still in sparse are ignored. The lgMaxK provides the user the ability to specify the - * largest maximum size for the union operation. + * largest maximum size for the HllUnion operation. * - *

Second, the user cannot specify the {@link TgtHllType} as an input parameter to the union. + *

Second, the user cannot specify the {@link TgtHllType} as an input parameter to the HllUnion. * Instead, it is specified for the sketch returned with {@link #getResult(TgtHllType)}. * *

The following graph illustrates the HLL Merge speed.

* - *

HLL LgK12 Union Speed

+ *

HLL_UnionTime4_6_8_Java_CPP.png

* This graph illustrates the relative merging speed of the HLL 4,6,8 Java HLL sketches compared to * the DataSketches C++ implementations of the same sketches. With this particular test (merging 32 relative large * sketches together), the Java HLL 8 is the fastest and the Java HLL 4 the slowest, with a mixed cluster in the middle. - * Union / Merging speed is somewhat difficult to measure as the performance is very dependent on the mix of sketch + * HllUnion / Merging speed is somewhat difficult to measure as the performance is very dependent on the mix of sketch * sizes (and types) you are merging. So your mileage will vary! * - *

For a complete example of using the Union operator - * see Union Example.

+ *

For a complete example of using the HllUnion operator + * see HllUnion Example.

* * @author Lee Rhodes * @author Kevin Lang */ -public class Union extends BaseHllSketch { +public class HllUnion extends BaseHllSketch { final int lgMaxK; private final HllSketch gadget; /** - * Construct this Union operator with the default maximum log-base-2 of K. + * Construct this HllUnion operator with the default maximum log-base-2 of K. */ - public Union() { + public HllUnion() { lgMaxK = HllSketch.DEFAULT_LG_K; gadget = new HllSketch(lgMaxK, HLL_8); } /** - * Construct this Union operator with a given maximum log-base-2 of K. + * Construct this HllUnion operator with a given maximum log-base-2 of K. * @param lgMaxK the desired maximum log-base-2 of K. This value must be * between 4 and 21 inclusively. */ - public Union(final int lgMaxK) { + public HllUnion(final int lgMaxK) { this.lgMaxK = HllUtil.checkLgK(lgMaxK); gadget = new HllSketch(lgMaxK, HLL_8); } /** - * Construct this Union operator with a given maximum log-base-2 of K and the given - * MemorySegment as the destination for this Union. This MemorySegment is usually configured + * Construct this HllUnion operator with a given maximum log-base-2 of K and the given + * MemorySegment as the destination for this HllUnion. This MemorySegment is usually configured * for off-heap MemorySegment. What remains on the java heap is a thin wrapper object that reads and * writes to the given MemorySegment. * @@ -101,35 +101,35 @@ public Union(final int lgMaxK) { * between 4 and 21 inclusively. * @param dstWseg the destination writable MemorySegment for the sketch. 
*/ - public Union(final int lgMaxK, final MemorySegment dstWseg) { + public HllUnion(final int lgMaxK, final MemorySegment dstWseg) { this.lgMaxK = HllUtil.checkLgK(lgMaxK); gadget = new HllSketch(lgMaxK, HLL_8, dstWseg); } //used only by writableWrap - private Union(final HllSketch sketch) { + private HllUnion(final HllSketch sketch) { lgMaxK = sketch.getLgConfigK(); gadget = sketch; } /** - * Construct a union operator populated with the given byte array image of an HllSketch. + * Construct a HllUnion operator populated with the given byte array image of an HllSketch. * @param byteArray the given byte array - * @return a union operator populated with the given byte array image of an HllSketch. + * @return a HllUnion operator populated with the given byte array image of an HllSketch. */ - public static final Union heapify(final byte[] byteArray) { + public static final HllUnion heapify(final byte[] byteArray) { return heapify(MemorySegment.ofArray(byteArray)); } /** - * Construct a union operator populated with the given MemorySegment image of an HllSketch. + * Construct a HllUnion operator populated with the given MemorySegment image of an HllSketch. * @param seg the given MemorySegment - * @return a union operator populated with the given MemorySegment image of an HllSketch. + * @return a HllUnion operator populated with the given MemorySegment image of an HllSketch. */ - public static final Union heapify(final MemorySegment seg) { + public static final HllUnion heapify(final MemorySegment seg) { final int lgK = HllUtil.checkLgK(seg.get(JAVA_BYTE, PreambleUtil.LG_K_BYTE)); final HllSketch sk = HllSketch.heapify(seg, false); //allows non-finalized image - final Union union = new Union(lgK); + final HllUnion union = new HllUnion(lgK); union.update(sk); return union; } @@ -143,16 +143,16 @@ public static final Union heapify(final MemorySegment seg) { *

The given dstSeg is checked for the required capacity as determined by * {@link HllSketch#getMaxUpdatableSerializationBytes(int, TgtHllType)}, and for the correct type. * @param srcWseg an writable image of a valid sketch with data. - * @return a Union operator where the sketch data is in the given dstSeg. + * @return a HllUnion operator where the sketch data is in the given dstSeg. */ - public static final Union writableWrap(final MemorySegment srcWseg) { + public static final HllUnion writableWrap(final MemorySegment srcWseg) { final TgtHllType tgtHllType = extractTgtHllType(srcWseg); if (tgtHllType != TgtHllType.HLL_8) { throw new SketchesArgumentException( - "Union can only wrap writable HLL_8 sketches that were the Gadget of a Union."); + "HllUnion can only wrap writable HLL_8 sketches that were the Gadget of a HllUnion."); } //allows writableWrap of non-finalized image - return new Union(HllSketch.writableWrap(srcWseg, false)); + return new HllUnion(HllSketch.writableWrap(srcWseg, false)); } @Override @@ -178,7 +178,7 @@ public double getEstimate() { } /** - * Gets the effective lgConfigK for the union operator, which may be less than + * Gets the effective lgConfigK for the HllUnion operator, which may be less than * lgMaxK. * @return the lgConfigK. */ @@ -194,28 +194,28 @@ public double getLowerBound(final int numStdDev) { } /** - * Returns the maximum size in bytes that this union operator can grow to given a lgK. + * Returns the maximum size in bytes that this HllUnion operator can grow to given a lgK. * - * @param lgK The maximum Log2 of K for this union operator. This value must be + * @param lgK The maximum Log2 of K for this HllUnion operator. This value must be * between 4 and 21 inclusively. - * @return the maximum size in bytes that this union operator can grow to. + * @return the maximum size in bytes that this HllUnion operator can grow to. 
*/ public static int getMaxSerializationBytes(final int lgK) { return HllSketch.getMaxUpdatableSerializationBytes(lgK, TgtHllType.HLL_8); } /** - * Return the result of this union operator as an HLL_4 sketch. - * @return the result of this union operator as an HLL_4 sketch. + * Return the result of this HllUnion operator as an HLL_4 sketch. + * @return the result of this HllUnion operator as an HLL_4 sketch. */ public HllSketch getResult() { return getResult(HllSketch.DEFAULT_HLL_TYPE); } /** - * Return the result of this union operator with the specified {@link TgtHllType} + * Return the result of this HllUnion operator with the specified {@link TgtHllType} * @param tgtHllType the TgtHllType enum - * @return the result of this union operator with the specified TgtHllType + * @return the result of this HllUnion operator with the specified TgtHllType */ public HllSketch getResult(final TgtHllType tgtHllType) { checkRebuildCurMinNumKxQ(gadget); @@ -286,11 +286,11 @@ public void reset() { } /** - * Gets the serialization of this union operator as a byte array in compact form, which is + * Gets the serialization of this HllUnion operator as a byte array in compact form, which is * designed to be heapified only. It is not directly updatable. - * For the Union operator, this is the serialization of the internal state of - * the union operator as a sketch. - * @return the serialization of this union operator as a byte array. + * For the HllUnion operator, this is the serialization of the internal state of + * the HllUnion operator as a sketch. + * @return the serialization of this HllUnion operator as a byte array. */ @Override public byte[] toCompactByteArray() { @@ -313,7 +313,7 @@ public String toString(final boolean summary, final boolean hllDetail, } /** - * Update this union operator with the given sketch. + * Update this HllUnion operator with the given sketch. * @param sketch the given sketch. 
*/ public void update(final HllSketch sketch) { @@ -326,28 +326,28 @@ void couponUpdate(final int coupon) { gadget.hllSketchImpl = gadget.hllSketchImpl.couponUpdate(coupon); } - // Union operator logic + // HllUnion operator logic /** * Union the given source and destination sketches. This static method examines the state of * the current internal gadget and the incoming sketch and determines the optimum way to * perform the union. This may involve swapping the merge order, downsampling, transforming, - * and / or copying one of the arguments and may completely replace the internals of the union. + * and / or copying one of the arguments and may completely replace the internals of the HllUnion. * - *

If the union gadget is empty, the source sketch is effectively copied to the union gadget + *

If the HllUnion gadget is empty, the source sketch is effectively copied to the HllUnion gadget * after any required transformations. * - *

The direction of the merge is reversed if the union gadget is in LIST or SET mode, and the + *

The direction of the merge is reversed if the HllUnion gadget is in LIST or SET mode, and the * source sketch is in HLL mode. This is done to maintain maximum accuracy of the union process. * *

The source sketch is downsampled if the source LgK is larger than maxLgK and in HLL mode. * - *

The union gadget is downsampled if both source and union gadget are in HLL mode - * and the source LgK less than the union gadget LgK. + *

The HllUnion gadget is downsampled if both source and HllUnion gadget are in HLL mode + * and the source LgK less than the HllUnion gadget LgK. * * @param source the given incoming sketch, which cannot be modified. * @param gadget the given gadget sketch, which has a target of HLL_8 and holds the result. - * @param lgMaxK the maximum value of log2 K for this union. + * @param lgMaxK the maximum value of log2 K for this union operation. * @return the union of the two sketches in the form of the internal HllSketchImpl, which is * always in HLL_8 form. */ @@ -765,7 +765,7 @@ private static final void mergeHlltoHLLmode(final HllSketch src, final HllSketch tgt.hllSketchImpl.putRebuildCurMinNumKxQFlag(true); } - //Used by union operator. Always copies or downsamples to Heap HLL_8. + //Used by HllUnion operator. Always copies or downsamples to Heap HLL_8. //Caller must ultimately manage oooFlag, as caller has more context. /** * Copies or downsamples the given candidate HLLmode sketch to tgtLgK, HLL_8, on the heap. 
diff --git a/src/main/java/org/apache/datasketches/hll/PreambleUtil.java b/src/main/java/org/apache/datasketches/hll/PreambleUtil.java index a43a6f121..b86b65fc6 100644 --- a/src/main/java/org/apache/datasketches/hll/PreambleUtil.java +++ b/src/main/java/org/apache/datasketches/hll/PreambleUtil.java @@ -137,7 +137,7 @@ private PreambleUtil() {} static final int EMPTY_FLAG_MASK = 4; static final int COMPACT_FLAG_MASK = 8; static final int OUT_OF_ORDER_FLAG_MASK = 16; - static final int REBUILD_CURMIN_NUM_KXQ_MASK = 32; //used only by Union + static final int REBUILD_CURMIN_NUM_KXQ_MASK = 32; //used only by HllUnion //Mode byte masks static final int CUR_MODE_MASK = 3; diff --git a/src/main/java/org/apache/datasketches/hll/package-info.java b/src/main/java/org/apache/datasketches/hll/package-info.java index 114d4da96..ad2f22fa9 100644 --- a/src/main/java/org/apache/datasketches/hll/package-info.java +++ b/src/main/java/org/apache/datasketches/hll/package-info.java @@ -18,18 +18,18 @@ */ /** - *

The DataSketches™ HLL sketch family package

- * {@link org.apache.datasketches.hll.HllSketch HllSketch} and {@link org.apache.datasketches.hll.Union Union} + *

The DataSketches™ HllSketch family package

+ * {@link org.apache.datasketches.hll.HllSketch HllSketch} and {@link org.apache.datasketches.hll.HllUnion HllUnion} * are the public facing classes of this high performance implementation of Phillipe Flajolet's * HyperLogLog algorithm[1] but with significantly improved error behavior and important features that can be * essential for large production systems that must handle massive data. * - *

Key Features of the DataSketches™ HLL Sketch and its companion Union

+ *

Key Features of the DataSketches™ HllSketch and its companion HllUnion

* *

Advanced Estimation Algorithms for Optimum Accuracy

* *

Zero error at low cardinalities

- * The HLL sketch leverages highly compact arrays and hash tables to keep exact counts until the transition to + * The HllSketch leverages highly compact arrays and hash tables to keep exact counts until the transition to * dense mode is required for space reasons. The result is perfect accuracy for very low cardinalities. * *

Accuracy for very small streams can be important because Big Data is often fragmented into millions of smaller @@ -55,7 +55,7 @@ * sketch once the statistical randomness is removed through multiple trials. This can be observed in the * following graph.

* - *

HLL Accuracy[6]

+ *

HLL_HIP_K12T20U20.png[6]

* *

The above graph has 7 curves. At y = 0, is the median line that hugs the x-axis so closely that it can't be seen. * The two curves, just above and just below the x-axis, correspond to +/- 1 standard deviation (SD) of error. @@ -68,8 +68,8 @@ * Below the cardinality of about 512 there is no error at all. This is the point where this particular * sketch transitions from sparse to dense (or estimation) mode.

* - *

Three HLL Types

- * This HLL implementation offers three different types of HLL sketch, each with different + *

Three HllSketch Types

+ * This HLL implementation offers three different types of HllSketch, each with different * trade-offs with accuracy, space and performance. These types are selected with the * {@link org.apache.datasketches.hll.TgtHllType TgtHllType} parameter. * @@ -96,7 +96,7 @@ * terms of update time, but has the smallest storage footprint of about K/2 * 1.03 bytes. * *

Off-Heap Operation

- * This HLL sketch also offers the capability of operating off-heap. Given a MemorySegment[5] object + * This HllSketch also offers the capability of operating off-heap. Given a MemorySegment[5] object * created by the user, the sketch will perform all of its updates and internal phase transitions * in that object, which can actually reside either on-heap or off-heap based on how it was * configured. In large systems that must update and union many millions of sketches, having the @@ -104,8 +104,8 @@ * off-heap and back, and reduces the need for garbage collection. * *

Merging sketches with different configured lgConfigK

- * This enables a user to union a HLL sketch that was configured with, say, lgConfigK = 12 - * with another loaded HLL sketch that was configured with, say, lgConfigK = 14. + * This enables a user to union an HllSketch that was configured with, say, lgConfigK = 12 + * with another loaded HllSketch that was configured with, say, lgConfigK = 14. * *

Why is this important? Suppose you have been building a history of sketches of your customer's * data that go back a full year (or 5 or 10!) that were all configured with lgConfigK = 12. Because sketches @@ -125,7 +125,7 @@ * *

Multi-language, multi-platform.

* The binary structures for our sketch serializations are language and platform independent. - * This means it is possible to generate an HLL sketch on a C++ Windows platform and it can be used on a + * This means it is possible to generate an HllSketch on a C++ Windows platform and it can be used on a * Java or Python Unix platform. * *

[1] Philippe Flajolet, et al, diff --git a/src/test/java/org/apache/datasketches/hll/BaseHllSketchTest.java b/src/test/java/org/apache/datasketches/hll/BaseHllSketchTest.java index 4afb282b6..0ba429b0e 100644 --- a/src/test/java/org/apache/datasketches/hll/BaseHllSketchTest.java +++ b/src/test/java/org/apache/datasketches/hll/BaseHllSketchTest.java @@ -26,7 +26,7 @@ import org.apache.datasketches.hll.HllSketch; import org.apache.datasketches.hll.PreambleUtil; import org.apache.datasketches.hll.TgtHllType; -import org.apache.datasketches.hll.Union; +import org.apache.datasketches.hll.HllUnion; import org.testng.annotations.Test; import java.lang.foreign.MemorySegment; @@ -71,7 +71,7 @@ public void checkUpdateTypes() { sk.update(s); sk.update("1234"); - final Union u = new Union(10); + final HllUnion u = new HllUnion(10); final byte[] byteArr1 = null; u.update(byteArr1); u.update(new byte[] {}); diff --git a/src/test/java/org/apache/datasketches/hll/DirectUnionTest.java b/src/test/java/org/apache/datasketches/hll/DirectUnionTest.java index 9dca85d98..114530a86 100644 --- a/src/test/java/org/apache/datasketches/hll/DirectUnionTest.java +++ b/src/test/java/org/apache/datasketches/hll/DirectUnionTest.java @@ -36,7 +36,7 @@ import org.apache.datasketches.hll.HllUtil; import org.apache.datasketches.hll.RelativeErrorTables; import org.apache.datasketches.hll.TgtHllType; -import org.apache.datasketches.hll.Union; +import org.apache.datasketches.hll.HllUnion; /** * @author Lee Rhodes @@ -198,7 +198,7 @@ private static void basicUnion(final int n1, final int n2, final int lgK1, final final String h1SketchStr = ("H1 SKETCH: \n" + h1.toString()); final String h2SketchStr = ("H2 SKETCH: \n" + h2.toString()); - final Union union = newUnion(lgMaxK); + final HllUnion union = newUnion(lgMaxK); union.update(h1); final String uH1SketchStr = ("Union after H1: \n" + union.getResult(resultType).toString()); @@ -260,7 +260,7 @@ public void checkToFromUnion1() { } private static void 
toFrom1(final int lgK, final TgtHllType tgtHllType, final int n) { - final Union srcU = newUnion(lgK); + final HllUnion srcU = newUnion(lgK); final HllSketch srcSk = new HllSketch(lgK, tgtHllType); for (int i = 0; i < n; i++) { srcSk.update(i); @@ -271,7 +271,7 @@ private static void toFrom1(final int lgK, final TgtHllType tgtHllType, final in final byte[] byteArr = srcU.toCompactByteArray(); final MemorySegment seg = MemorySegment.ofArray(byteArr); - final Union dstU = Union.heapify(seg); + final HllUnion dstU = HllUnion.heapify(seg); assertEquals(dstU.getEstimate(), srcU.getEstimate(), 0.0); } @@ -290,7 +290,7 @@ public void checkToFromUnion2() { } private static void toFrom2(final int lgK, final TgtHllType tgtHllType, final int n) { - final Union srcU = newUnion(lgK); + final HllUnion srcU = newUnion(lgK); final HllSketch srcSk = new HllSketch(lgK, tgtHllType); for (int i = 0; i < n; i++) { srcSk.update(i); @@ -300,14 +300,14 @@ private static void toFrom2(final int lgK, final TgtHllType tgtHllType, final in srcU.update(srcSk); final byte[] byteArr = srcU.toCompactByteArray(); - final Union dstU = Union.heapify(byteArr); + final HllUnion dstU = HllUnion.heapify(byteArr); assertEquals(dstU.getEstimate(), srcU.getEstimate(), 0.0); } @Test public void checkCompositeEst() { - final Union u = newUnion(12); + final HllUnion u = newUnion(12); assertEquals(u.getCompositeEstimate(), 0, .03); for (int i = 1; i <= 15; i++) { u.update(i); } assertEquals(u.getCompositeEstimate(), 15, 15 *.03); @@ -319,31 +319,31 @@ public void checkCompositeEst() { @Test public void checkMisc() { try { - final Union u = newUnion(HllUtil.MIN_LOG_K - 1); + final HllUnion u = newUnion(HllUtil.MIN_LOG_K - 1); fail(); } catch (final SketchesArgumentException e) { //expected } try { - final Union u = newUnion(HllUtil.MAX_LOG_K + 1); + final HllUnion u = newUnion(HllUtil.MAX_LOG_K + 1); fail(); } catch (final SketchesArgumentException e) { //expected } - final Union u = newUnion(7); + final 
HllUnion u = newUnion(7); final HllSketch sk = u.getResult(); assertTrue(sk.isEmpty()); } @Test public void checkHeapify() { - final Union u = newUnion(16); + final HllUnion u = newUnion(16); for (int i = 0; i < (1 << 20); i++) { u.update(i); } final double est1 = u.getEstimate(); final byte[] byteArray = u.toUpdatableByteArray(); - final Union u2 = Union.heapify(byteArray); + final HllUnion u2 = HllUnion.heapify(byteArray); assertEquals(u2.getEstimate(), est1, 0.0); } @@ -363,7 +363,7 @@ public void checkUbLb() { @Test public void checkEmptyCouponMisc() { final int lgK = 8; - final Union union = newUnion(lgK); + final HllUnion union = newUnion(lgK); for (int i = 0; i < 20; i++) { union.update(i); } //SET mode union.couponUpdate(0); assertEquals(union.getEstimate(), 20.0, 0.001); @@ -371,7 +371,7 @@ public void checkEmptyCouponMisc() { assertTrue(union.hasMemorySegment()); assertFalse(union.isOffHeap()); final int bytes = union.getUpdatableSerializationBytes(); - assertTrue(bytes <= Union.getMaxSerializationBytes(lgK)); + assertTrue(bytes <= HllUnion.getMaxSerializationBytes(lgK)); assertFalse(union.isCompact()); } @@ -388,7 +388,7 @@ public void checkUnionWithWrap() { final HllSketch sk2 = HllSketch.wrap(MemorySegment.ofArray(skByteArr)); assertEquals(sk2.getEstimate(), est, 0.0); - final Union union = newUnion(lgConfigK); + final HllUnion union = newUnion(lgConfigK); union.update(HllSketch.wrap(MemorySegment.ofArray(skByteArr))); assertEquals(union.getEstimate(), est, 0.0); } @@ -402,7 +402,7 @@ public void checkUnionWithWrap2() { final double est1 = sk1.getEstimate(); final byte[] byteArr1 = sk1.toCompactByteArray(); - final Union union = newUnion(lgConfigK); + final HllUnion union = newUnion(lgConfigK); union.update(HllSketch.wrap(MemorySegment.ofArray(byteArr1))); final double est2 = union.getEstimate(); assertEquals(est2, est1); @@ -412,10 +412,10 @@ public void checkUnionWithWrap2() { public void checkWritableWrap() { final int lgConfigK = 10; final int n = 
128; - final Union union = newUnion(lgConfigK); + final HllUnion union = newUnion(lgConfigK); for (int i = 0; i < n; i++) { union.update(i); } final double est = union.getEstimate(); - final Union union2 = Union.writableWrap(MemorySegment.ofArray(union.toUpdatableByteArray())); + final HllUnion union2 = HllUnion.writableWrap(MemorySegment.ofArray(union.toUpdatableByteArray())); final double est2 = union2.getEstimate(); assertEquals(est2, est, 0.0); } @@ -426,13 +426,13 @@ public void checkWritableWrapThrows() { final int n = 128; final HllSketch sk = new HllSketch(lgConfigK, HLL_6); for (int i = 0; i < n; i++) {sk.update(i); } - Union.writableWrap(MemorySegment.ofArray(sk.toUpdatableByteArray())); + HllUnion.writableWrap(MemorySegment.ofArray(sk.toUpdatableByteArray())); } - private static Union newUnion(final int lgK) { + private static HllUnion newUnion(final int lgK) { final int bytes = HllSketch.getMaxUpdatableSerializationBytes(lgK, TgtHllType.HLL_8); final MemorySegment wseg = MemorySegment.ofArray(new byte[bytes]); - return new Union(lgK, wseg); + return new HllUnion(lgK, wseg); } private static double getBound(final int lgK, final boolean ub, final boolean oooFlag, final int numStdDev, final double est) { diff --git a/src/test/java/org/apache/datasketches/hll/HllArrayTest.java b/src/test/java/org/apache/datasketches/hll/HllArrayTest.java index 27793679b..3bbb01db0 100644 --- a/src/test/java/org/apache/datasketches/hll/HllArrayTest.java +++ b/src/test/java/org/apache/datasketches/hll/HllArrayTest.java @@ -34,7 +34,7 @@ import org.apache.datasketches.hll.HllArray; import org.apache.datasketches.hll.HllSketch; import org.apache.datasketches.hll.TgtHllType; -import org.apache.datasketches.hll.Union; +import org.apache.datasketches.hll.HllUnion; import org.testng.annotations.Test; /** @@ -59,7 +59,7 @@ public void checkBigHipGetRse() { } private static void testComposite(final int lgK, final TgtHllType tgtHllType, final int n) { - final Union u = new Union(lgK); 
+ final HllUnion u = new HllUnion(lgK); final HllSketch sk = new HllSketch(lgK, tgtHllType); for (int i = 0; i < n; i++) { u.update(i); diff --git a/src/test/java/org/apache/datasketches/hll/HllSketchMergeOrderTest.java b/src/test/java/org/apache/datasketches/hll/HllSketchMergeOrderTest.java index f2656f15f..1618f81e7 100644 --- a/src/test/java/org/apache/datasketches/hll/HllSketchMergeOrderTest.java +++ b/src/test/java/org/apache/datasketches/hll/HllSketchMergeOrderTest.java @@ -98,7 +98,7 @@ private HllSketch createUniquePowerSeriesSketch(final long baseValue, final int * Merges three sketches in the specified order and returns the composite estimate */ private double mergeThreeSketches(final HllSketch s1, final HllSketch s2, final HllSketch s3) { - final Union union = new Union(LgK); + final HllUnion union = new HllUnion(LgK); union.update(s1); union.update(s2); diff --git a/src/test/java/org/apache/datasketches/hll/HllSketchTest.java b/src/test/java/org/apache/datasketches/hll/HllSketchTest.java index d0744f857..b9cc6f298 100644 --- a/src/test/java/org/apache/datasketches/hll/HllSketchTest.java +++ b/src/test/java/org/apache/datasketches/hll/HllSketchTest.java @@ -47,7 +47,7 @@ import org.apache.datasketches.hll.HllUtil; import org.apache.datasketches.hll.PreambleUtil; import org.apache.datasketches.hll.TgtHllType; -import org.apache.datasketches.hll.Union; +import org.apache.datasketches.hll.HllUnion; import org.testng.annotations.Test; /** @@ -431,15 +431,15 @@ public void checkWritableWrapOfCompact() { @SuppressWarnings("unused") @Test public void checkJavadocExample() { - Union union; HllSketch sk, sk2; + HllUnion union; HllSketch sk, sk2; final int lgK = 12; sk = new HllSketch(lgK, TgtHllType.HLL_4); //can be 4, 6, or 8 for (int i = 0; i < (2 << lgK); i++) { sk.update(i); } final byte[] arr = sk.toCompactByteArray(); // ... - union = Union.heapify(arr); //initializes the union using data from the array. 
+ union = HllUnion.heapify(arr); //initializes the union using data from the array. //OR, if used in an off-heap environment: - union = Union.heapify(MemorySegment.ofArray(arr)); + union = HllUnion.heapify(MemorySegment.ofArray(arr)); //To recover an updatable Heap sketch: sk2 = HllSketch.heapify(arr); diff --git a/src/test/java/org/apache/datasketches/hll/IsomorphicTest.java b/src/test/java/org/apache/datasketches/hll/IsomorphicTest.java index 90db8088b..d895246a8 100644 --- a/src/test/java/org/apache/datasketches/hll/IsomorphicTest.java +++ b/src/test/java/org/apache/datasketches/hll/IsomorphicTest.java @@ -30,11 +30,6 @@ import java.lang.foreign.MemorySegment; -import org.apache.datasketches.hll.AbstractHllArray; -import org.apache.datasketches.hll.CurMode; -import org.apache.datasketches.hll.HllSketch; -import org.apache.datasketches.hll.TgtHllType; -import org.apache.datasketches.hll.Union; import org.testng.annotations.Test; /** @@ -55,7 +50,7 @@ public void isomorphicUnionUpdatableHeap() { final TgtHllType tgtHllType1 = TgtHllType.fromOrdinal(t); final HllSketch sk1 = buildHeapSketch(lgK, tgtHllType1, curMode); final byte[] sk1bytes = sk1.toUpdatableByteArray(); //UPDATABLE - final Union union = buildHeapUnion(lgK, null); //UNION + final HllUnion union = buildHeapUnion(lgK, null); //UNION union.update(sk1); final HllSketch sk2 = union.getResult(tgtHllType1); final byte[] sk2bytes = sk2.toUpdatableByteArray(); //UPDATABLE @@ -77,7 +72,7 @@ public void isomorphicUnionCompactHeap() { final TgtHllType tgtHllType1 = TgtHllType.fromOrdinal(t); final HllSketch sk1 = buildHeapSketch(lgK, tgtHllType1, curMode); final byte[] sk1bytes = sk1.toCompactByteArray(); //COMPACT - final Union union = buildHeapUnion(lgK, null); //UNION + final HllUnion union = buildHeapUnion(lgK, null); //UNION union.update(sk1); final HllSketch sk2 = union.getResult(tgtHllType1); final byte[] sk2bytes = sk2.toCompactByteArray(); //COMPACT @@ -161,17 +156,17 @@ public void 
isomorphicHllMerges() { } private static void innerLoop(final int uLgK, final int skLgK, final TgtHllType tgtHllType) { - Union u; + HllUnion u; HllSketch sk; final HllSketch skOut; - //CASE 1 Heap Union, Heap sketch + //CASE 1 Heap HllUnion, Heap HllSketch u = buildHeapUnionHllMode(uLgK, 0); sk = buildHeapSketchHllMode(skLgK, tgtHllType, 1 << uLgK); u.update(sk); final byte[] bytesOut1 = u.getResult(HLL_8).toUpdatableByteArray(); - //CASE 2 Heap Union, MemorySegment sketch + //CASE 2 Heap HllUnion, MemorySegment HllSketch u = buildHeapUnionHllMode(uLgK, 0); sk = buildMemorySegmentSketchHllMode(skLgK, tgtHllType, 1 << uLgK); u.update(sk); @@ -181,10 +176,10 @@ private static void innerLoop(final int uLgK, final int skLgK, final TgtHllType //println("Uheap/SkSegment HIP: " + bytesToDouble(bytesOut2, 8)); //HipAccum String comb = "uLgK: " + uLgK + ", skLgK: " + skLgK + ", SkType: " + tgtHllType.toString() - + ", Case1: Heap Union, Heap sketch; Case2: /Heap Union, MemorySegment sketch"; + + ", Case1: Heap HllUnion, Heap HllSketch; Case2: /Heap HllUnion, MemorySegment HllSketch"; checkArrays(bytesOut1, bytesOut2, comb, false); - //CASE 3 Offheap Union, Heap sketch + //CASE 3 Offheap HllUnion, Heap HllSketch u = buildMemorySegmentUnionHllMode(uLgK, 0); sk = buildHeapSketchHllMode(skLgK, tgtHllType, 1 << uLgK); u.update(sk); @@ -194,10 +189,10 @@ private static void innerLoop(final int uLgK, final int skLgK, final TgtHllType //println("Usegment/SkHeap HIP: " + bytesToDouble(bytesOut3, 8)); //HipAccum comb = "LgK: " + uLgK + ", skLgK: " + skLgK + ", SkType: " + tgtHllType.toString() - + ", Case2: Heap Union, MemorySegment sketch; Case3: /MemorySegment Union, Heap sketch"; + + ", Case2: Heap HllUnion, MemorySegment HllSketch; Case3: /MemorySegment HllUnion, Heap HllSketch"; checkArrays(bytesOut2, bytesOut3, comb, false); - //Case 4 MemorySegment Union, MemorySegment sketch + //Case 4 MemorySegment HllUnion, MemorySegment HllSketch u = buildMemorySegmentUnionHllMode(uLgK, 
0); sk = buildMemorySegmentSketchHllMode(skLgK, tgtHllType, 1 << uLgK); u.update(sk); @@ -205,7 +200,7 @@ private static void innerLoop(final int uLgK, final int skLgK, final TgtHllType comb = "LgK: " + uLgK + ", skLgK: " + skLgK + ", SkType: " + tgtHllType.toString() - + ", Case2: Heap Union, MemorySegment sketch; Case4: /MemorySegment Union, MemorySegment sketch"; + + ", Case2: Heap HllUnion, MemorySegment HllSketch; Case4: /MemorySegment HllUnion, MemorySegment HllSketch"; checkArrays(bytesOut2, bytesOut4, comb, false); } @@ -218,7 +213,7 @@ private static void innerLoop(final int uLgK, final int skLgK, final TgtHllType public void isomorphicHllMerges2() { byte[] bytesOut8, bytesOut6, bytesOut4; String comb; - Union u; + HllUnion u; HllSketch sk; for (int lgK = 4; lgK <= 4; lgK++) { //All LgK u = buildHeapUnionHllMode(lgK, 0); @@ -260,18 +255,18 @@ private static void checkArrays(final byte[] sk1, final byte[] sk2, final String } //BUILDERS - private Union buildHeapUnion(final int lgMaxK, final CurMode curMode) { - final Union u = new Union(lgMaxK); + private HllUnion buildHeapUnion(final int lgMaxK, final CurMode curMode) { + final HllUnion u = new HllUnion(lgMaxK); final int n = (curMode == null) ? 0 : getN(lgMaxK, curMode); for (int i = 0; i < n; i++) { u.update(i + v); } v += n; return u; } - private Union buildMemorySegmentUnion(final int lgMaxK, final CurMode curMode) { + private HllUnion buildMemorySegmentUnion(final int lgMaxK, final CurMode curMode) { final int bytes = HllSketch.getMaxUpdatableSerializationBytes(lgMaxK, TgtHllType.HLL_8); final MemorySegment wseg = MemorySegment.ofArray(new byte[bytes]); - final Union u = new Union(lgMaxK, wseg); + final HllUnion u = new HllUnion(lgMaxK, wseg); final int n = (curMode == null) ? 
0 : getN(lgMaxK, curMode); for (int i = 0; i < n; i++) { u.update(i + v); } v += n; @@ -296,17 +291,17 @@ private HllSketch buildMemorySegmentSketch(final int lgK, final TgtHllType tgtHl return sk; } - private static Union buildHeapUnionHllMode(final int lgMaxK, final int startN) { - final Union u = new Union(lgMaxK); + private static HllUnion buildHeapUnionHllMode(final int lgMaxK, final int startN) { + final HllUnion u = new HllUnion(lgMaxK); final int n = getN(lgMaxK, HLL); for (int i = 0; i < n; i++) { u.update(i + startN); } return u; } - private static Union buildMemorySegmentUnionHllMode(final int lgMaxK, final int startN) { + private static HllUnion buildMemorySegmentUnionHllMode(final int lgMaxK, final int startN) { final int bytes = HllSketch.getMaxUpdatableSerializationBytes(lgMaxK, TgtHllType.HLL_8); final MemorySegment wseg = MemorySegment.ofArray(new byte[bytes]); - final Union u = new Union(lgMaxK, wseg); + final HllUnion u = new HllUnion(lgMaxK, wseg); final int n = getN(lgMaxK, HLL); for (int i = 0; i < n; i++) { u.update(i + startN); } return u; diff --git a/src/test/java/org/apache/datasketches/hll/UnionCaseTest.java b/src/test/java/org/apache/datasketches/hll/UnionCaseTest.java index fc6a7a40d..84ec7ce9a 100644 --- a/src/test/java/org/apache/datasketches/hll/UnionCaseTest.java +++ b/src/test/java/org/apache/datasketches/hll/UnionCaseTest.java @@ -33,12 +33,6 @@ import java.lang.foreign.MemorySegment; import org.apache.datasketches.common.SketchesStateException; -import org.apache.datasketches.hll.AbstractHllArray; -import org.apache.datasketches.hll.DirectHllArray; -import org.apache.datasketches.hll.HllArray; -import org.apache.datasketches.hll.HllSketch; -import org.apache.datasketches.hll.TgtHllType; -import org.apache.datasketches.hll.Union; import org.testng.annotations.Test; /** @@ -49,7 +43,7 @@ public class UnionCaseTest { long v = 0; final static int maxLgK = 12; HllSketch source; - //Union union; + //HllUnion union; String hfmt = 
"%10s%10s%10s%10s%10s%10s%10s%10s%10s%10s%10s" + LS; String hdr = String.format(hfmt, "caseNum","srcLgKStr","gdtLgKStr","srcType","gdtType", "srcSeg","gdtSeg","srcMode","gdtMode","srcOoof","gdtOoof"); @@ -96,13 +90,13 @@ public void checkAllCases() { private void checkCase(final int caseNum, final TgtHllType srcType, final boolean srcSeg) { source = getSource(caseNum, srcType, srcSeg); final boolean gdtSeg = (caseNum & 1) > 0; - final Union union = getUnion(caseNum, gdtSeg); + final HllUnion union = getUnion(caseNum, gdtSeg); union.update(source); final int totalU = getSrcCount(caseNum, maxLgK) + getUnionCount(caseNum); output(caseNum, source, union, totalU); } - private void output(final int caseNum, final HllSketch source, final Union union, final int totalU) { + private void output(final int caseNum, final HllSketch source, final HllUnion union, final int totalU) { final double estU = union.getEstimate(); final double err = Math.abs((estU / totalU) - 1.0); final int gdtLgK = union.getLgConfigK(); @@ -137,7 +131,7 @@ private HllSketch getSource(final int caseNum, final TgtHllType tgtHllType, fina } } - private Union getUnion(final int caseNum, final boolean useMemorySegment) { + private HllUnion getUnion(final int caseNum, final boolean useMemorySegment) { final int unionU = getUnionCount(caseNum); return (useMemorySegment) ? 
buildMemorSegmentUnion(maxLgK, unionU) : buildHeapUnion(maxLgK, unionU); } @@ -162,10 +156,10 @@ private static int getSrcLgK(final int caseNum, final int maxLgK) { @Test public void checkMisc() { - final Union u = buildHeapUnion(12, 0); + final HllUnion u = buildHeapUnion(12, 0); int bytes = u.getCompactSerializationBytes(); assertEquals(bytes, 8); - bytes = Union.getMaxSerializationBytes(7); + bytes = HllUnion.getMaxSerializationBytes(7); assertEquals(bytes, 40 + 128); double v = u.getEstimate(); assertEquals(v, 0.0, 0.0); @@ -187,7 +181,7 @@ public void checkSrcListList() { //src: LIST, gadget: LIST final int n2 = 3; final int n3 = 2; final int sum = n1 + n2 + n3; - final Union u = buildHeapUnion(12, n1); //gdt = list + final HllUnion u = buildHeapUnion(12, n1); //gdt = list final HllSketch h2 = buildHeapSketch(11, HLL_6, n2); //src = list final HllSketch h3 = buildHeapSketch(10, HLL_8, n3); //src = list u.update(h2); @@ -209,7 +203,7 @@ public void checkSrcListSet() { //src: SET, gadget: LIST final int n2 = 2; final int n3 = 16; final int sum = n1 + n2 + n3; - final Union u = buildHeapUnion(12, n1); //LIST, 5 + final HllUnion u = buildHeapUnion(12, n1); //LIST, 5 final HllSketch h2 = buildHeapSketch(11, HLL_6, n2); //LIST, 2 final HllSketch h3 = buildHeapSketch(10, HLL_8, n3); //SET, 16 u.update(h2); @@ -231,7 +225,7 @@ public void checkSrcSetList() { //src: LIST, gadget: SET final int n2 = 10; final int n3 = 6; final int sum = n1 + n2 + n3; - final Union u = buildHeapUnion(12, n1); + final HllUnion u = buildHeapUnion(12, n1); final HllSketch h2 = buildHeapSketch(11, HLL_6, n2); //SET final HllSketch h3 = buildHeapSketch(10, HLL_8, n3); //LIST u.update(h2); @@ -253,7 +247,7 @@ public void checkSrcSetSet() { //src: SET, gadget: SET final int n2 = 10; final int n3 = 16; final int sum = n1 + n2 + n3; - final Union u = buildHeapUnion(12, n1); + final HllUnion u = buildHeapUnion(12, n1); final HllSketch h2 = buildHeapSketch(11, HLL_6, n2); //src: SET final HllSketch 
h3 = buildHeapSketch(10, HLL_8, n3); //src: SET u.update(h2); @@ -275,7 +269,7 @@ public void checkSrcEmptyList() { //src: LIST, gadget: empty final int n2 = 0; final int n3 = 7; final int sum = n1 + n2 + n3; - final Union u = buildHeapUnion(12, n1); //LIST empty + final HllUnion u = buildHeapUnion(12, n1); //LIST empty final HllSketch h2 = buildHeapSketch(11, HLL_6, n2); //src: LIST empty, ignored final HllSketch h3 = buildHeapSketch(10, HLL_8, n3); //src: LIST u.update(h2); @@ -297,7 +291,7 @@ public void checkSrcEmptySet() { final int n2 = 0; final int n3 = 16; final int sum = n1 + n2 + n3; - final Union u = buildHeapUnion(12, n1); //LIST empty + final HllUnion u = buildHeapUnion(12, n1); //LIST empty final HllSketch h2 = buildHeapSketch(11, HLL_6, n2); //LIST empty, ignored final HllSketch h3 = buildHeapSketch(10, HLL_8, n3); // Src Set u.update(h2); @@ -316,7 +310,7 @@ public void checkSrcEmptySet() { @SuppressWarnings("unused") @Test public void checkSpecialMergeCase4() { - final Union u = buildHeapUnion(12, 1 << 9); + final HllUnion u = buildHeapUnion(12, 1 << 9); final HllSketch sk = buildHeapSketch(12, HLL_8, 1 << 9); u.update(sk); @@ -360,7 +354,7 @@ public void checkRebuildCurMinNumKxQFlag1() { final HllSketch sk = buildHeapSketch(4, HLL_8, 16); final HllArray hllArr = (HllArray)(sk.hllSketchImpl); hllArr.putRebuildCurMinNumKxQFlag(true); //corrupt the flag - final Union union = buildHeapUnion(4, 0); + final HllUnion union = buildHeapUnion(4, 0); union.update(sk); } @@ -370,7 +364,7 @@ public void checkRebuildCurMinNumKxQFlag2() { final DirectHllArray hllArr = (DirectHllArray)(sk.hllSketchImpl); hllArr.putRebuildCurMinNumKxQFlag(true); //corrupt the flag final MemorySegment wseg = sk.getMemorySegment(); - Union.writableWrap(wseg); + HllUnion.writableWrap(wseg); } @Test(expectedExceptions = SketchesStateException.class) @@ -393,17 +387,17 @@ private static double errorFactor(final int lgK, final boolean oooFlag, final do } //BUILDERS - private Union 
buildHeapUnion(final int lgMaxK, final int n) { - final Union u = new Union(lgMaxK); + private HllUnion buildHeapUnion(final int lgMaxK, final int n) { + final HllUnion u = new HllUnion(lgMaxK); for (int i = 0; i < n; i++) { u.update(i + v); } v += n; return u; } - private Union buildMemorSegmentUnion(final int lgMaxK, final int n) { + private HllUnion buildMemorSegmentUnion(final int lgMaxK, final int n) { final int bytes = HllSketch.getMaxUpdatableSerializationBytes(lgMaxK, TgtHllType.HLL_8); final MemorySegment wseg = MemorySegment.ofArray(new byte[bytes]); - final Union u = new Union(lgMaxK, wseg); + final HllUnion u = new HllUnion(lgMaxK, wseg); for (int i = 0; i < n; i++) { u.update(i + v); } v += n; return u; diff --git a/src/test/java/org/apache/datasketches/hll/UnionTest.java b/src/test/java/org/apache/datasketches/hll/UnionTest.java index 0e1bc5b46..23c373207 100644 --- a/src/test/java/org/apache/datasketches/hll/UnionTest.java +++ b/src/test/java/org/apache/datasketches/hll/UnionTest.java @@ -31,12 +31,6 @@ import java.lang.foreign.MemorySegment; import org.apache.datasketches.common.SketchesArgumentException; -import org.apache.datasketches.hll.HllSketch; -import org.apache.datasketches.hll.HllUtil; -import org.apache.datasketches.hll.PreambleUtil; -import org.apache.datasketches.hll.RelativeErrorTables; -import org.apache.datasketches.hll.TgtHllType; -import org.apache.datasketches.hll.Union; import org.testng.annotations.Test; /** @@ -199,7 +193,7 @@ private static void basicUnion(final int n1, final int n2, final int lgK1, final final String h1SketchStr = ("H1 SKETCH: \n" + h1.toString()); final String h2SketchStr = ("H2 SKETCH: \n" + h2.toString()); - final Union union = newUnion(lgMaxK); + final HllUnion union = newUnion(lgMaxK); union.update(h1); final String uH1SketchStr = ("Union after H1: \n" + union.getResult(resultType).toString()); @@ -261,7 +255,7 @@ public void checkToFromUnion1() { } private static void toFrom1(final int lgK, final 
TgtHllType tgtHllType, final int n) { - final Union srcU = newUnion(lgK); + final HllUnion srcU = newUnion(lgK); final HllSketch srcSk = new HllSketch(lgK, tgtHllType); for (int i = 0; i < n; i++) { srcSk.update(i); @@ -272,7 +266,7 @@ private static void toFrom1(final int lgK, final TgtHllType tgtHllType, final in final byte[] byteArr = srcU.toCompactByteArray(); final MemorySegment seg = MemorySegment.ofArray(byteArr); - final Union dstU = Union.heapify(seg); + final HllUnion dstU = HllUnion.heapify(seg); assertFalse(dstU.isSameResource(seg)); assertEquals(dstU.getEstimate(), srcU.getEstimate(), 0.0); @@ -292,7 +286,7 @@ public void checkToFromUnion2() { } private static void toFrom2(final int lgK, final TgtHllType tgtHllType, final int n) { - final Union srcU = newUnion(lgK); + final HllUnion srcU = newUnion(lgK); final HllSketch srcSk = new HllSketch(lgK, tgtHllType); for (int i = 0; i < n; i++) { srcSk.update(i); @@ -302,14 +296,14 @@ private static void toFrom2(final int lgK, final TgtHllType tgtHllType, final in srcU.update(srcSk); final byte[] byteArr = srcU.toCompactByteArray(); - final Union dstU = Union.heapify(byteArr); + final HllUnion dstU = HllUnion.heapify(byteArr); assertEquals(dstU.getEstimate(), srcU.getEstimate(), 0.0); } @Test public void checkCompositeEst() { - final Union u = new Union(); + final HllUnion u = new HllUnion(); assertEquals(u.getCompositeEstimate(), 0, .03); for (int i = 1; i <= 15; i++) { u.update(i); } assertEquals(u.getCompositeEstimate(), 15, 15 *.03); @@ -321,31 +315,31 @@ public void checkCompositeEst() { @Test public void checkMisc() { try { - final Union u = newUnion(HllUtil.MIN_LOG_K - 1); + final HllUnion u = newUnion(HllUtil.MIN_LOG_K - 1); fail(); } catch (final SketchesArgumentException e) { //expected } try { - final Union u = newUnion(HllUtil.MAX_LOG_K + 1); + final HllUnion u = newUnion(HllUtil.MAX_LOG_K + 1); fail(); } catch (final SketchesArgumentException e) { //expected } - final Union u = newUnion(7); + 
final HllUnion u = newUnion(7); final HllSketch sk = u.getResult(); assertTrue(sk.isEmpty()); } @Test public void checkHeapify() { - final Union u = newUnion(16); + final HllUnion u = newUnion(16); for (int i = 0; i < (1 << 20); i++) { u.update(i); } final double est1 = u.getEstimate(); final byte[] byteArray = u.toUpdatableByteArray(); - final Union u2 = Union.heapify(byteArray); + final HllUnion u2 = HllUnion.heapify(byteArray); assertEquals(u2.getEstimate(), est1, 0.0); } @@ -365,7 +359,7 @@ public void checkUbLb() { @Test public void checkEmptyCouponMisc() { final int lgK = 8; - final Union union = newUnion(lgK); + final HllUnion union = newUnion(lgK); for (int i = 0; i < 20; i++) { union.update(i); } //SET mode union.couponUpdate(0); assertEquals(union.getEstimate(), 20.0, 0.001); @@ -373,7 +367,7 @@ public void checkEmptyCouponMisc() { assertFalse(union.hasMemorySegment()); assertFalse(union.isOffHeap()); final int bytes = union.getUpdatableSerializationBytes(); - assertTrue(bytes <= Union.getMaxSerializationBytes(lgK)); + assertTrue(bytes <= HllUnion.getMaxSerializationBytes(lgK)); assertFalse(union.isCompact()); } @@ -390,7 +384,7 @@ public void checkUnionWithWrap() { final HllSketch sk2 = HllSketch.wrap(MemorySegment.ofArray(skByteArr)); assertEquals(sk2.getEstimate(), est, 0.0); - final Union union = newUnion(lgConfigK); + final HllUnion union = newUnion(lgConfigK); union.update(HllSketch.wrap(MemorySegment.ofArray(skByteArr))); assertEquals(union.getEstimate(), est, 0.0); } @@ -404,7 +398,7 @@ public void checkUnionWithWrap2() { final double est1 = sk1.getEstimate(); final byte[] byteArr1 = sk1.toCompactByteArray(); - final Union union = newUnion(lgConfigK); + final HllUnion union = newUnion(lgConfigK); union.update(HllSketch.wrap(MemorySegment.ofArray(byteArr1))); final double est2 = union.getEstimate(); assertEquals(est2, est1); @@ -420,7 +414,7 @@ public void checkConversions() { sk1.update(i); sk2.update(i + u); } - final Union union = new 
Union(lgK); + final HllUnion union = new HllUnion(lgK); union.update(sk1); union.update(sk2); final HllSketch rsk1 = union.getResult(TgtHllType.HLL_8); @@ -450,9 +444,9 @@ public void checkUnionHeapifyRebuildAfterMerge() { sk1.update(i); sk2.update(i + u); } - final int bytes = Union.getMaxSerializationBytes(lgK); + final int bytes = HllUnion.getMaxSerializationBytes(lgK); final MemorySegment wseg = MemorySegment.ofArray(new byte[bytes]); - final Union union1 = new Union(lgK, wseg); //Create original union off-heap + final HllUnion union1 = new HllUnion(lgK, wseg); //Create original union off-heap union1.update(sk1); union1.update(sk2); //oooFlag = Rebuild_KxQ = TRUE assertTrue(!union1.toString().isEmpty()); @@ -466,23 +460,23 @@ public void checkUnionHeapifyRebuildAfterMerge() { assertFalse(rebuild); } - @Test //similar to above except uses Union.writableWrap instead of heapify + @Test //similar to above except uses HllUnion.writableWrap instead of heapify public void druidUseCase() { final int lgK = 12; - final int bytes = Union.getMaxSerializationBytes(lgK); + final int bytes = HllUnion.getMaxSerializationBytes(lgK); final MemorySegment wseg = MemorySegment.ofArray(new byte[bytes]); - new Union(lgK, wseg); // result is unused, relying on side effect + new HllUnion(lgK, wseg); // result is unused, relying on side effect int trueCount = 0; final int delta = 1 << (lgK - 3); //(lgK < 8) ? 
16 : 1 << (lgK - 3) //allows changing lgK above for (int i = 0; i < 3; i++) { - Union.writableWrap(wseg).update(buildSketch(trueCount, delta)); + HllUnion.writableWrap(wseg).update(buildSketch(trueCount, delta)); trueCount += delta; } boolean rebuild = PreambleUtil.extractRebuildCurMinNumKxQFlag(wseg); final double hipAccum = PreambleUtil.extractHipAccum(wseg); assertTrue(rebuild); assertTrue(hipAccum == 0.0); - final HllSketch result = Union.writableWrap(wseg).getResult(); //rebuilds result + final HllSketch result = HllUnion.writableWrap(wseg).getResult(); //rebuilds result rebuild = result.hllSketchImpl.isRebuildCurMinNumKxQFlag(); assertFalse(rebuild); final double est = result.getEstimate(); @@ -500,8 +494,8 @@ private static HllSketch buildSketch(final int start, final int count) { return sketch; } - private static Union newUnion(final int lgK) { - return new Union(lgK); + private static HllUnion newUnion(final int lgK) { + return new HllUnion(lgK); } private static double getBound(final int lgK, final boolean ub, final boolean oooFlag, final int numStdDev, final double est) { From 337b3421cab20852ec904e8d1729b2a8c8a54a3b Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Mon, 17 Nov 2025 11:08:24 -0800 Subject: [PATCH 02/12] Revert maven-jar-plugin version from 3.5.0 to 3.4.2. 3.5.0 causes errors. --- pom.xml | 44 +++++++++++++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 13 deletions(-) diff --git a/pom.xml b/pom.xml index 1cce4056c..befed4013 100644 --- a/pom.xml +++ b/pom.xml @@ -37,7 +37,7 @@ under the License. jar ${project.artifactId} - Core sketch algorithms used alone and by other Java repositories in the DataSketches library. + Core sketch algorithms used alone and by other Java repositories in the Apache DataSketches Project. https://datasketches.apache.org/ 2015 @@ -92,6 +92,8 @@ under the License. 3.9.11 25 + -Xmx4g UTF-8 @@ -102,17 +104,17 @@ under the License. 
3.7.1 - 3.14.0 - 3.8.1 + 3.14.1 + 3.9.0 3.1.4 - 3.6.1 + 3.6.2 3.2.8 - 3.4.2 - 3.11.3 - 3.1.1 + 3.4.2 + 3.12.0 + 3.2.0 3.3.1 - 3.5.3 + 3.5.4 3.2.0 4.9.10 @@ -122,11 +124,12 @@ under the License. 4.3.0 - 0.8.13 + 0.8.14 - 2.18.0 + 2.19.1 1.0.0 + 0.17 @@ -393,6 +396,19 @@ under the License. ${git-commit-id-plugin.version} + + io.github.zlika + reproducible-build-maven-plugin + ${reproducible-build-maven-plugin.version} + + + + strip-jar + + + + + @@ -460,6 +476,10 @@ under the License. pl.project13.maven git-commit-id-plugin + + io.github.zlika + reproducible-build-maven-plugin + @@ -506,7 +526,7 @@ under the License. git.branch git.commit.id.full - git.commit.time + git.commit.user.email git.tags @@ -610,8 +630,6 @@ under the License. - - generate-java-files From 4ce1d5d99697cdcd83e15e7f3eaed08a3caecef3 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Mon, 17 Nov 2025 17:28:22 -0800 Subject: [PATCH 03/12] Rewrite Readme, Update javadoc.yml --- .github/workflows/javadoc.yml | 200 ++++++++++++++++++++++++++++++---- README.md | 28 +---- 2 files changed, 186 insertions(+), 42 deletions(-) diff --git a/.github/workflows/javadoc.yml b/.github/workflows/javadoc.yml index 66bab896a..b8c2fc855 100644 --- a/.github/workflows/javadoc.yml +++ b/.github/workflows/javadoc.yml @@ -1,41 +1,201 @@ -name: JavaDoc +name: JavaDoc Releases on: push: - branches: main + tags: + - "*" # Only publish docs for tags + delete: + tags: + - "*" # Auto-remove docs when tags are deleted workflow_dispatch: permissions: contents: write + pages: write + id-token: write jobs: - javadoc: + build: + name: Build JavaDoc runs-on: ubuntu-latest steps: - - name: Checkout + - name: Checkout source uses: actions/checkout@v5 - - name: Setup Java + - name: Set up JDK uses: actions/setup-java@v5 with: - java-version: '25' - distribution: 'temurin' + distribution: temurin + java-version: 25 - - name: Echo Java Version - run: java -version + - name: Set up Maven cache + uses: actions/cache@v4 + with: + path: | + 
~/.m2/repository + key: maven-${{ runner.os }}-${{ hashFiles('**/pom.xml') }} + restore-keys: | + maven-${{ runner.os }}- + + - name: Build JavaDoc + run: mvn -B javadoc:javadoc + + # Upload for GitHub Pages preview (this is optional) + - name: Upload Pages Artifact + uses: actions/upload-pages-artifact@v4 + with: + path: target/site/apidocs + + update-gh-pages: + name: Update gh-pages + needs: build + runs-on: ubuntu-latest + + if: github.event_name != 'delete' # skip this job on delete events + + steps: + - name: Checkout gh-pages branch + uses: actions/checkout@v5 + with: + ref: gh-pages + path: gh-pages + + - name: Copy Javadoc into versioned folder + run: | + TAG="${GITHUB_REF_NAME}" + VERSION_DIR="docs/${TAG}" + + rm -rf "gh-pages/${VERSION_DIR}" + mkdir -p "gh-pages/${VERSION_DIR}" + cp -r target/site/apidocs/* "gh-pages/${VERSION_DIR}/" + + - name: Regenerate multi-version index.html + run: | + cd gh-pages + + # Create index header + cat > index.html << 'EOF' + + + + + Javadoc Versions + + + +

Available Javadoc Versions

+ + + + EOF - - name: Generate JavaDoc - run: mvn javadoc:javadoc + - name: Commit & push changes + run: | + cd gh-pages + git config user.email "github-actions[bot]@users.noreply.github.com" + git config user.name "github-actions[bot]" - - name: Deploy JavaDoc - uses: JamesIves/github-pages-deploy-action@881db5376404c5c8d621010bcbec0310b58d5e29 + if git diff --quiet; then + echo "No changes to commit" + exit 0 + fi + + git add . + git commit -m "Update release Javadoc for ${GITHUB_REF_NAME}" + git push origin gh-pages + + delete-tag-docs: + name: Remove deleted tag docs + runs-on: ubuntu-latest + + if: github.event_name == 'delete' && github.event.ref_type == 'tag' + + steps: + - name: Checkout gh-pages + uses: actions/checkout@v5 with: - token: ${{ secrets.GITHUB_TOKEN }} - folder: target/reports/apidocs - target-folder: docs/${{ github.ref_name }} - branch: gh-pages + ref: gh-pages + path: gh-pages + + - name: Remove documentation for deleted tag + run: | + TAG="${GITHUB_REF_NAME}" + VERSION_DIR="docs/${TAG}" + rm -rf "gh-pages/${VERSION_DIR}" + + - name: Rebuild index.html + run: | + cd gh-pages + + # Regenerate index.html (same as above) + cat > index.html << 'EOF' + + + + + Javadoc Versions + + + +

Available Javadoc Versions

+ + + + EOF + + - name: Commit & push removal + run: | + cd gh-pages + git config user.email "github-actions[bot]@users.noreply.github.com" + git config user.name "github-actions[bot]" + + if git diff --quiet; then + echo "No changes to commit" + exit 0 + fi + + git add . + git commit -m "Remove Javadoc for deleted tag ${GITHUB_REF_NAME}" + git push origin gh-pages + +# Summary of Features +# Multi-version docs (docs//) +# Auto-generated index.html +# Only publish for tags +# Delete docs when tags are deleted +# Upload Pages artifact +# Maven dependency caching +# No third-party actions +# No force-push +# Clean HTML layout diff --git a/README.md b/README.md index 3a7cec913..106100d79 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ This is the core Java component of the DataSketches library. It contains all of This component is also a dependency of other components of the library that create adaptors for target systems, such as the [Apache Pig adaptor](https://github.com/apache/datasketches-pig), the [Apache Hive adaptor](https://github.com/apache/datasketches-hive), and others. -Note that we have parallel core components for C++, Python and GO implementations of many of the same sketch algorithms: +Note that we have parallel core library components for C++, Python and GO implementations of many of the same sketch algorithms: - [datasketches-cpp](https://github.com/apache/datasketches-cpp), - [datasketches-python](https://github.com/apache/datasketches-python), @@ -37,26 +37,6 @@ Please visit the main [DataSketches website](https://datasketches.apache.org) fo If you are interested in making contributions to this site please see our [Community](https://datasketches.apache.org/docs/Community/) page for how to contact us. ---- -## Major Changes with this Release -This release is a major release where we took the opportunity to do some significant refactoring that will constitute incompatible changes from previous releases. 
Any incompatibility with prior releases is always an inconvenience to users who wish to just upgrade to the latest release and run. However, some of the code in this library was written in 2013 and meanwhile the Java language has evolved enormously since then. We chose to use this major release as the opportunity to modernize some of the code to achieve the following goals: - -### Eliminate the dependency on the DataSketches-Memory component. -The DataSketches-Memory component was originally developed in 2014 to address the need for fast access to off-heap memory data structures and used Unsafe and other JVM internals as there were no satisfactory Java language features to do this at the time. - -The FFM capabilities introduced into the language in Java 22, are now part of the Java 25 LTS release, which we support. Since the capabilities of FFM are a superset of the original DataSketches-Memory component, it made sense to rewrite the code to eliminate the dependency on DataSketches-Memory and use FFM instead. This impacted code across the entire library. - -This provided several advantages to the code base. By removing this dependency on DataSketches-Memory, there are now no runtime dependencies! This should make integrating this library into other Java systems much simpler. Since FFM is tightly integrated into the Java language, it has improved performance, especially with bulk operations. - -- As an added note: There are numerous other improvements to the Java language that we could perhaps take advantage of in a rewrite, e.g., Records, text blocks, switch expressions, sealed, var, modules, patterns, etc. However, faced with the risk of accidentally creating bugs due to too many changes at one time, we focused on FFM, which actually improve performance as opposed to just syntactic sugar. - -### Align public sketch class names so that the sketch family name is part of the class name. 
-For example, the Theta sketch was the first sketch written for the library and its base class was called *Sketch*. Obviously, because it was the only sketch! The Tuple sketch evolved soon after and its base class was also called *Sketch*. Oops, bad idea. If a user wanted to use both the Theta and Tuple sketches in the same class one of them had to be fully qualified every time it was referenced. Ugh! - -Unfortunately, this habit propagated so some of the other early sketches where we ended up with two different sketches with a *ItemsSketch*, for example. For the more recent additions to the library we started including the sketch family name in all the relevant sketch-like public classes of a sketch family. - -In this release we have refactored these older sketches with new names that now include the sketch family name. Yes, this is an incompatible change for user code moving from earlier releases, but this can be usually fixed with search-and-replace tools. This release is not perfect, but hopefully more consistent across all the different sketch families. - ## Build & Runtime Dependencies @@ -73,7 +53,7 @@ This DataSketches component is structured as a Maven project and Maven is the re #### A Toolchain is required * You must have a JDK type toolchain defined in location *~/.m2/toolchains.xml* that specifies where to find a locally installed OpenJDK-compatible version 25. -* Your default \$JAVA\_HOME compiler must be OpenJDK compatible, specified in the toolchain, and may be a version greater than 25. Note that if your \$JAVA\_HOME is set to a Java version greater than 25, Maven will automatically use the Java 25 version specified in the toolchain instead. The included pom.xml specifies the necessary JVM flags, if required, so no further action is needed. +* Your default \$JAVA\_HOME compiler must be OpenJDK compatible, specified in the toolchain, and may be a version greater than 25. 
Note that if your \$JAVA\_HOME is set to a Java version greater than 25, Maven will automatically use the Java 25 version specified in the toolchain instead. The pom.xml specifies any necessary JVM flags, if required, so no further action is needed. * Note that the paths specified in the toolchain must be fully qualified direct paths to the OpenJDK version locations. Using environment variables will not work. #### To run normal unit tests: @@ -98,3 +78,7 @@ This will create the following jars: * Make sure you configure SpotBugs with the /tools/FindBugsExcludeFilter.xml file. Otherwise, you may get a lot of false positive or low risk issues that we have examined and eliminated with this exclusion file. +### Checkstyle + +* At the time of this writing, Checkstyle had not been upgraded to handle Java 25 features. + From 5b2b55f6df06e895370faa6cec8bc396ef62668a Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Mon, 17 Nov 2025 17:48:14 -0800 Subject: [PATCH 04/12] fix javadoc.yml --- .github/workflows/javadoc.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/javadoc.yml b/.github/workflows/javadoc.yml index b8c2fc855..4f0076b95 100644 --- a/.github/workflows/javadoc.yml +++ b/.github/workflows/javadoc.yml @@ -45,7 +45,7 @@ jobs: - name: Upload Pages Artifact uses: actions/upload-pages-artifact@v4 with: - path: target/site/apidocs + path: target/reports/apidocs update-gh-pages: name: Update gh-pages @@ -68,7 +68,7 @@ jobs: rm -rf "gh-pages/${VERSION_DIR}" mkdir -p "gh-pages/${VERSION_DIR}" - cp -r target/site/apidocs/* "gh-pages/${VERSION_DIR}/" + cp -r target/reports/apidocs/* "gh-pages/${VERSION_DIR}/" - name: Regenerate multi-version index.html run: | From 7c474b6a91cde1c673029c8709d1bbcb480be801 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Mon, 17 Nov 2025 18:20:58 -0800 Subject: [PATCH 05/12] Debug javadoc.yml --- .github/workflows/javadoc.yml | 139 +++++++++++++++++----------------- 1 file changed, 70 insertions(+), 69 
deletions(-) diff --git a/.github/workflows/javadoc.yml b/.github/workflows/javadoc.yml index 4f0076b95..441a82634 100644 --- a/.github/workflows/javadoc.yml +++ b/.github/workflows/javadoc.yml @@ -63,7 +63,8 @@ jobs: - name: Copy Javadoc into versioned folder run: | - TAG="${GITHUB_REF_NAME}" + # TAG="${GITHUB_REF_NAME}" + TAG="9.0.0-rc" VERSION_DIR="docs/${TAG}" rm -rf "gh-pages/${VERSION_DIR}" @@ -121,74 +122,74 @@ jobs: git commit -m "Update release Javadoc for ${GITHUB_REF_NAME}" git push origin gh-pages - delete-tag-docs: - name: Remove deleted tag docs - runs-on: ubuntu-latest - - if: github.event_name == 'delete' && github.event.ref_type == 'tag' - - steps: - - name: Checkout gh-pages - uses: actions/checkout@v5 - with: - ref: gh-pages - path: gh-pages - - - name: Remove documentation for deleted tag - run: | - TAG="${GITHUB_REF_NAME}" - VERSION_DIR="docs/${TAG}" - rm -rf "gh-pages/${VERSION_DIR}" - - - name: Rebuild index.html - run: | - cd gh-pages - - # Regenerate index.html (same as above) - cat > index.html << 'EOF' - - - - - Javadoc Versions - - - -

Available Javadoc Versions

-
    - EOF - - for dir in docs/*; do - [ -d "$dir" ] || continue - ver=$(basename "$dir") - echo "
  • $ver
  • " >> index.html - done - - cat >> index.html << 'EOF' -
- - - EOF - - - name: Commit & push removal - run: | - cd gh-pages - git config user.email "github-actions[bot]@users.noreply.github.com" - git config user.name "github-actions[bot]" - - if git diff --quiet; then - echo "No changes to commit" - exit 0 - fi - - git add . - git commit -m "Remove Javadoc for deleted tag ${GITHUB_REF_NAME}" - git push origin gh-pages - +# delete-tag-docs: +# name: Remove deleted tag docs +# runs-on: ubuntu-latest +# +# if: github.event_name == 'delete' && github.event.ref_type == 'tag' +# +# steps: +# - name: Checkout gh-pages +# uses: actions/checkout@v5 +# with: +# ref: gh-pages +# path: gh-pages +# +# - name: Remove documentation for deleted tag +# run: | +# TAG="${GITHUB_REF_NAME}" +# VERSION_DIR="docs/${TAG}" +# rm -rf "gh-pages/${VERSION_DIR}" +# +# - name: Rebuild index.html +# run: | +# cd gh-pages +# +# # Regenerate index.html (same as above) +# cat > index.html << 'EOF' +# +# +# +# +# Javadoc Versions +# +# +# +#

Available Javadoc Versions

+#
    +# EOF +# +# for dir in docs/*; do +# [ -d "$dir" ] || continue +# ver=$(basename "$dir") +# echo "
  • $ver
  • " >> index.html +# done +# +# cat >> index.html << 'EOF' +#
+# +# +# EOF +# +# - name: Commit & push removal +# run: | +# cd gh-pages +# git config user.email "github-actions[bot]@users.noreply.github.com" +# git config user.name "github-actions[bot]" +# +# if git diff --quiet; then +# echo "No changes to commit" +# exit 0 +# fi +# +# git add . +# git commit -m "Remove Javadoc for deleted tag ${GITHUB_REF_NAME}" +# git push origin gh-pages +# # Summary of Features # Multi-version docs (docs//) # Auto-generated index.html From 66901a35fb8b2c51d5fc57a9e51e6e2fcc48fd52 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Mon, 17 Nov 2025 20:48:54 -0800 Subject: [PATCH 06/12] debug javadoc.yml --- .github/workflows/auto-os-matrix.yml | 14 +++++++------- .github/workflows/javadoc.yml | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/auto-os-matrix.yml b/.github/workflows/auto-os-matrix.yml index df3c7135f..629a0a0d7 100644 --- a/.github/workflows/auto-os-matrix.yml +++ b/.github/workflows/auto-os-matrix.yml @@ -1,13 +1,13 @@ name: Auto OS Matrix Test & Install on: - push: - paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] - branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] - pull_request: - paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] - # The branches below must be a subset of the branches above - branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] +# push: +# paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] +# branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] +# pull_request: +# paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] +# # The branches below must be a subset of the branches above +# branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] workflow_dispatch: env: diff --git a/.github/workflows/javadoc.yml b/.github/workflows/javadoc.yml index 
441a82634..72ff4c52b 100644 --- a/.github/workflows/javadoc.yml +++ b/.github/workflows/javadoc.yml @@ -66,7 +66,7 @@ jobs: # TAG="${GITHUB_REF_NAME}" TAG="9.0.0-rc" VERSION_DIR="docs/${TAG}" - + echo $VERSION_DIR rm -rf "gh-pages/${VERSION_DIR}" mkdir -p "gh-pages/${VERSION_DIR}" cp -r target/reports/apidocs/* "gh-pages/${VERSION_DIR}/" From c6c8c10a7e3449eed70c5c8151287ff433c34909 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Mon, 17 Nov 2025 21:15:08 -0800 Subject: [PATCH 07/12] Debug javadoc.yml --- .github/workflows/auto-jdk-matrix.yml | 12 ++++++------ .github/workflows/check_cpp_files.yml | 14 +++++++------- .github/workflows/codeql-analysis.yml | 14 +++++++------- .github/workflows/javadoc.yml | 6 ++++++ 4 files changed, 26 insertions(+), 20 deletions(-) diff --git a/.github/workflows/auto-jdk-matrix.yml b/.github/workflows/auto-jdk-matrix.yml index 0d387c944..4f731fbc5 100644 --- a/.github/workflows/auto-jdk-matrix.yml +++ b/.github/workflows/auto-jdk-matrix.yml @@ -1,12 +1,12 @@ name: Auto JDK Matrix Test & Install on: - push: - branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] - pull_request: - paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] - # The branches below must be a subset of the branches above - branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] +# push: +# branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] +# pull_request: +# paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] +# # The branches below must be a subset of the branches above +# branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] workflow_dispatch: env: diff --git a/.github/workflows/check_cpp_files.yml b/.github/workflows/check_cpp_files.yml index 694aa139d..1fc522ad5 100644 --- a/.github/workflows/check_cpp_files.yml +++ b/.github/workflows/check_cpp_files.yml @@ -1,13 +1,13 @@ name: CPP SerDe Compatibility Test on: - push: - paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', 
'**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] - branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] - pull_request: - paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] - # The branches below must be a subset of the branches above - branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] +# push: +# paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] +# branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] +# pull_request: +# paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] +# # The branches below must be a subset of the branches above +# branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] workflow_dispatch: jobs: diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index 0a6de05d9..57fc7706f 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -1,13 +1,13 @@ name: "CodeQL" on: - push: - paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] - branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] - pull_request: - paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] - # The branches below must be a subset of the branches above - branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] +# push: +# paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] +# branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] +# pull_request: +# paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] +# # The branches below must be a subset of the branches above +# branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] workflow_dispatch: jobs: diff --git a/.github/workflows/javadoc.yml b/.github/workflows/javadoc.yml index 
72ff4c52b..6f797d11a 100644 --- a/.github/workflows/javadoc.yml +++ b/.github/workflows/javadoc.yml @@ -69,6 +69,12 @@ jobs: echo $VERSION_DIR rm -rf "gh-pages/${VERSION_DIR}" mkdir -p "gh-pages/${VERSION_DIR}" + TGT_DIR="target/reports/apidocs/" + if [ -d "$TGT_DIR" ]; then + echo "$TGT_DIR" exists." + else + echo "$TGT_DIR" does not exist." + fi cp -r target/reports/apidocs/* "gh-pages/${VERSION_DIR}/" - name: Regenerate multi-version index.html From 1b014bea1d6eb355e00d5a780836579a56eeeb9f Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Mon, 17 Nov 2025 21:31:19 -0800 Subject: [PATCH 08/12] debug javadoc.yml --- .github/workflows/javadoc.yml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/workflows/javadoc.yml b/.github/workflows/javadoc.yml index 6f797d11a..ea8761979 100644 --- a/.github/workflows/javadoc.yml +++ b/.github/workflows/javadoc.yml @@ -69,13 +69,14 @@ jobs: echo $VERSION_DIR rm -rf "gh-pages/${VERSION_DIR}" mkdir -p "gh-pages/${VERSION_DIR}" - TGT_DIR="target/reports/apidocs/" - if [ -d "$TGT_DIR" ]; then - echo "$TGT_DIR" exists." + SRC_DIR="target/reports/apidocs/" + echo $SRC_DIR + if [ -d $SRC_DIR ]; then + echo $SRC_DIR exists." else - echo "$TGT_DIR" does not exist." + echo $SRC_DIR does not exist." 
fi - cp -r target/reports/apidocs/* "gh-pages/${VERSION_DIR}/" + cp -r target/reports/apidocs/ "gh-pages/${VERSION_DIR}/" - name: Regenerate multi-version index.html run: | From 1f52203ff750077eb31e4c6b8a396e8a34ac000c Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Mon, 17 Nov 2025 21:39:46 -0800 Subject: [PATCH 09/12] debug javadoc.yml --- .github/workflows/javadoc.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/javadoc.yml b/.github/workflows/javadoc.yml index ea8761979..e4090dd3d 100644 --- a/.github/workflows/javadoc.yml +++ b/.github/workflows/javadoc.yml @@ -66,15 +66,15 @@ jobs: # TAG="${GITHUB_REF_NAME}" TAG="9.0.0-rc" VERSION_DIR="docs/${TAG}" - echo $VERSION_DIR + echo "$VERSION_DIR" rm -rf "gh-pages/${VERSION_DIR}" mkdir -p "gh-pages/${VERSION_DIR}" SRC_DIR="target/reports/apidocs/" echo $SRC_DIR - if [ -d $SRC_DIR ]; then - echo $SRC_DIR exists." + if [ -d "$SRC_DIR" ]; then + echo "$SRC_DIR exists." else - echo $SRC_DIR does not exist." + echo "$SRC_DIR does not exist." fi cp -r target/reports/apidocs/ "gh-pages/${VERSION_DIR}/" From f6d3f6a2cca30d41f1a65db09d5fc815a1caeb46 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Wed, 19 Nov 2025 14:01:19 -0800 Subject: [PATCH 10/12] Updates to javadoc.yml --- .github/workflows/javadoc.yml | 264 ++++++++-------------------------- 1 file changed, 61 insertions(+), 203 deletions(-) diff --git a/.github/workflows/javadoc.yml b/.github/workflows/javadoc.yml index e4090dd3d..22bde6d5e 100644 --- a/.github/workflows/javadoc.yml +++ b/.github/workflows/javadoc.yml @@ -1,209 +1,67 @@ -name: JavaDoc Releases +name: Deploy Versioned Javadoc (Manual Trigger) + +# Select the target TAG where to run the workflow from. +# This TAG name becomes the subdirectory under branch gh-pages/docs/${TAG} +# where the javadocs will be copied to. 
on: - push: - tags: - - "*" # Only publish docs for tags - delete: - tags: - - "*" # Auto-remove docs when tags are deleted workflow_dispatch: - -permissions: - contents: write - pages: write - id-token: write + inputs: + tag_ref: + description: 'Existing Git Tag to deploy (e.g., 1.0.0)' + required: true + default: '1.0.0' # Default can be left blank or set to a placeholder jobs: - build: - name: Build JavaDoc - runs-on: ubuntu-latest - - steps: - - name: Checkout source - uses: actions/checkout@v5 - - - name: Set up JDK - uses: actions/setup-java@v5 - with: - distribution: temurin - java-version: 25 - - - name: Set up Maven cache - uses: actions/cache@v4 - with: - path: | - ~/.m2/repository - key: maven-${{ runner.os }}-${{ hashFiles('**/pom.xml') }} - restore-keys: | - maven-${{ runner.os }}- - - - name: Build JavaDoc - run: mvn -B javadoc:javadoc - - # Upload for GitHub Pages preview (this is optional) - - name: Upload Pages Artifact - uses: actions/upload-pages-artifact@v4 - with: - path: target/reports/apidocs - - update-gh-pages: - name: Update gh-pages - needs: build + build-and-deploy-javadoc: runs-on: ubuntu-latest - - if: github.event_name != 'delete' # skip this job on delete events - - steps: - - name: Checkout gh-pages branch - uses: actions/checkout@v5 - with: - ref: gh-pages - path: gh-pages - - - name: Copy Javadoc into versioned folder - run: | - # TAG="${GITHUB_REF_NAME}" - TAG="9.0.0-rc" - VERSION_DIR="docs/${TAG}" - echo "$VERSION_DIR" - rm -rf "gh-pages/${VERSION_DIR}" - mkdir -p "gh-pages/${VERSION_DIR}" - SRC_DIR="target/reports/apidocs/" - echo $SRC_DIR - if [ -d "$SRC_DIR" ]; then - echo "$SRC_DIR exists." - else - echo "$SRC_DIR does not exist." - fi - cp -r target/reports/apidocs/ "gh-pages/${VERSION_DIR}/" - - - name: Regenerate multi-version index.html - run: | - cd gh-pages - - # Create index header - cat > index.html << 'EOF' - - - - - Javadoc Versions - - - -

Available Javadoc Versions

-
    - EOF - - # Insert entries for each version directory - for dir in docs/*; do - [ -d "$dir" ] || continue - ver=$(basename "$dir") - echo "
  • $ver
  • " >> index.html - done - - # Close HTML - cat >> index.html << 'EOF' -
- - - EOF - - - name: Commit & push changes - run: | - cd gh-pages - git config user.email "github-actions[bot]@users.noreply.github.com" - git config user.name "github-actions[bot]" - - if git diff --quiet; then - echo "No changes to commit" - exit 0 - fi - - git add . - git commit -m "Update release Javadoc for ${GITHUB_REF_NAME}" - git push origin gh-pages - -# delete-tag-docs: -# name: Remove deleted tag docs -# runs-on: ubuntu-latest -# -# if: github.event_name == 'delete' && github.event.ref_type == 'tag' -# -# steps: -# - name: Checkout gh-pages -# uses: actions/checkout@v5 -# with: -# ref: gh-pages -# path: gh-pages -# -# - name: Remove documentation for deleted tag -# run: | -# TAG="${GITHUB_REF_NAME}" -# VERSION_DIR="docs/${TAG}" -# rm -rf "gh-pages/${VERSION_DIR}" -# -# - name: Rebuild index.html -# run: | -# cd gh-pages -# -# # Regenerate index.html (same as above) -# cat > index.html << 'EOF' -# -# -# -# -# Javadoc Versions -# -# -# -#

Available Javadoc Versions

-#
    -# EOF -# -# for dir in docs/*; do -# [ -d "$dir" ] || continue -# ver=$(basename "$dir") -# echo "
  • $ver
  • " >> index.html -# done -# -# cat >> index.html << 'EOF' -#
-# -# -# EOF -# -# - name: Commit & push removal -# run: | -# cd gh-pages -# git config user.email "github-actions[bot]@users.noreply.github.com" -# git config user.name "github-actions[bot]" -# -# if git diff --quiet; then -# echo "No changes to commit" -# exit 0 -# fi -# -# git add . -# git commit -m "Remove Javadoc for deleted tag ${GITHUB_REF_NAME}" -# git push origin gh-pages -# -# Summary of Features -# Multi-version docs (docs//) -# Auto-generated index.html -# Only publish for tags -# Delete docs when tags are deleted -# Upload Pages artifact -# Maven dependency caching -# No third-party actions -# No force-push -# Clean HTML layout + permissions: + contents: write + pages: write + id-token: write + + steps: + - name: Checkout Code at Specified Tag + uses: actions/checkout@v5 + with: + ref: ${{ github.event.inputs.tag_ref }} # from manual trigger input + fetch-depth: 0 + + - name: Set up JDK + uses: actions/setup-java@v4 + with: + java-version: '25' + distribution: 'temurin' + cache: 'maven' + + - name: Build and Generate Javadoc + run: mvn javadoc:javadoc + + - name: Deploy Javadoc to gh-pages/docs/${TAG} + env: + GH_PAGES_EMAIL: noreply@github.com + GH_PAGES_NAME: github-actions[bot] + GIT_TAG_NAME: ${{ github.event.inputs.tag_ref }} + TARGET_DIR: docs/${{ github.event.inputs.tag_ref }} + run: | + # 1. Configure Git user + git config user.email "${GH_PAGES_EMAIL}" + git config user.name "${GH_PAGES_NAME}" + + # 2. Fetch and checkout the existing gh-pages branch + git fetch origin gh-pages:gh-pages + git checkout gh-pages + + # 3. Clean up any previous documentation for this tag (optional, but safer) + rm -rf $TARGET_DIR + + # 4. Create the versioned directory structure + mkdir -p $TARGET_DIR + + # 5. Copy the generated Javadoc files into the versioned directory + cp -r target/reports/apidocs/* $TARGET_DIR/ + + # 6. 
Add the new directory and files, commit, and push + git add $TARGET_DIR + git commit -m "Manual Javadoc deployment for tag ${GIT_TAG_NAME} into $TARGET_DIR" + git push origin gh-pages From 75a363f31687407e80baa6d84d9393e487e24d7b Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Wed, 19 Nov 2025 16:31:01 -0800 Subject: [PATCH 11/12] Re-enable workflows --- .github/workflows/auto-jdk-matrix.yml | 12 ++++++------ .github/workflows/auto-os-matrix.yml | 14 +++++++------- .github/workflows/check_cpp_files.yml | 14 +++++++------- .github/workflows/codeql-analysis.yml | 14 +++++++------- 4 files changed, 27 insertions(+), 27 deletions(-) diff --git a/.github/workflows/auto-jdk-matrix.yml b/.github/workflows/auto-jdk-matrix.yml index 4f731fbc5..0d387c944 100644 --- a/.github/workflows/auto-jdk-matrix.yml +++ b/.github/workflows/auto-jdk-matrix.yml @@ -1,12 +1,12 @@ name: Auto JDK Matrix Test & Install on: -# push: -# branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] -# pull_request: -# paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] -# # The branches below must be a subset of the branches above -# branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] + push: + branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] + pull_request: + paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] + # The branches below must be a subset of the branches above + branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] workflow_dispatch: env: diff --git a/.github/workflows/auto-os-matrix.yml b/.github/workflows/auto-os-matrix.yml index 629a0a0d7..df3c7135f 100644 --- a/.github/workflows/auto-os-matrix.yml +++ b/.github/workflows/auto-os-matrix.yml @@ -1,13 +1,13 @@ name: Auto OS Matrix Test & Install on: -# push: -# paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] -# branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] -# pull_request: -# 
paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] -# # The branches below must be a subset of the branches above -# branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] + push: + paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] + branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] + pull_request: + paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] + # The branches below must be a subset of the branches above + branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] workflow_dispatch: env: diff --git a/.github/workflows/check_cpp_files.yml b/.github/workflows/check_cpp_files.yml index 1fc522ad5..694aa139d 100644 --- a/.github/workflows/check_cpp_files.yml +++ b/.github/workflows/check_cpp_files.yml @@ -1,13 +1,13 @@ name: CPP SerDe Compatibility Test on: -# push: -# paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] -# branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] -# pull_request: -# paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] -# # The branches below must be a subset of the branches above -# branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] + push: + paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] + branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] + pull_request: + paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] + # The branches below must be a subset of the branches above + branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] workflow_dispatch: jobs: diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index 57fc7706f..0a6de05d9 100644 --- a/.github/workflows/codeql-analysis.yml +++ 
b/.github/workflows/codeql-analysis.yml @@ -1,13 +1,13 @@ name: "CodeQL" on: -# push: -# paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] -# branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] -# pull_request: -# paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] -# # The branches below must be a subset of the branches above -# branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] + push: + paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] + branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] + pull_request: + paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] + # The branches below must be a subset of the branches above + branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] workflow_dispatch: jobs: From f81137045c9afc0ce0c1f49864dc953cda9c9bb5 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Wed, 19 Nov 2025 17:48:44 -0800 Subject: [PATCH 12/12] Anticipate next release snapshot --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index befed4013..ab514cef5 100644 --- a/pom.xml +++ b/pom.xml @@ -33,7 +33,7 @@ under the License. org.apache.datasketches datasketches-java - 9.0.0-SNAPSHOT + 9.0.1-SNAPSHOT jar ${project.artifactId}