From 82630e554e35d702ec6358b1d1ec5e1f186e7447 Mon Sep 17 00:00:00 2001 From: Mahesh G Pai Date: Sat, 7 Jun 2025 11:28:16 +0530 Subject: [PATCH 1/6] Provide get_centroids implementation --- tdigest/include/tdigest.hpp | 5 +++++ tdigest/include/tdigest_impl.hpp | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/tdigest/include/tdigest.hpp b/tdigest/include/tdigest.hpp index d33084ed..21cf47a2 100644 --- a/tdigest/include/tdigest.hpp +++ b/tdigest/include/tdigest.hpp @@ -143,6 +143,11 @@ class tdigest { */ uint64_t get_total_weight() const; + /** + * @return centroids + */ + vector_centroid get_centroids() const; + /** * Returns an instance of the allocator for this t-Digest. * @return allocator diff --git a/tdigest/include/tdigest_impl.hpp b/tdigest/include/tdigest_impl.hpp index 6e3ae1a0..73429f6d 100644 --- a/tdigest/include/tdigest_impl.hpp +++ b/tdigest/include/tdigest_impl.hpp @@ -85,6 +85,11 @@ uint64_t tdigest::get_total_weight() const { return centroids_weight_ + buffer_.size(); } +template +auto tdigest::get_centroids() const -> vector_centroid{ + return centroids_; +} + template A tdigest::get_allocator() const { return buffer_.get_allocator(); From 866f6d036a7fe91153d01d9648f9127755e5af77 Mon Sep 17 00:00:00 2001 From: Mahesh G Pai Date: Mon, 9 Jun 2025 19:54:59 +0530 Subject: [PATCH 2/6] Introduced const_iterator for tdigest --- tdigest/include/tdigest.hpp | 42 ++++++++++++++++++--- tdigest/include/tdigest_impl.hpp | 64 +++++++++++++++++++++++++++++--- tdigest/test/tdigest_test.cpp | 14 +++++++ 3 files changed, 110 insertions(+), 10 deletions(-) diff --git a/tdigest/include/tdigest.hpp b/tdigest/include/tdigest.hpp index 21cf47a2..e821e4c0 100644 --- a/tdigest/include/tdigest.hpp +++ b/tdigest/include/tdigest.hpp @@ -143,11 +143,6 @@ class tdigest { */ uint64_t get_total_weight() const; - /** - * @return centroids - */ - vector_centroid get_centroids() const; - /** * Returns an instance of the allocator for this t-Digest. * @return allocator @@ -262,6 +257,21 @@ class tdigest { */ static tdigest deserialize(const void* bytes, size_t size, const Allocator& allocator = Allocator()); + class const_iterator; + + /** + * Iterator pointing to the first centroid in the sketch. + * If the sketch is empty, the returned iterator must not be dereferenced or incremented. + * @return iterator pointing to the first centroid in the sketch + */ + const_iterator begin() const; + + /** + * Iterator pointing to the past-the-end centroid in the sketch. + * It does not point to any centroid, and must not be dereferenced or incremented. + * @return iterator pointing to the past-the-end centroid in the sketch + */ + const_iterator end() const; private: bool reverse_merge_; uint16_t k_; @@ -302,6 +312,28 @@ class tdigest { static inline void check_split_points(const T* values, uint32_t size); }; +template +class tdigest::const_iterator { +public: + using iterator_category = std::input_iterator_tag; + using value_type = std::pair; + using difference_type = void; + using pointer = const return_value_holder; + using reference = const value_type; + + const_iterator(const tdigest &tdigest_, bool is_end); + + const_iterator& operator++(); + const_iterator& operator++(int); + bool operator==(const const_iterator& other) const; + bool operator!=(const const_iterator& other) const; + reference operator*() const; + pointer operator->() const; +private: + friend class tdigest; + uint32_t index_; + vector_centroid centroids_; +}; } /* namespace datasketches */ #include "tdigest_impl.hpp" diff --git a/tdigest/include/tdigest_impl.hpp b/tdigest/include/tdigest_impl.hpp index 73429f6d..49fd98a5 100644 --- a/tdigest/include/tdigest_impl.hpp +++ b/tdigest/include/tdigest_impl.hpp @@ -85,11 +85,6 @@ uint64_t tdigest::get_total_weight() const { return centroids_weight_ + buffer_.size(); } -template -auto tdigest::get_centroids() const -> vector_centroid{ - return centroids_; -} - template A tdigest::get_allocator() const { return buffer_.get_allocator(); @@ -632,6 +627,65 @@ void tdigest::check_split_points(const T* values, uint32_t size) { } } +template +typename tdigest::const_iterator tdigest::begin() const { + return tdigest::const_iterator(*this, false); +} + +template + typename tdigest::const_iterator tdigest::end() const { + return tdigest::const_iterator(*this, true); +} + +template +tdigest::const_iterator::const_iterator(const tdigest& tdigest_, const bool is_end): + centroids_() +{ + // Create a copy of the tdigest to generate the centroids after processing the buffered values + tdigest tmp(tdigest_); + tmp.compress(); + centroids_.insert(centroids_.end(), tmp.centroids_.begin(), tmp.centroids_.end()); + + if (is_end) { + index_ = centroids_.size(); + } else { + index_ = 0; + } +} + +template +typename tdigest::const_iterator& tdigest::const_iterator::operator++() { + ++index_; + return *this; +} + +template +typename tdigest::const_iterator& tdigest::const_iterator::operator++(int) { + const_iterator tmp(*this); + operator++(); + return tmp; +} + +template +bool tdigest::const_iterator::operator==(const const_iterator& other) const { + return index_ == other.index_; +} + +template +bool tdigest::const_iterator::operator!=(const const_iterator& other) const { + return !operator==(other); +} + +template +auto tdigest::const_iterator::operator*() const -> reference { + return value_type(centroids_[index_].get_mean(), centroids_[index_].get_weight()); +} + +template +auto tdigest::const_iterator::operator->() const -> pointer { + return **this; +} + } /* namespace datasketches */ #endif // _TDIGEST_IMPL_HPP_ diff --git a/tdigest/test/tdigest_test.cpp b/tdigest/test/tdigest_test.cpp index fc3f5d1c..41b00943 100644 --- a/tdigest/test/tdigest_test.cpp +++ b/tdigest/test/tdigest_test.cpp @@ -453,4 +453,18 @@ TEST_CASE("deserialize from reference implementation bytes float", "[tdigest]") REQUIRE(td.get_rank(n) == 1); } +TEST_CASE("iterate centroids", "[tdigest]") { + tdigest_double td(100); + for (int i = 0; i < 10; i++) { + td.update(i); + } + + auto centroid_count = 0; + for (const auto ¢roid: td) { + centroid_count++; + } + // Ensure that centroids are retrieved for a case where there is buffered values + REQUIRE(centroid_count == 10); +} + } /* namespace datasketches */ From 315e50b290387ac0af2b6dc032bf09715d2a8deb Mon Sep 17 00:00:00 2001 From: Mahesh G Pai Date: Tue, 10 Jun 2025 00:28:48 +0530 Subject: [PATCH 3/6] Addressing review comments --- tdigest/include/tdigest.hpp | 8 ++++++-- tdigest/include/tdigest_impl.hpp | 24 +++++++++++++++++++++--- tdigest/test/tdigest_test.cpp | 3 +++ 3 files changed, 30 insertions(+), 5 deletions(-) diff --git a/tdigest/include/tdigest.hpp b/tdigest/include/tdigest.hpp index e821e4c0..2ad410f5 100644 --- a/tdigest/include/tdigest.hpp +++ b/tdigest/include/tdigest.hpp @@ -106,6 +106,12 @@ class tdigest { */ explicit tdigest(uint16_t k = DEFAULT_K, const Allocator& allocator = Allocator()); + /** + * Copy constructor + * @param other sketch to be copied + */ + tdigest(const tdigest& other); + /** * Update this t-Digest with the given value * @param value to update the t-Digest with @@ -275,13 +281,11 @@ class tdigest { private: bool reverse_merge_; uint16_t k_; - uint16_t internal_k_; T min_; T max_; size_t centroids_capacity_; vector_centroid centroids_; uint64_t centroids_weight_; - size_t buffer_capacity_; vector_t buffer_; static const size_t BUFFER_MULTIPLIER = 4; diff --git a/tdigest/include/tdigest_impl.hpp b/tdigest/include/tdigest_impl.hpp index 49fd98a5..1dba9eb1 100644 --- a/tdigest/include/tdigest_impl.hpp +++ b/tdigest/include/tdigest_impl.hpp @@ -597,6 +597,24 @@ bool tdigest::is_single_value() const { return get_total_weight() == 1; } +template +tdigest::tdigest(const tdigest& other): + reverse_merge_(other.reverse_merge_), + k_(other.k_), + min_(other.min_), + max_(other.max_), + centroids_capacity_(other.centroids_capacity_), + centroids_(other.centroids_, other.get_allocator()), + centroids_weight_(other.centroids_weight_), + buffer_(other.buffer_, other.get_allocator()) +{ + if (other.k_ < 10) throw std::invalid_argument("k must be at least 10"); + const size_t fudge = other.k_ < 30 ? 30 : 10; + centroids_capacity_ = 2 * k_ + fudge; + centroids_.reserve(centroids_capacity_); + buffer_.reserve(centroids_capacity_ * BUFFER_MULTIPLIER); +} + template tdigest::tdigest(bool reverse_merge, uint16_t k, T min, T max, vector_centroid&& centroids, uint64_t weight, vector_t&& buffer): reverse_merge_(reverse_merge), @@ -638,11 +656,11 @@ template } template -tdigest::const_iterator::const_iterator(const tdigest& tdigest_, const bool is_end): - centroids_() +tdigest::const_iterator::const_iterator(const tdigest& tdigest_, const bool is_end): + centroids_(tdigest_.get_allocator()) { // Create a copy of the tdigest to generate the centroids after processing the buffered values - tdigest tmp(tdigest_); + tdigest tmp(tdigest_); tmp.compress(); centroids_.insert(centroids_.end(), tmp.centroids_.begin(), tmp.centroids_.end()); diff --git a/tdigest/test/tdigest_test.cpp b/tdigest/test/tdigest_test.cpp index 41b00943..9f92094d 100644 --- a/tdigest/test/tdigest_test.cpp +++ b/tdigest/test/tdigest_test.cpp @@ -460,11 +460,14 @@ TEST_CASE("iterate centroids", "[tdigest]") { } auto centroid_count = 0; + uint64_t total_weight = 0; for (const auto ¢roid: td) { centroid_count++; + total_weight += centroid.second; } // Ensure that centroids are retrieved for a case where there is buffered values REQUIRE(centroid_count == 10); + REQUIRE(td.get_total_weight() == total_weight); } } /* namespace datasketches */ From faca5d0262173c96c846d0d36d900cc5bfa48b6d Mon Sep 17 00:00:00 2001 From: Mahesh G Pai Date: Wed, 11 Jun 2025 22:47:10 +0530 Subject: [PATCH 4/6] Retaining the default copy constructor --- tdigest/include/tdigest.hpp | 6 ------ tdigest/include/tdigest_impl.hpp | 18 ------------------ 2 files changed, 24 deletions(-) diff --git a/tdigest/include/tdigest.hpp b/tdigest/include/tdigest.hpp index 2ad410f5..99e8dfa3 100644 --- a/tdigest/include/tdigest.hpp +++ b/tdigest/include/tdigest.hpp @@ -106,12 +106,6 @@ class tdigest { */ explicit tdigest(uint16_t k = DEFAULT_K, const Allocator& allocator = Allocator()); - /** - * Copy constructor - * @param other sketch to be copied - */ - tdigest(const tdigest& other); - /** * Update this t-Digest with the given value * @param value to update the t-Digest with diff --git a/tdigest/include/tdigest_impl.hpp b/tdigest/include/tdigest_impl.hpp index 1dba9eb1..ab4ce9e4 100644 --- a/tdigest/include/tdigest_impl.hpp +++ b/tdigest/include/tdigest_impl.hpp @@ -597,24 +597,6 @@ bool tdigest::is_single_value() const { return get_total_weight() == 1; } -template -tdigest::tdigest(const tdigest& other): - reverse_merge_(other.reverse_merge_), - k_(other.k_), - min_(other.min_), - max_(other.max_), - centroids_capacity_(other.centroids_capacity_), - centroids_(other.centroids_, other.get_allocator()), - centroids_weight_(other.centroids_weight_), - buffer_(other.buffer_, other.get_allocator()) -{ - if (other.k_ < 10) throw std::invalid_argument("k must be at least 10"); - const size_t fudge = other.k_ < 30 ? 30 : 10; - centroids_capacity_ = 2 * k_ + fudge; - centroids_.reserve(centroids_capacity_); - buffer_.reserve(centroids_capacity_ * BUFFER_MULTIPLIER); -} - template tdigest::tdigest(bool reverse_merge, uint16_t k, T min, T max, vector_centroid&& centroids, uint64_t weight, vector_t&& buffer): reverse_merge_(reverse_merge), From ada87563432eebc989088d6fab3a1fd4d0aabc36 Mon Sep 17 00:00:00 2001 From: Mahesh G Pai Date: Thu, 12 Jun 2025 11:32:08 +0530 Subject: [PATCH 5/6] Removing the unnecessary parameters --- tdigest/include/tdigest_impl.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tdigest/include/tdigest_impl.hpp b/tdigest/include/tdigest_impl.hpp index ab4ce9e4..0f53adc4 100644 --- a/tdigest/include/tdigest_impl.hpp +++ b/tdigest/include/tdigest_impl.hpp @@ -638,11 +638,11 @@ template } template -tdigest::const_iterator::const_iterator(const tdigest& tdigest_, const bool is_end): +tdigest::const_iterator::const_iterator(const tdigest& tdigest_, const bool is_end): centroids_(tdigest_.get_allocator()) { // Create a copy of the tdigest to generate the centroids after processing the buffered values - tdigest tmp(tdigest_); + tdigest tmp(tdigest_); tmp.compress(); centroids_.insert(centroids_.end(), tmp.centroids_.begin(), tmp.centroids_.end()); From 5be04f2561ca2d394f564b78400aa981d98b4c9e Mon Sep 17 00:00:00 2001 From: Mahesh G Pai Date: Fri, 13 Jun 2025 10:50:19 +0530 Subject: [PATCH 6/6] Review comments --- tdigest/include/tdigest.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tdigest/include/tdigest.hpp b/tdigest/include/tdigest.hpp index 99e8dfa3..cc7898e3 100644 --- a/tdigest/include/tdigest.hpp +++ b/tdigest/include/tdigest.hpp @@ -319,8 +319,6 @@ class tdigest::const_iterator { using pointer = const return_value_holder; using reference = const value_type; - const_iterator(const tdigest &tdigest_, bool is_end); - const_iterator& operator++(); const_iterator& operator++(int); bool operator==(const const_iterator& other) const; @@ -328,9 +326,10 @@ class tdigest::const_iterator { reference operator*() const; pointer operator->() const; private: - friend class tdigest; + friend class tdigest; uint32_t index_; vector_centroid centroids_; + const_iterator(const tdigest& tdigest_, bool is_end); }; } /* namespace datasketches */