diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index e05a98c..db8665f 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -27,6 +27,10 @@ jobs: - name: Run Unittests working-directory: ./build run: ./unittests + + - name: Run LOUDS Tree Tests + working-directory: ./build + run: ./louds_tree_tests build-and-test-with-SDE: runs-on: ubuntu-latest @@ -53,4 +57,8 @@ jobs: - name: Run Unittests working-directory: ./build - run: sde-external-9.58.0-2025-06-16-lin/sde64 -icl -emu-xinuse 0 -- ./unittests \ No newline at end of file + run: sde-external-9.58.0-2025-06-16-lin/sde64 -icl -emu-xinuse 0 -- ./unittests + + - name: Run LOUDS Tree Tests + working-directory: ./build + run: sde-external-9.58.0-2025-06-16-lin/sde64 -icl -emu-xinuse 0 -- ./louds_tree_tests \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 63b1877..319f813 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -93,6 +93,20 @@ target_include_directories(bench_rmm target_link_libraries(bench_rmm benchmark) + +add_executable(louds_tree_benchmarks + src/louds_tree_benchmarks.cpp) + +target_include_directories(louds_tree_benchmarks + PUBLIC include +) + +target_link_libraries(louds_tree_benchmarks + benchmark + benchmark_main +) + + add_executable(test_rmm src/test_rmm.cpp) target_include_directories(test_rmm @@ -103,6 +117,15 @@ target_link_libraries(test_rmm gtest gtest_main) +add_executable(louds_tree_tests + src/louds_tree_tests.cpp) +target_include_directories(louds_tree_tests + PUBLIC include + PUBLIC ${GOOGLETEST_SOURCE_DIR}/include +) +target_link_libraries(louds_tree_tests + gtest + gtest_main) FetchContent_Declare( doxygen-awesome-css diff --git a/include/bits.h b/include/bits.h index c7ee26d..dd87f7f 100644 --- a/include/bits.h +++ b/include/bits.h @@ -128,8 +128,8 @@ uint64_t select_512(const uint64_t* x, uint64_t rank) { #ifdef PIXIE_AVX512_SUPPORT __m512i res = _mm512_loadu_epi64(x); - std::array counts; - _mm512_storeu_epi64(counts.data(), _mm512_popcnt_epi64(res)); + alignas(64) std::array counts; + _mm512_store_epi64(counts.data(), _mm512_popcnt_epi64(res)); size_t i = 0; while (i < 8 && counts[i] <= rank) { @@ -150,6 +150,37 @@ uint64_t select_512(const uint64_t* x, uint64_t rank) { #endif } +/** + * @brief Return position of @p rank0 0 bit in @p x + * @details select_512 with bit inversion. + */ +uint64_t select0_512(const uint64_t* x, uint64_t rank0) { +#ifdef PIXIE_AVX512_SUPPORT + + __m512i res = _mm512_loadu_epi64(x); + res = _mm512_xor_epi64(res, _mm512_set1_epi64(-1)); + alignas(64) std::array counts; + _mm512_store_epi64(counts.data(), _mm512_popcnt_epi64(res)); + + size_t i = 0; + while (i < 8 && counts[i] <= rank0) { + rank0 -= counts[i++]; + } + return i * 64 + select_64(~x[i], rank0); + +#else + + size_t i = 0; + int popcount = std::popcount(~x[0]); + while (i < 7 && popcount <= rank0) { + rank0 -= popcount; + popcount = std::popcount(~x[++i]); + } + return i * 64 + select_64(~x[i], rank0); + +#endif +} + /** * @brief Compare 4 64-bit numbers of @p x with @p y and * return the length of the prefix where @p y is less then @p x @@ -190,6 +221,61 @@ uint16_t lower_bound_4x64(const uint64_t* x, uint64_t y) { #endif } +/** + * @brief Compare 4 64-bit numbers of ( @p dlt_array + @p dlt_scalar - @p x ) + * with @p y and return the length of the prefix + * where @p y is less then ( @p dlt_array + @p dlt_scalar - @p x ) + */ +uint16_t lower_bound_dlt_4x64(const uint64_t* x, + uint64_t y, + const uint64_t* dlt_array, + uint64_t dlt_scalar) { +#ifdef PIXIE_AVX512_SUPPORT + + const __m256i dlt_256 = _mm256_loadu_epi64(dlt_array); + auto x_256 = _mm256_loadu_epi64(x); + auto dlt_4 = _mm256_set1_epi64x(dlt_scalar); + auto y_4 = _mm256_set1_epi64x(y); + + auto tmp = _mm256_add_epi64(dlt_4, dlt_256); + auto reg_256 = _mm256_sub_epi64(tmp, x_256); + auto cmp = _mm256_cmpge_epu64_mask(reg_256, y_4); + + return _tzcnt_u16(cmp); + +#else +#ifdef PIXIE_AVX2_SUPPORT + + const __m256i dlt_256 = + _mm256_loadu_si256(reinterpret_cast(dlt_array)); + auto x_256 = _mm256_loadu_si256(reinterpret_cast(x)); + auto dlt_4 = _mm256_set1_epi64x(dlt_scalar); + auto y_4 = _mm256_set1_epi64x(y); + + auto tmp = _mm256_add_epi64(dlt_4, dlt_256); + auto reg_256 = _mm256_sub_epi64(tmp, x_256); + + const __m256i offset = _mm256_set1_epi64x(0x8000000000000000ULL); + __m256i x_offset = _mm256_xor_si256(reg_256, offset); + __m256i y_offset = _mm256_xor_si256(y_4, offset); + auto mask = _mm256_movemask_epi8(_mm256_cmpgt_epi64( + x_offset, _mm256_sub_epi64(y_offset, _mm256_set1_epi64x(1)))); + + return _tzcnt_u32(mask) >> 3; + +#else + + for (uint16_t i = 0; i < 4; ++i) { + if (dlt_array[i] + dlt_scalar - x[i] >= y) { + return i; + } + } + return 4; + +#endif +#endif +} + /** * @brief Compare 8 64-bit numbers of @p x with @p y and * return the length of the prefix where @p y is less then @p x @@ -227,11 +313,57 @@ uint16_t lower_bound_8x64(const uint64_t* x, uint64_t y) { #endif } +/** + * @brief Compare 8 64-bit numbers of ( @p dlt_array + @p dlt_scalar - @p x ) + * with @p y and return the length of the prefix + * where @p y is less then ( @p dlt_array + @p dlt_scalar - @p x ) + */ +uint16_t lower_bound_dlt_8x64(const uint64_t* x, + uint64_t y, + const uint64_t* dlt_array, + uint64_t dlt_scalar) { +#ifdef PIXIE_AVX512_SUPPORT + + const __m512i dlt_512 = _mm512_loadu_epi64(dlt_array); + auto x_512 = _mm512_loadu_epi64(x); + auto dlt_8 = _mm512_set1_epi64(dlt_scalar); + auto y_8 = _mm512_set1_epi64(y); + + auto tmp = _mm512_add_epi64(dlt_8, dlt_512); + auto reg_512 = _mm512_sub_epi64(tmp, x_512); + auto cmp = _mm512_cmpge_epu64_mask(reg_512, y_8); + + return _tzcnt_u16(cmp); + +#else +#ifdef PIXIE_AVX2_SUPPORT + + uint16_t len = lower_bound_dlt_4x64(x, y, dlt_array, dlt_scalar); + + if (len < 4) { + return len; + } + + return len + lower_bound_dlt_4x64(x + 4, y, dlt_array + 4, dlt_scalar); + +#else + + for (uint16_t i = 0; i < 8; ++i) { + if (dlt_array[i] + dlt_scalar - x[i] >= y) { + return i; + } + } + return 8; + +#endif +#endif +} + /** * @brief Compare 32 16-bit numbers of @p x with @p y and * return the count of numbers where @p x is less then @p y */ -uint16_t lower_bound_32x16(const uint16_t* x, uint64_t y) { +uint16_t lower_bound_32x16(const uint16_t* x, uint16_t y) { #ifdef PIXIE_AVX512_SUPPORT auto y_32 = _mm512_set1_epi16(y); @@ -273,6 +405,72 @@ uint16_t lower_bound_32x16(const uint16_t* x, uint64_t y) { #endif } +/** + * @brief Compare 32 16-bit numbers of ( @p dlt_array + @p dlt_scalar - @p x ) + * with @p y and return the count of numbers where + * ( @p dlt_array + @p dlt_scalar - @p x ) is less then @p y + */ +uint16_t lower_bound_dlt_32x16(const uint16_t* x, + uint16_t y, + const uint16_t* dlt_array, + uint16_t dlt_scalar) { +#ifdef PIXIE_AVX512_SUPPORT + + const __m512i dlt_512 = _mm512_loadu_epi64(dlt_array); + auto x_512 = _mm512_loadu_epi64(x); + auto dlt_32 = _mm512_set1_epi16(dlt_scalar); + auto y_32 = _mm512_set1_epi16(y); + + auto tmp = _mm512_add_epi16(dlt_32, dlt_512); + auto reg_512 = _mm512_sub_epi16(tmp, x_512); + auto cmp = _mm512_cmplt_epu16_mask(reg_512, y_32); + return std::popcount(cmp); + +#else +#ifdef PIXIE_AVX2_SUPPORT + + auto dlt_256 = + _mm256_loadu_si256(reinterpret_cast(dlt_array)); + auto x_256 = _mm256_loadu_si256(reinterpret_cast(x)); + auto dlt_16 = _mm256_set1_epi16(dlt_scalar); + auto y_16 = _mm256_set1_epi16(y); + + auto tmp = _mm256_add_epi16(dlt_16, dlt_256); + auto reg_256 = _mm256_sub_epi16(tmp, x_256); + + const __m256i offset = _mm256_set1_epi16(0x8000); + __m256i x_offset = _mm256_xor_si256(reg_256, offset); + __m256i y_offset = _mm256_xor_si256(y_16, offset); + uint32_t mask = _mm256_movemask_epi8(_mm256_cmpgt_epi16(y_offset, x_offset)); + + uint16_t count = std::popcount(mask) >> 1; + + dlt_256 = + _mm256_loadu_si256(reinterpret_cast(dlt_array + 16)); + x_256 = _mm256_loadu_si256(reinterpret_cast(x + 16)); + + tmp = _mm256_add_epi16(dlt_16, dlt_256); + reg_256 = _mm256_sub_epi16(tmp, x_256); + + x_offset = _mm256_xor_si256(reg_256, offset); + mask = _mm256_movemask_epi8(_mm256_cmpgt_epi16(y_offset, x_offset)); + + return count + (std::popcount(mask) >> 1); + +#else + + uint16_t cnt = 0; + for (uint16_t i = 0; i < 32; ++i) { + if (dlt_array[i] + dlt_scalar - x[i] < y) { + cnt++; + } + } + return cnt; + +#endif +#endif +} + /** * @brief Calculates 64 popcounts of 4-bits integers and stores as 64 4-bits * integers (packed into 32 bytes) diff --git a/include/bitvector.h b/include/bitvector.h index 117e00a..43e5bc9 100644 --- a/include/bitvector.h +++ b/include/bitvector.h @@ -72,9 +72,13 @@ class BitVector { constexpr static size_t kSelectSampleFrequency = 16384; constexpr static size_t kBlocksPerSuperBlock = 128; + alignas(64) uint64_t dlt_super[8]; + alignas(64) uint16_t dlt_basic[32]; + std::vector super_block_rank; std::vector basic_block_rank; std::vector select_samples; + std::vector select0_samples; const size_t num_bits_; const size_t padded_size_; size_t max_rank; @@ -119,10 +123,14 @@ class BitVector { */ void build_select() { uint64_t milestone = kSelectSampleFrequency; + uint64_t milestone0 = kSelectSampleFrequency; uint64_t rank = 0; + uint64_t rank0 = 0; select_samples.emplace_back(0); + select0_samples.emplace_back(0); for (size_t i = 0; i < bits.size(); ++i) { auto ones = std::popcount(bits[i]); + auto zeros = 64 - ones; if (rank + ones >= milestone) { auto pos = select_64(bits[i], milestone - rank - 1); // TODO: try including global rank into select samples to save @@ -130,7 +138,20 @@ class BitVector { select_samples.emplace_back((64 * i + pos) / kSuperBlockSize); milestone += kSelectSampleFrequency; } + if (rank0 + zeros >= milestone0) { + auto pos = select_64(~bits[i], milestone0 - rank0 - 1); + select0_samples.emplace_back((64 * i + pos) / kSuperBlockSize); + milestone0 += kSelectSampleFrequency; + } rank += ones; + rank0 += zeros; + } + + for (size_t i = 0; i < 8; ++i) { + dlt_super[i] = i * kSuperBlockSize; + } + for (size_t i = 0; i < 32; ++i) { + dlt_basic[i] = i * kBasicBlockSize; } } @@ -160,6 +181,35 @@ class BitVector { return left - 1; } + /** + * @brief first step of the select0 operation + */ + uint64_t find_superblock_zeros(uint64_t rank0) const { + uint64_t left = select0_samples[rank0 / kSelectSampleFrequency]; + + while (left + 7 < super_block_rank.size()) { + auto len = lower_bound_dlt_8x64(&super_block_rank[left], rank0, dlt_super, + kSuperBlockSize * left); + if (len < 8) { + return left + len - 1; + } + left += 8; + } + if (left + 3 < super_block_rank.size()) { + auto len = lower_bound_dlt_4x64(&super_block_rank[left], rank0, dlt_super, + kSuperBlockSize * left); + if (len < 4) { + return left + len - 1; + } + left += 4; + } + while (left < super_block_rank.size() && + kSuperBlockSize * left - super_block_rank[left] < rank0) { + left++; + } + return left - 1; + } + /** * @brief SIMD-optimized linear scan * @details @@ -167,17 +217,32 @@ class BitVector { * 4 iterations. */ uint64_t find_basicblock(uint16_t local_rank, uint64_t s_block) const { - auto pos = 0; + for (size_t pos = 0; pos < kBlocksPerSuperBlock; pos += 32) { + auto count = lower_bound_32x16( + &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank); + if (count < 32) { + return kBlocksPerSuperBlock * s_block + pos + count - 1; + } + } + return kBlocksPerSuperBlock * s_block + kBlocksPerSuperBlock - 1; + } - for (size_t i = 0; i < 4; ++i) { - auto count = lower_bound_32x16(&basic_block_rank[128 * s_block + 32 * i], - local_rank); + /** + * @brief SIMD-optimized linear scan + * @details + * Processes 32 16-bit entries at once (full cache line), so there is at most + * 4 iterations. + */ + uint64_t find_basicblock_zeros(uint16_t local_rank0, uint64_t s_block) const { + for (size_t pos = 0; pos < kBlocksPerSuperBlock; pos += 32) { + auto count = lower_bound_dlt_32x16( + &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank0, + dlt_basic, kBasicBlockSize * pos); if (count < 32) { return kBlocksPerSuperBlock * s_block + pos + count - 1; } - pos += 32; } - return kBlocksPerSuperBlock * s_block + pos - 1; + return kBlocksPerSuperBlock * s_block + kBlocksPerSuperBlock - 1; } /** @@ -197,8 +262,8 @@ class BitVector { pos = pos + 16 < 32 ? 0 : (pos - 16); pos = pos > 96 ? 96 : pos; while (pos < 96) { - auto count = - lower_bound_32x16(&basic_block_rank[128 * s_block + pos], local_rank); + auto count = lower_bound_32x16( + &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank); if (count == 0) { return find_basicblock(local_rank, s_block); } @@ -208,14 +273,54 @@ class BitVector { pos += 32; } pos = 96; - auto count = - lower_bound_32x16(&basic_block_rank[128 * s_block + pos], local_rank); + auto count = lower_bound_32x16( + &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank); if (count == 0) { return find_basicblock(local_rank, s_block); } return kBlocksPerSuperBlock * s_block + pos + count - 1; } + /** + * @brief Interpolation search with SIMD optimization + * @details + * Similar to find_basicblock_zeros but initial guess is based on linear + * interpolation, for random data it should make initial guess correct + * most of the times, we start from the 32 wide block with interpolation + * guess at the center, if we see that select result lie in lower blocks + * we backoff to find_basicblock_zeros + */ + uint64_t find_basicblock_is_zeros(uint16_t local_rank0, + uint64_t s_block) const { + auto lower = kSuperBlockSize * s_block - super_block_rank[s_block]; + auto upper = + kSuperBlockSize * (s_block + 1) - super_block_rank[s_block + 1]; + + uint64_t pos = kBlocksPerSuperBlock * local_rank0 / (upper - lower); + pos = pos + 16 < 32 ? 0 : (pos - 16); + pos = pos > 96 ? 96 : pos; + while (pos < 96) { + auto count = lower_bound_dlt_32x16( + &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank0, + dlt_basic, kBasicBlockSize * pos); + if (count == 0) { + return find_basicblock_zeros(local_rank0, s_block); + } + if (count < 32) { + return kBlocksPerSuperBlock * s_block + pos + count - 1; + } + pos += 32; + } + pos = 96; + auto count = lower_bound_dlt_32x16( + &basic_block_rank[kBlocksPerSuperBlock * s_block + pos], local_rank0, + dlt_basic, kBasicBlockSize * pos); + if (count == 0) { + return find_basicblock_zeros(local_rank0, s_block); + } + return kBlocksPerSuperBlock * s_block + pos + count - 1; + } + public: /** * Constructor from an external array of uint64_t. @@ -248,6 +353,9 @@ class BitVector { * rank_1(pos) = number of 1s in positions [0...pos-1] */ uint64_t rank(size_t pos) const { + if (pos >= bits.size() * kWordSize) [[unlikely]] { + return max_rank; + } uint64_t b_block = pos / kBasicBlockSize; uint64_t s_block = pos / kSuperBlockSize; // Precomputed rank @@ -258,6 +366,21 @@ class BitVector { return result; } + /** + * Rank zero operation: count the number of 0-bits up to position pos + * (exclusive) rank_0(pos) = number of 0s in positions [0...pos-1] + */ + uint64_t rank0(size_t pos) const { + if (pos >= bits.size() * kWordSize) [[unlikely]] { + return bits.size() * kWordSize - max_rank; + } + return pos - rank(pos); + } + + /** + * Select operation: find the position of the i-th occurrence of a 1-bit + * select_1(rank) = index of the rank-th occurrence of 1-bit + */ uint64_t select(size_t rank) const { if (rank > max_rank) [[unlikely]] { return num_bits_; @@ -283,6 +406,36 @@ class BitVector { return kWordSize * pos + select_512(&bits[pos], rank - 1); } + /** + * Select zero operation: find the position of the i-th occurrence of a 0-bit + * select_0(rank0) = index of the rank0-th occurrence of 0-bit + */ + uint64_t select0(size_t rank0) const { + if (rank0 > num_bits_ - max_rank) [[unlikely]] { + return num_bits_; + } + if (rank0 == 0) [[unlikely]] { + return 0; + } + uint64_t s_block = find_superblock_zeros(rank0); + rank0 -= kSuperBlockSize * s_block - super_block_rank[s_block]; + auto pos = find_basicblock_is_zeros(rank0, s_block); + auto pos_in_super_block = pos & (kBlocksPerSuperBlock - 1); + rank0 -= kBasicBlockSize * pos_in_super_block - basic_block_rank[pos]; + pos *= kWordsPerBlock; + + // Final search + if (pos + kWordsPerBlock - 1 < kWordsPerBlock) [[unlikely]] { + size_t zeros = std::popcount(~bits[pos]); + while (pos < bits.size() && zeros < rank0) { + rank0 -= zeros; + zeros = std::popcount(~bits[++pos]); + } + return kWordSize * pos + select_64(~bits[pos], rank0 - 1); + } + return kWordSize * pos + select0_512(&bits[pos], rank0 - 1); + } + /** * Convert the bit vector to a binary string, for debug purpose only */ diff --git a/include/louds_tree.h b/include/louds_tree.h new file mode 100644 index 0000000..be1a291 --- /dev/null +++ b/include/louds_tree.h @@ -0,0 +1,115 @@ +#pragma once + +#include +#include + +#include "bitvector.h" + +namespace pixie { + +/** + * @brief A node class of LOUDS tree + */ +struct LoudsNode { + size_t number; + size_t pos; + + LoudsNode(size_t node_number, size_t louds_pos) + : number(node_number), pos(louds_pos) {} +}; + +/** + * @brief A tree class based on the level order unary degree sequence (LOUDS) + * representation + */ +class LoudsTree { + private: + BitVector bv; + + public: + /** + * @brief Constructor from an external array of uint64_t + */ + explicit LoudsTree(std::span louds, size_t tree_size) + : bv(louds, 2 * tree_size - 1) {} + + /** + * @brief Returns the root node + */ + LoudsNode root() const { return LoudsNode(0, 0); } + + /** + * @brief Returns the size of the tree + */ + size_t size() const { return (bv.size() + 1) / 2; } + + /** + * @brief Indicates if @p node is a leaf + */ + bool is_leaf(const LoudsNode& node) const { + return (node.pos + 1 == bv.size()) or bv[node.pos + 1]; + } + + /** + * @brief Indicates if @p node is a root + */ + bool is_root(const LoudsNode& node) const { return node.number == 0; } + + /** + * @brief Returns the number of children of a @p node + */ + size_t degree(const LoudsNode& node) const { + if (is_leaf(node)) { + return 0; + } + return bv.select(node.number + 2) - node.pos - 1; + } + + /** + * @brief Returns the i-th child of @p node + * Indexing starts at 0 + */ + LoudsNode child(const LoudsNode& node, size_t i) const { + size_t zeros = node.pos + i + 1 - node.number; + return LoudsNode(zeros, bv.select(zeros + 1)); + } + + /** + * @brief Returns first child of a @p node + */ + LoudsNode first_child(const LoudsNode& node) const { + size_t zeros = node.pos + 1 - node.number; + return LoudsNode(zeros, bv.select(zeros + 1)); + } + + /** + * @brief Returns the parent of a @p node if @p node is not root, + * else returns root + */ + LoudsNode parent(const LoudsNode& node) const { + if (node.number == 0) { + return root(); + } + size_t zero_pos = bv.select0(node.number); + size_t parent_number = zero_pos - node.number; + return LoudsNode(parent_number, bv.select(parent_number + 1)); + } + + /** + * @brief Indicates if @p node is last child + */ + bool is_last_child(const LoudsNode& node) const { + size_t zero_pos = bv.select0(node.number); + return bv[zero_pos + 1]; + } + + /** + * @brief Returns next sibling of a @p node + */ + LoudsNode next_sibling(const LoudsNode& node) const { + size_t sibling_number = node.number + 1; + return LoudsNode(sibling_number, bv.select(sibling_number + 1)); + } +}; + +} // namespace pixie diff --git a/include/utils.h b/include/utils.h new file mode 100644 index 0000000..b33d85f --- /dev/null +++ b/include/utils.h @@ -0,0 +1,146 @@ +#pragma once + +#include +#include +#include + +#include "louds_tree.h" + +using pixie::LoudsNode; + +std::vector> generate_random_tree(size_t tree_size, + std::mt19937_64& rng) { + if (tree_size == 0) { + return {}; + } + std::vector> adj(tree_size); + adj[0].push_back(0); + for (size_t i = 1; i < tree_size; i++) { + size_t parent = rng() % i; + adj[i].push_back(parent); + adj[parent].push_back(i); + } + return adj; +} + +std::vector> bfs_order( + size_t tree_size, + const std::vector>& adj) { + std::vector> bfs_adj(tree_size); + std::queue> q; + bfs_adj[0].push_back(0); + q.push({0, 0}); + int cnt = 1; + while (!q.empty()) { + size_t old_v = q.front().first; + size_t cur_v = q.front().second; + q.pop(); + for (size_t i = 1; i < adj[old_v].size(); i++) { + size_t old_u = adj[old_v][i]; + size_t cur_u = cnt++; + q.push({old_u, cur_u}); + bfs_adj[cur_u].push_back(cur_v); + bfs_adj[cur_v].push_back(cur_u); + } + } + return bfs_adj; +} + +std::vector adj_to_louds( + size_t tree_size, + const std::vector>& adj) { + size_t louds_size = tree_size * 2 - 1; + std::vector louds((louds_size + 63) / 64, 0); + size_t pos = 0; + for (size_t i = 0; i < tree_size; i++) { + louds[pos >> 6] = louds[pos >> 6] | (1ULL << (pos & 63)); + pos += adj[i].size(); + } + return louds; +} + +struct AdjListNode { + size_t number; +}; + +bool operator==(const AdjListNode& a, const LoudsNode& b) { + return a.number == b.number; +} + +bool operator==(const LoudsNode& b, const AdjListNode& a) { + return a.number == b.number; +} + +class AdjListTree { + private: + std::vector> adj; + + public: + /** + * @brief Constructor from adjacency list (root is 0) + */ + explicit AdjListTree(const std::vector>& adjacency_list) + : adj(adjacency_list) {} + + /** + * @brief Returns the root node + */ + AdjListNode root() const { return AdjListNode(0); } + + /** + * @brief Checks if @p node is a leaf + */ + bool is_leaf(const AdjListNode& node) const { + return adj[node.number].size() <= 1; + } + + /** + * @brief Checks if @p node is a root + */ + bool is_root(const AdjListNode& node) const { return node.number == 0; } + + /** + * @brief Returns the number of children of @p node + */ + size_t degree(const AdjListNode& node) const { + return adj[node.number].size() - 1; + } + + /** + * @brief Returns the i-th child of @p node + * Indexing starts at 0 + */ + AdjListNode child(const AdjListNode& node, size_t i) const { + return AdjListNode(adj[node.number][i + 1]); + } + + /** + * @brief Returns the first child of @p node + */ + AdjListNode first_child(const AdjListNode& node) const { + return AdjListNode(adj[node.number][1]); + } + + /** + * @brief Returns the parent of a @p node if @p node is not root, + * else returns root + */ + AdjListNode parent(const AdjListNode& node) const { + return AdjListNode(adj[node.number][0]); + } + + /** + * @brief Indicates if @p node is last child + */ + bool is_last_child(const AdjListNode& node) const { + size_t p = parent(node).number; + return adj[p].back() == node.number; + } + + /** + * @brief Returns next sibling of a @p node + */ + AdjListNode next_sibling(const AdjListNode& node) const { + return AdjListNode(node.number + 1); + } +}; diff --git a/report/data/dfs.txt b/report/data/dfs.txt new file mode 100644 index 0000000..713bb20 --- /dev/null +++ b/report/data/dfs.txt @@ -0,0 +1,36 @@ +BM_LoudsTreeDFS/tree_size:256/iterations:10 17158 ns 15960 ns 10 +BM_LoudsTreeDFS/tree_size:512/iterations:10 32205 ns 30170 ns 10 +BM_LoudsTreeDFS/tree_size:1024/iterations:10 64570 ns 60450 ns 10 +BM_LoudsTreeDFS/tree_size:2048/iterations:10 149284 ns 139960 ns 10 +BM_LoudsTreeDFS/tree_size:4096/iterations:10 306694 ns 287710 ns 10 +BM_LoudsTreeDFS/tree_size:8192/iterations:10 639481 ns 599620 ns 10 +BM_LoudsTreeDFS/tree_size:16384/iterations:10 1321981 ns 1239680 ns 10 +BM_LoudsTreeDFS/tree_size:32768/iterations:10 2621919 ns 2458610 ns 10 +BM_LoudsTreeDFS/tree_size:65536/iterations:10 5418396 ns 5081010 ns 10 +BM_LoudsTreeDFS/tree_size:131072/iterations:10 11477777 ns 10762480 ns 10 +BM_LoudsTreeDFS/tree_size:262144/iterations:10 22475017 ns 21071420 ns 10 +BM_LoudsTreeDFS/tree_size:524288/iterations:10 51507287 ns 48337760 ns 10 +BM_LoudsTreeDFS/tree_size:1048576/iterations:10 139215927 ns 130654490 ns 10 +BM_LoudsTreeDFS/tree_size:2097152/iterations:10 274188404 ns 257485140 ns 10 +BM_LoudsTreeDFS/tree_size:4194304/iterations:10 393504994 ns 379028300 ns 10 +BM_LoudsTreeDFS/tree_size:8388608/iterations:10 772261790 ns 742915900 ns 10 +BM_LoudsTreeDFS/tree_size:16777216/iterations:10 1924302888 ns 1847300630 ns 10 +BM_LoudsTreeDFS/tree_size:33554432/iterations:10 4033885540 ns 3862881520 ns 10 +BM_AdjListTreeDFS/tree_size:256/iterations:10 7004 ns 6500 ns 10 +BM_AdjListTreeDFS/tree_size:512/iterations:10 12709 ns 11890 ns 10 +BM_AdjListTreeDFS/tree_size:1024/iterations:10 27121 ns 25400 ns 10 +BM_AdjListTreeDFS/tree_size:2048/iterations:10 61831 ns 57990 ns 10 +BM_AdjListTreeDFS/tree_size:4096/iterations:10 173807 ns 163320 ns 10 +BM_AdjListTreeDFS/tree_size:8192/iterations:10 346847 ns 325820 ns 10 +BM_AdjListTreeDFS/tree_size:16384/iterations:10 697983 ns 655640 ns 10 +BM_AdjListTreeDFS/tree_size:32768/iterations:10 1422141 ns 1336020 ns 10 +BM_AdjListTreeDFS/tree_size:65536/iterations:10 2862014 ns 2688070 ns 10 +BM_AdjListTreeDFS/tree_size:131072/iterations:10 6096719 ns 5727260 ns 10 +BM_AdjListTreeDFS/tree_size:262144/iterations:10 13984236 ns 13137420 ns 10 +BM_AdjListTreeDFS/tree_size:524288/iterations:10 36108305 ns 33921960 ns 10 +BM_AdjListTreeDFS/tree_size:1048576/iterations:10 97158888 ns 91052080 ns 10 +BM_AdjListTreeDFS/tree_size:2097152/iterations:10 130849640 ns 122713210 ns 10 +BM_AdjListTreeDFS/tree_size:4194304/iterations:10 519976249 ns 492515650 ns 10 +BM_AdjListTreeDFS/tree_size:8388608/iterations:10 1076193064 ns 1019121630 ns 10 +BM_AdjListTreeDFS/tree_size:16777216/iterations:10 2151390243 ns 2036195610 ns 10 +BM_AdjListTreeDFS/tree_size:33554432/iterations:10 4386640878 ns 4148986820 ns 10 \ No newline at end of file diff --git a/report/data/dfs_comparison.png b/report/data/dfs_comparison.png new file mode 100644 index 0000000..6c115f7 Binary files /dev/null and b/report/data/dfs_comparison.png differ diff --git a/report/data/load_comparison_256.png b/report/data/load_comparison_256.png new file mode 100644 index 0000000..ecdc17d Binary files /dev/null and b/report/data/load_comparison_256.png differ diff --git a/report/data/load_comparison_512.png b/report/data/load_comparison_512.png new file mode 100644 index 0000000..8592604 Binary files /dev/null and b/report/data/load_comparison_512.png differ diff --git a/report/data/rank_select_bm.txt b/report/data/rank_select_bm.txt new file mode 100644 index 0000000..b6ec6aa --- /dev/null +++ b/report/data/rank_select_bm.txt @@ -0,0 +1,221 @@ +BM_RankInterleaved/n:8/iterations:100000 4.96 ns 4.61 ns 100000 +BM_RankInterleaved/n:16/iterations:100000 4.33 ns 4.03 ns 100000 +BM_RankInterleaved/n:64/iterations:100000 4.16 ns 3.87 ns 100000 +BM_RankInterleaved/n:256/iterations:100000 14.1 ns 13.1 ns 100000 +BM_RankInterleaved/n:1024/iterations:100000 16.2 ns 15.1 ns 100000 +BM_RankInterleaved/n:4096/iterations:100000 16.0 ns 14.9 ns 100000 +BM_RankInterleaved/n:16384/iterations:100000 15.9 ns 14.8 ns 100000 +BM_RankInterleaved/n:65536/iterations:100000 16.7 ns 15.3 ns 100000 +BM_RankInterleaved/n:262144/iterations:100000 16.0 ns 14.9 ns 100000 +BM_RankInterleaved/n:1048576/iterations:100000 15.9 ns 14.8 ns 100000 +BM_RankInterleaved/n:4194304/iterations:100000 16.2 ns 15.1 ns 100000 +BM_RankInterleaved/n:16777216/iterations:100000 19.1 ns 17.6 ns 100000 +BM_RankInterleaved/n:67108864/iterations:100000 35.9 ns 33.4 ns 100000 +BM_RankInterleaved/n:268435456/iterations:100000 39.4 ns 36.7 ns 100000 +BM_RankInterleaved/n:1073741824/iterations:100000 47.2 ns 43.9 ns 100000 +BM_RankInterleaved/n:4294967296/iterations:100000 55.6 ns 51.7 ns 100000 +BM_RankInterleaved/n:17179869184/iterations:100000 90.0 ns 84.1 ns 100000 +BM_RankNonInterleaved/n:8/iterations:100000 3.27 ns 3.05 ns 100000 +BM_RankNonInterleaved/n:16/iterations:100000 3.32 ns 3.10 ns 100000 +BM_RankNonInterleaved/n:64/iterations:100000 3.24 ns 3.03 ns 100000 +BM_RankNonInterleaved/n:256/iterations:100000 12.2 ns 11.4 ns 100000 +BM_RankNonInterleaved/n:1024/iterations:100000 14.3 ns 13.4 ns 100000 +BM_RankNonInterleaved/n:4096/iterations:100000 13.9 ns 13.0 ns 100000 +BM_RankNonInterleaved/n:16384/iterations:100000 13.8 ns 12.9 ns 100000 +BM_RankNonInterleaved/n:65536/iterations:100000 13.8 ns 12.9 ns 100000 +BM_RankNonInterleaved/n:262144/iterations:100000 13.6 ns 12.7 ns 100000 +BM_RankNonInterleaved/n:1048576/iterations:100000 13.6 ns 12.7 ns 100000 +BM_RankNonInterleaved/n:4194304/iterations:100000 18.0 ns 16.8 ns 100000 +BM_RankNonInterleaved/n:16777216/iterations:100000 19.3 ns 18.1 ns 100000 +BM_RankNonInterleaved/n:67108864/iterations:100000 29.8 ns 27.9 ns 100000 +BM_RankNonInterleaved/n:268435456/iterations:100000 39.9 ns 37.3 ns 100000 +BM_RankNonInterleaved/n:1073741824/iterations:100000 45.3 ns 42.4 ns 100000 +BM_RankNonInterleaved/n:4294967296/iterations:100000 70.1 ns 65.6 ns 100000 +BM_RankNonInterleaved/n:17179869184/iterations:100000 87.1 ns 81.4 ns 100000 +BM_RankZeroNonInterleaved/n:8/iterations:100000 3.72 ns 3.47 ns 100000 +BM_RankZeroNonInterleaved/n:16/iterations:100000 3.73 ns 3.49 ns 100000 +BM_RankZeroNonInterleaved/n:64/iterations:100000 3.66 ns 3.42 ns 100000 +BM_RankZeroNonInterleaved/n:256/iterations:100000 12.5 ns 11.7 ns 100000 +BM_RankZeroNonInterleaved/n:1024/iterations:100000 13.7 ns 12.8 ns 100000 +BM_RankZeroNonInterleaved/n:4096/iterations:100000 13.6 ns 12.7 ns 100000 +BM_RankZeroNonInterleaved/n:16384/iterations:100000 13.7 ns 12.8 ns 100000 +BM_RankZeroNonInterleaved/n:65536/iterations:100000 13.4 ns 12.6 ns 100000 +BM_RankZeroNonInterleaved/n:262144/iterations:100000 13.3 ns 12.4 ns 100000 +BM_RankZeroNonInterleaved/n:1048576/iterations:100000 13.2 ns 12.4 ns 100000 +BM_RankZeroNonInterleaved/n:4194304/iterations:100000 13.5 ns 12.6 ns 100000 +BM_RankZeroNonInterleaved/n:16777216/iterations:100000 26.1 ns 24.4 ns 100000 +BM_RankZeroNonInterleaved/n:67108864/iterations:100000 15.5 ns 14.5 ns 100000 +BM_RankZeroNonInterleaved/n:268435456/iterations:100000 37.1 ns 34.7 ns 100000 +BM_RankZeroNonInterleaved/n:1073741824/iterations:100000 49.7 ns 46.5 ns 100000 +BM_RankZeroNonInterleaved/n:4294967296/iterations:100000 78.5 ns 73.5 ns 100000 +BM_RankZeroNonInterleaved/n:17179869184/iterations:100000 78.4 ns 73.6 ns 100000 +BM_SelectNonInterleaved/n:8/iterations:100000 12.4 ns 11.6 ns 100000 +BM_SelectNonInterleaved/n:16/iterations:100000 12.4 ns 11.7 ns 100000 +BM_SelectNonInterleaved/n:64/iterations:100000 16.3 ns 15.3 ns 100000 +BM_SelectNonInterleaved/n:256/iterations:100000 28.8 ns 26.7 ns 100000 +BM_SelectNonInterleaved/n:1024/iterations:100000 33.3 ns 31.2 ns 100000 +BM_SelectNonInterleaved/n:4096/iterations:100000 31.6 ns 29.6 ns 100000 +BM_SelectNonInterleaved/n:16384/iterations:100000 40.7 ns 38.2 ns 100000 +BM_SelectNonInterleaved/n:65536/iterations:100000 35.6 ns 33.4 ns 100000 +BM_SelectNonInterleaved/n:262144/iterations:100000 37.7 ns 35.4 ns 100000 +BM_SelectNonInterleaved/n:1048576/iterations:100000 44.6 ns 41.9 ns 100000 +BM_SelectNonInterleaved/n:4194304/iterations:100000 52.3 ns 49.1 ns 100000 +BM_SelectNonInterleaved/n:16777216/iterations:100000 60.8 ns 57.1 ns 100000 +BM_SelectNonInterleaved/n:67108864/iterations:100000 92.0 ns 86.4 ns 100000 +BM_SelectNonInterleaved/n:268435456/iterations:100000 120 ns 112 ns 100000 +BM_SelectNonInterleaved/n:1073741824/iterations:100000 158 ns 149 ns 100000 +BM_SelectNonInterleaved/n:4294967296/iterations:100000 275 ns 259 ns 100000 +BM_SelectNonInterleaved/n:17179869184/iterations:100000 333 ns 313 ns 100000 +BM_SelectZeroNonInterleaved/n:8/iterations:100000 13.4 ns 12.6 ns 100000 +BM_SelectZeroNonInterleaved/n:16/iterations:100000 13.7 ns 12.9 ns 100000 +BM_SelectZeroNonInterleaved/n:64/iterations:100000 14.3 ns 13.5 ns 100000 +BM_SelectZeroNonInterleaved/n:256/iterations:100000 2.87 ns 2.69 ns 100000 +BM_SelectZeroNonInterleaved/n:1024/iterations:100000 22.4 ns 21.1 ns 100000 +BM_SelectZeroNonInterleaved/n:4096/iterations:100000 30.7 ns 28.8 ns 100000 +BM_SelectZeroNonInterleaved/n:16384/iterations:100000 32.7 ns 30.7 ns 100000 +BM_SelectZeroNonInterleaved/n:65536/iterations:100000 44.9 ns 42.2 ns 100000 +BM_SelectZeroNonInterleaved/n:262144/iterations:100000 37.8 ns 35.5 ns 100000 +BM_SelectZeroNonInterleaved/n:1048576/iterations:100000 39.0 ns 36.6 ns 100000 +BM_SelectZeroNonInterleaved/n:4194304/iterations:100000 45.0 ns 42.2 ns 100000 +BM_SelectZeroNonInterleaved/n:16777216/iterations:100000 54.0 ns 50.7 ns 100000 +BM_SelectZeroNonInterleaved/n:67108864/iterations:100000 81.6 ns 76.7 ns 100000 +BM_SelectZeroNonInterleaved/n:268435456/iterations:100000 118 ns 111 ns 100000 +BM_SelectZeroNonInterleaved/n:1073741824/iterations:100000 156 ns 146 ns 100000 +BM_SelectZeroNonInterleaved/n:4294967296/iterations:100000 212 ns 200 ns 100000 +BM_SelectZeroNonInterleaved/n:17179869184/iterations:100000 345 ns 324 ns 100000 +BM_RankNonInterleaved10PercentFill/n:8/iterations:100000 3.33 ns 3.11 ns 100000 +BM_RankNonInterleaved10PercentFill/n:16/iterations:100000 3.50 ns 3.29 ns 100000 +BM_RankNonInterleaved10PercentFill/n:64/iterations:100000 3.73 ns 3.51 ns 100000 +BM_RankNonInterleaved10PercentFill/n:256/iterations:100000 12.2 ns 11.5 ns 100000 +BM_RankNonInterleaved10PercentFill/n:1024/iterations:100000 13.9 ns 13.0 ns 100000 +BM_RankNonInterleaved10PercentFill/n:4096/iterations:100000 13.9 ns 13.1 ns 100000 +BM_RankNonInterleaved10PercentFill/n:16384/iterations:100000 13.7 ns 12.8 ns 100000 +BM_RankNonInterleaved10PercentFill/n:65536/iterations:100000 14.8 ns 13.9 ns 100000 +BM_RankNonInterleaved10PercentFill/n:262144/iterations:100000 14.2 ns 13.4 ns 100000 +BM_RankNonInterleaved10PercentFill/n:1048576/iterations:100000 13.9 ns 13.0 ns 100000 +BM_RankNonInterleaved10PercentFill/n:4194304/iterations:100000 14.0 ns 13.1 ns 100000 +BM_RankNonInterleaved10PercentFill/n:16777216/iterations:100000 14.7 ns 13.8 ns 100000 +BM_RankNonInterleaved10PercentFill/n:67108864/iterations:100000 23.7 ns 22.3 ns 100000 +BM_RankNonInterleaved10PercentFill/n:268435456/iterations:100000 37.3 ns 35.0 ns 100000 +BM_RankNonInterleaved10PercentFill/n:1073741824/iterations:100000 43.7 ns 41.0 ns 100000 +BM_RankNonInterleaved10PercentFill/n:4294967296/iterations:100000 83.2 ns 77.5 ns 100000 +BM_RankNonInterleaved10PercentFill/n:17179869184/iterations:100000 83.9 ns 78.7 ns 100000 +BM_RankZeroNonInterleaved10PercentFill/n:8/iterations:100000 3.71 ns 3.46 ns 100000 +BM_RankZeroNonInterleaved10PercentFill/n:16/iterations:100000 3.54 ns 3.32 ns 100000 +BM_RankZeroNonInterleaved10PercentFill/n:64/iterations:100000 3.53 ns 3.31 ns 100000 +BM_RankZeroNonInterleaved10PercentFill/n:256/iterations:100000 12.1 ns 11.4 ns 100000 +BM_RankZeroNonInterleaved10PercentFill/n:1024/iterations:100000 14.0 ns 13.1 ns 100000 +BM_RankZeroNonInterleaved10PercentFill/n:4096/iterations:100000 13.7 ns 12.8 ns 100000 +BM_RankZeroNonInterleaved10PercentFill/n:16384/iterations:100000 13.5 ns 12.6 ns 100000 +BM_RankZeroNonInterleaved10PercentFill/n:65536/iterations:100000 13.3 ns 12.5 ns 100000 +BM_RankZeroNonInterleaved10PercentFill/n:262144/iterations:100000 13.2 ns 12.3 ns 100000 +BM_RankZeroNonInterleaved10PercentFill/n:1048576/iterations:100000 13.2 ns 12.3 ns 100000 +BM_RankZeroNonInterleaved10PercentFill/n:4194304/iterations:100000 13.7 ns 12.8 ns 100000 +BM_RankZeroNonInterleaved10PercentFill/n:16777216/iterations:100000 14.3 ns 13.4 ns 100000 +BM_RankZeroNonInterleaved10PercentFill/n:67108864/iterations:100000 15.4 ns 14.4 ns 100000 +BM_RankZeroNonInterleaved10PercentFill/n:268435456/iterations:100000 36.4 ns 34.1 ns 100000 +BM_RankZeroNonInterleaved10PercentFill/n:1073741824/iterations:100000 40.6 ns 37.8 ns 100000 +BM_RankZeroNonInterleaved10PercentFill/n:4294967296/iterations:100000 53.0 ns 49.7 ns 100000 +BM_RankZeroNonInterleaved10PercentFill/n:17179869184/iterations:100000 83.9 ns 78.1 ns 100000 +BM_SelectNonInterleaved10PercentFill/n:8/iterations:100000 2.84 ns 2.63 ns 100000 +BM_SelectNonInterleaved10PercentFill/n:16/iterations:100000 13.0 ns 12.1 ns 100000 +BM_SelectNonInterleaved10PercentFill/n:64/iterations:100000 16.3 ns 15.1 ns 100000 +BM_SelectNonInterleaved10PercentFill/n:256/iterations:100000 26.2 ns 24.4 ns 100000 +BM_SelectNonInterleaved10PercentFill/n:1024/iterations:100000 33.1 ns 30.8 ns 100000 +BM_SelectNonInterleaved10PercentFill/n:4096/iterations:100000 36.0 ns 33.5 ns 100000 +BM_SelectNonInterleaved10PercentFill/n:16384/iterations:100000 33.8 ns 31.5 ns 100000 +BM_SelectNonInterleaved10PercentFill/n:65536/iterations:100000 35.1 ns 32.7 ns 100000 +BM_SelectNonInterleaved10PercentFill/n:262144/iterations:100000 34.9 ns 32.5 ns 100000 +BM_SelectNonInterleaved10PercentFill/n:1048576/iterations:100000 37.3 ns 34.7 ns 100000 +BM_SelectNonInterleaved10PercentFill/n:4194304/iterations:100000 41.8 ns 38.9 ns 100000 +BM_SelectNonInterleaved10PercentFill/n:16777216/iterations:100000 47.6 ns 44.3 ns 100000 +BM_SelectNonInterleaved10PercentFill/n:67108864/iterations:100000 57.6 ns 53.6 ns 100000 +BM_SelectNonInterleaved10PercentFill/n:268435456/iterations:100000 105 ns 98.3 ns 100000 +BM_SelectNonInterleaved10PercentFill/n:1073741824/iterations:100000 159 ns 149 ns 100000 +BM_SelectNonInterleaved10PercentFill/n:4294967296/iterations:100000 206 ns 193 ns 100000 +BM_SelectNonInterleaved10PercentFill/n:17179869184/iterations:100000 296 ns 278 ns 100000 +BM_SelectZeroNonInterleaved10PercentFill/n:8/iterations:100000 13.8 ns 13.0 ns 100000 +BM_SelectZeroNonInterleaved10PercentFill/n:16/iterations:100000 14.3 ns 13.4 ns 100000 +BM_SelectZeroNonInterleaved10PercentFill/n:64/iterations:100000 14.1 ns 13.3 ns 100000 +BM_SelectZeroNonInterleaved10PercentFill/n:256/iterations:100000 28.8 ns 27.1 ns 100000 +BM_SelectZeroNonInterleaved10PercentFill/n:1024/iterations:100000 34.5 ns 32.3 ns 100000 +BM_SelectZeroNonInterleaved10PercentFill/n:4096/iterations:100000 32.7 ns 30.8 ns 100000 +BM_SelectZeroNonInterleaved10PercentFill/n:16384/iterations:100000 45.0 ns 42.2 ns 100000 +BM_SelectZeroNonInterleaved10PercentFill/n:65536/iterations:100000 38.0 ns 35.8 ns 100000 +BM_SelectZeroNonInterleaved10PercentFill/n:262144/iterations:100000 38.3 ns 36.0 ns 100000 +BM_SelectZeroNonInterleaved10PercentFill/n:1048576/iterations:100000 39.4 ns 37.0 ns 100000 +BM_SelectZeroNonInterleaved10PercentFill/n:4194304/iterations:100000 45.0 ns 42.3 ns 100000 +BM_SelectZeroNonInterleaved10PercentFill/n:16777216/iterations:100000 50.1 ns 47.0 ns 100000 +BM_SelectZeroNonInterleaved10PercentFill/n:67108864/iterations:100000 58.9 ns 55.4 ns 100000 +BM_SelectZeroNonInterleaved10PercentFill/n:268435456/iterations:100000 118 ns 110 ns 100000 +BM_SelectZeroNonInterleaved10PercentFill/n:1073741824/iterations:100000 169 ns 159 ns 100000 +BM_SelectZeroNonInterleaved10PercentFill/n:4294967296/iterations:100000 278 ns 261 ns 100000 +BM_SelectZeroNonInterleaved10PercentFill/n:17179869184/iterations:100000 326 ns 305 ns 100000 +BM_RankNonInterleaved90PercentFill/n:8/iterations:100000 3.41 ns 3.18 ns 100000 +BM_RankNonInterleaved90PercentFill/n:16/iterations:100000 3.28 ns 3.07 ns 100000 +BM_RankNonInterleaved90PercentFill/n:64/iterations:100000 3.29 ns 3.08 ns 100000 +BM_RankNonInterleaved90PercentFill/n:256/iterations:100000 12.4 ns 11.6 ns 100000 +BM_RankNonInterleaved90PercentFill/n:1024/iterations:100000 14.2 ns 13.2 ns 100000 +BM_RankNonInterleaved90PercentFill/n:4096/iterations:100000 13.8 ns 12.9 ns 100000 +BM_RankNonInterleaved90PercentFill/n:16384/iterations:100000 14.1 ns 13.2 ns 100000 +BM_RankNonInterleaved90PercentFill/n:65536/iterations:100000 13.6 ns 12.8 ns 100000 +BM_RankNonInterleaved90PercentFill/n:262144/iterations:100000 14.1 ns 13.2 ns 100000 +BM_RankNonInterleaved90PercentFill/n:1048576/iterations:100000 20.6 ns 19.3 ns 100000 +BM_RankNonInterleaved90PercentFill/n:4194304/iterations:100000 14.1 ns 13.2 ns 100000 +BM_RankNonInterleaved90PercentFill/n:16777216/iterations:100000 14.5 ns 13.6 ns 100000 +BM_RankNonInterleaved90PercentFill/n:67108864/iterations:100000 15.2 ns 14.3 ns 100000 +BM_RankNonInterleaved90PercentFill/n:268435456/iterations:100000 35.3 ns 33.1 ns 100000 +BM_RankNonInterleaved90PercentFill/n:1073741824/iterations:100000 52.6 ns 49.4 ns 100000 +BM_RankNonInterleaved90PercentFill/n:4294967296/iterations:100000 59.1 ns 55.0 ns 100000 +BM_RankNonInterleaved90PercentFill/n:17179869184/iterations:100000 83.8 ns 77.8 ns 100000 +BM_RankZeroNonInterleaved90PercentFill/n:8/iterations:100000 3.70 ns 3.43 ns 100000 +BM_RankZeroNonInterleaved90PercentFill/n:16/iterations:100000 3.96 ns 3.68 ns 100000 +BM_RankZeroNonInterleaved90PercentFill/n:64/iterations:100000 3.79 ns 3.53 ns 100000 +BM_RankZeroNonInterleaved90PercentFill/n:256/iterations:100000 12.5 ns 11.6 ns 100000 +BM_RankZeroNonInterleaved90PercentFill/n:1024/iterations:100000 14.4 ns 13.4 ns 100000 +BM_RankZeroNonInterleaved90PercentFill/n:4096/iterations:100000 14.2 ns 13.2 ns 100000 +BM_RankZeroNonInterleaved90PercentFill/n:16384/iterations:100000 14.1 ns 13.1 ns 100000 +BM_RankZeroNonInterleaved90PercentFill/n:65536/iterations:100000 13.9 ns 12.9 ns 100000 +BM_RankZeroNonInterleaved90PercentFill/n:262144/iterations:100000 14.0 ns 13.0 ns 100000 +BM_RankZeroNonInterleaved90PercentFill/n:1048576/iterations:100000 14.4 ns 13.4 ns 100000 +BM_RankZeroNonInterleaved90PercentFill/n:4194304/iterations:100000 14.6 ns 13.5 ns 100000 +BM_RankZeroNonInterleaved90PercentFill/n:16777216/iterations:100000 15.1 ns 14.0 ns 100000 +BM_RankZeroNonInterleaved90PercentFill/n:67108864/iterations:100000 15.7 ns 14.5 ns 100000 +BM_RankZeroNonInterleaved90PercentFill/n:268435456/iterations:100000 182 ns 170 ns 100000 +BM_RankZeroNonInterleaved90PercentFill/n:1073741824/iterations:100000 42.0 ns 39.3 ns 100000 +BM_RankZeroNonInterleaved90PercentFill/n:4294967296/iterations:100000 60.4 ns 56.1 ns 100000 +BM_RankZeroNonInterleaved90PercentFill/n:17179869184/iterations:100000 86.5 ns 80.7 ns 100000 +BM_SelectNonInterleaved90PercentFill/n:8/iterations:100000 18.0 ns 16.8 ns 100000 +BM_SelectNonInterleaved90PercentFill/n:16/iterations:100000 19.2 ns 17.9 ns 100000 +BM_SelectNonInterleaved90PercentFill/n:64/iterations:100000 19.2 ns 17.9 ns 100000 +BM_SelectNonInterleaved90PercentFill/n:256/iterations:100000 28.3 ns 26.4 ns 100000 +BM_SelectNonInterleaved90PercentFill/n:1024/iterations:100000 34.1 ns 31.8 ns 100000 +BM_SelectNonInterleaved90PercentFill/n:4096/iterations:100000 33.3 ns 31.0 ns 100000 +BM_SelectNonInterleaved90PercentFill/n:16384/iterations:100000 34.2 ns 31.9 ns 100000 +BM_SelectNonInterleaved90PercentFill/n:65536/iterations:100000 42.5 ns 39.7 ns 100000 +BM_SelectNonInterleaved90PercentFill/n:262144/iterations:100000 36.4 ns 33.9 ns 100000 +BM_SelectNonInterleaved90PercentFill/n:1048576/iterations:100000 37.6 ns 35.1 ns 100000 +BM_SelectNonInterleaved90PercentFill/n:4194304/iterations:100000 41.7 ns 38.9 ns 100000 +BM_SelectNonInterleaved90PercentFill/n:16777216/iterations:100000 46.8 ns 43.7 ns 100000 +BM_SelectNonInterleaved90PercentFill/n:67108864/iterations:100000 89.7 ns 83.6 ns 100000 +BM_SelectNonInterleaved90PercentFill/n:268435456/iterations:100000 112 ns 105 ns 100000 +BM_SelectNonInterleaved90PercentFill/n:1073741824/iterations:100000 162 ns 150 ns 100000 +BM_SelectNonInterleaved90PercentFill/n:4294967296/iterations:100000 246 ns 229 ns 100000 +BM_SelectNonInterleaved90PercentFill/n:17179869184/iterations:100000 308 ns 288 ns 100000 +BM_SelectZeroNonInterleaved90PercentFill/n:8/iterations:100000 13.6 ns 12.7 ns 100000 +BM_SelectZeroNonInterleaved90PercentFill/n:16/iterations:100000 15.9 ns 14.9 ns 100000 +BM_SelectZeroNonInterleaved90PercentFill/n:64/iterations:100000 16.8 ns 15.8 ns 100000 +BM_SelectZeroNonInterleaved90PercentFill/n:256/iterations:100000 24.8 ns 23.3 ns 100000 +BM_SelectZeroNonInterleaved90PercentFill/n:1024/iterations:100000 32.2 ns 30.2 ns 100000 +BM_SelectZeroNonInterleaved90PercentFill/n:4096/iterations:100000 31.9 ns 29.9 ns 100000 +BM_SelectZeroNonInterleaved90PercentFill/n:16384/iterations:100000 39.6 ns 37.1 ns 100000 +BM_SelectZeroNonInterleaved90PercentFill/n:65536/iterations:100000 35.0 ns 32.8 ns 100000 +BM_SelectZeroNonInterleaved90PercentFill/n:262144/iterations:100000 34.6 ns 32.4 ns 100000 +BM_SelectZeroNonInterleaved90PercentFill/n:1048576/iterations:100000 35.6 ns 33.4 ns 100000 +BM_SelectZeroNonInterleaved90PercentFill/n:4194304/iterations:100000 38.7 ns 36.2 ns 100000 +BM_SelectZeroNonInterleaved90PercentFill/n:16777216/iterations:100000 45.5 ns 42.6 ns 100000 +BM_SelectZeroNonInterleaved90PercentFill/n:67108864/iterations:100000 59.0 ns 55.3 ns 100000 +BM_SelectZeroNonInterleaved90PercentFill/n:268435456/iterations:100000 104 ns 97.0 ns 100000 +BM_SelectZeroNonInterleaved90PercentFill/n:1073741824/iterations:100000 148 ns 139 ns 100000 +BM_SelectZeroNonInterleaved90PercentFill/n:4294967296/iterations:100000 199 ns 186 ns 100000 +BM_SelectZeroNonInterleaved90PercentFill/n:17179869184/iterations:100000 277 ns 260 ns 100000 \ No newline at end of file diff --git a/report/data/select_bm_10.png b/report/data/select_bm_10.png new file mode 100644 index 0000000..c2d1dd7 Binary files /dev/null and b/report/data/select_bm_10.png differ diff --git a/report/data/select_bm_50.png b/report/data/select_bm_50.png new file mode 100644 index 0000000..7373b95 Binary files /dev/null and b/report/data/select_bm_50.png differ diff --git a/report/data/select_bm_90.png b/report/data/select_bm_90.png new file mode 100644 index 0000000..ac1d31c Binary files /dev/null and b/report/data/select_bm_90.png differ diff --git a/report/data/store_comparison_256.png b/report/data/store_comparison_256.png new file mode 100644 index 0000000..6477658 Binary files /dev/null and b/report/data/store_comparison_256.png differ diff --git a/report/data/store_comparison_512.png b/report/data/store_comparison_512.png new file mode 100644 index 0000000..2ae7f51 Binary files /dev/null and b/report/data/store_comparison_512.png differ diff --git a/report/report.md b/report/report.md new file mode 100644 index 0000000..c2faf1b --- /dev/null +++ b/report/report.md @@ -0,0 +1,124 @@ +# Отчет по семестровому проекту + +## План + +1. Aligned/unaligned инструкции +2. Rank0/Select0 +3. LOUDS Tree + +## Aligned/unaligned инструкции + +### Мотивация + +Текущая реализация бит-вектора использует инструкции ```loadu/storeu```, поддерживающие невыровненные по размеру регистров адреса. Теоретически, невыровненные инструкции могут работать медленнее выровненных, так как запрос может пересекать границу кэш-линии. Попробуем замерить и сравнить время работы для различных случаев. + +### Эксперименты для 256-битных инструкций + +Стандартный размер кэш-линии у большинства процессоров равен 64-м байтам (512 битам). +Для блока из 256 бит возможны следующие варианты: +1. Адрес выровнен +2. Адрес невыровнен, блок полность попадает в одну кэш-линию +3. Адрес невыровнен, блок перескает границу кэш-линий + +Ниже приведены графики времени выполнения одной 256-битной инструкции на случайном адресе в зависмости от размера массива. + +![](data/load_comparison_256.png "") +![](data/store_comparison_256.png "") + +Замеры выполнялись на процессоре AMD Ryzen 5 5600H 6 Cores / 12 Threads. +Размеры кэшей: +* L1 Data 32 KiB (x6) +* L1 Instruction 32 KiB (x6) +* L2 Unified 512 KiB (x6) +* L3 Unified 16384 KiB (x1) + +Для ```load/loadu``` инструкций существенных различий не наблюдатеся, все различия в пределах 1.5 нс. +Для ```store/storeu``` аналогично, кроме ситуации с пересечением границы кэш-линии, в этом случае, как только массив перестает помещаться в L3 (221 байт), операция сильно замедляется. + +### Эксперименты для 512-битных инструкций + +Для блока из 512 бит возможны следующие варианты: +1. Адрес выровнен +2. Адрес невыровнен, блок перескает границу кэш-линий + +![](data/load_comparison_512.png "") +![](data/store_comparison_512.png "") + +Замеры выполнялись на процессоре ... 6 Cores / 12 Threads. +Размеры кэшей: +* L1 Data 32 KiB (x8) +* L1 Instruction 32 KiB (x8) +* L2 Unified 1024 KiB (x8) +* L3 Unified 16384 KiB (x1) + +Для ```load/loadu``` инструкций различия в пределах 3 нс. +Для ```store/storeu``` снова явное замедление при пересечении границы кэш-линий. + +### Выводы + +По наблюдаемым результатм можно сделать вывод, что оптимальнее использовать выровненные инструкции. +В реализации бит-вектора ```storeu``` использовалась только в одной функции ```rank_512``` и была заменена на выровненную версию. +Замена ```loadu``` на ```load``` требует изменения логики всего алгоритма без потери эффективности, пока что отложена. + +## Rank0/Select0 + +### Выбор реализации + +```rank0``` реализуется тривиально: ```rank0(pos) = pos - rank(pos)```. +C ```select0``` несколько сложнее: +1. Можно хранить копии массивов предподсчитаных значений для инвертированного исходного массива, такой подход удвоит текущий 3.61% оверхэд. +2. Согласно статье "Engineering Compact Data Structures for Rank and Select Queries on Bit Vectors" Florian Kurpicz +можно реализовать вторую операцию (в нашем случае ```select0```) без дополнительной памяти но с замедлением в ~1.5 раза. +3. Можно дополнительно предпосчитать позиции нулей с частатой ```kSelectSampleFrequency```, как это было сделано для единиц и ```select```. Тогда оверхэд увеличится всего на 0.4%, но ```select0``` будет работать медленнее ```select```. + +Был выбран третий вариант, так как количество дополнительной памяти и, как окажется далее, замедленее ```select0``` незначительные. + +### Реализация + +Реализация ```select0``` почти полностью повторяет ```select```, основное отличие в том, что для базовых и супер блоков на префиксах предподсчитаны количества единиц, а нужны количества нулей. Нужно учитывать это при обращении к блокам, предварительно вычитая ранк из длины префикса. Для битовых операций ```lower_bound_*```, при выполнении которых одна инструкция затрагивает сразу несколько блоков, созданы копии ```lower_bound_dlt_*```, учитывающие предварительное вычитание и принимающие на вход позицию первого блока и предподсчитанную арифметическую прогрессию с шагом в размер блока. +Для нового функционала реализованы тесты и добавлены в CI. Для ```rank0/select0``` реализованы бенчмарки. + +### Время работы + +Ниже приведены графики времени выполнения различных операциий в зависмости от размера массива с различными процентами заполнения (количеством единиц). Замеры выполнялись на процессоре AMD Ryzen 5 5600H 6 Cores / 12 Threads с поддержкой AVX2 (нужно также замерить для AVX512). +Размеры кэшей: +* L1 Data 32 KiB (x6) +* L1 Instruction 32 KiB (x6) +* L2 Unified 512 KiB (x6) +* L3 Unified 16384 KiB (x1) + +![](data/select_bm_50.png "") +![](data/select_bm_10.png "") +![](data/select_bm_90.png "") + +По графикам можно сказать, что обе операции работают примерно одинаково. Также видна прямая зависимость времени выполнения ```select/select0``` от количества единичных/нулевых битов соответсвенно, которая является следствием реалиации. + +## LOUDS Tree + +### Реализация + +Level Order Unary Degree Sequence - сжатое представление дерева в виде последовательности битов длины 2n-1. +Идея взята из [sdsl](https://github.com/simongog/sdsl-lite/blob/master/include/sdsl/louds_tree.hpp). +Дерево представляется в порядке обхода в ширину: для каждого узла записываем единичный бит, за которым следует столько нулей, сколько у этого узла детей. +Для вершин создан отдельный класс ```LoudsNode```, хранящий номер вершины и ее позицию в LOUDS представлении. Это позволяет реализовать все основные операции для навигации с помощью ```select/select0```. +Реализованы две дополнительные операции, которых нет в [sdsl](https://github.com/simongog/sdsl-lite/blob/master/include/sdsl/louds_tree.hpp): +* node next_sibling(node) - Возвращает следующего брата (сына отца в порядке BFS). +* bool is_last_child(node) - Проверяет, является ли текущая вершина последним сыном в порядке BFS среди братьев. + +Они позволяют обходить дерево в порядке DFS с $O(1)$ дополнительной памяти. +Реализованы тесты и бенчмарки, а также дерево на списке смежности для них. + +### Сравнение со стандартной реализацией + +Будем сравнивать LOUDS реализацию дерева с реализацией на списке смежности. Для каждого размера строим 10 случайных деревьев и берем среднее время их обхода. +Замеры выполнялись на процессоре AMD Ryzen 5 5600H 6 Cores / 12 Threads с поддержкой AVX2 (нужно также замерить для AVX512). +Размеры кэшей: +* L1 Data 32 KiB (x6) +* L1 Instruction 32 KiB (x6) +* L2 Unified 512 KiB (x6) +* L3 Unified 16384 KiB (x1) + +![](data/dfs_comparison.png "") + +По графику можно сказать, что LOUDS работает не хуже. При росте размера дерева LOUDS немного выигрывает у стандартной реализации, что объясняется меньшим количеством кэш-промахов. +Также по полученным результатам можно оценить эффективность бит-вектора, до этого момента ее замеряли только на равномерно распределнных данных. \ No newline at end of file diff --git a/src/benchmark_tests.cpp b/src/benchmark_tests.cpp index 1233a3c..2c302f4 100644 --- a/src/benchmark_tests.cpp +++ b/src/benchmark_tests.cpp @@ -10,7 +10,7 @@ using pixie::BitVector; using pixie::BitVectorInterleaved; TEST(BitVectorBenchmarkTest, SelectNonInterleaved10PercentFill) { - for (size_t n = 4; n <= (1ull << 34); n <<= 2) { + for (size_t n = 8; n <= (1ull << 34); n <<= 2) { std::mt19937_64 rng(42); std::vector bits(((8 + n / 64) / 8) * 8); @@ -30,8 +30,29 @@ TEST(BitVectorBenchmarkTest, SelectNonInterleaved10PercentFill) { } } +TEST(BitVectorBenchmarkTest, SelectZeroNonInterleaved10PercentFill) { + for (size_t n = 8; n <= (1ull << 34); n <<= 2) { + std::mt19937_64 rng(42); + + std::vector bits(((8 + n / 64) / 8) * 8); + size_t num_ones = n * 0.1; + for (int i = 0; i < num_ones; i++) { + uint64_t pos = rng() % n; + bits[pos / 64] |= (1ULL << pos % 64); + } + + pixie::BitVector bv(bits, n); + + auto max_rank0 = bv.rank0(bv.size()) + 1; + for (int i = 0; i < 100000; i++) { + uint64_t rank0 = rng() % max_rank0; + bv.select0(rank0); + } + } +} + TEST(BitVectorBenchmarkTest, SelectNonInterleaved90PercentFill) { - for (size_t n = 4; n <= (1ull << 34); n <<= 2) { + for (size_t n = 8; n <= (1ull << 34); n <<= 2) { std::mt19937_64 rng(42); std::vector bits(((8 + n / 64) / 8) * 8); @@ -51,8 +72,29 @@ TEST(BitVectorBenchmarkTest, SelectNonInterleaved90PercentFill) { } } +TEST(BitVectorBenchmarkTest, SelectZeroNonInterleaved90PercentFill) { + for (size_t n = 8; n <= (1ull << 34); n <<= 2) { + std::mt19937_64 rng(42); + + std::vector bits(((8 + n / 64) / 8) * 8); + size_t num_ones = n * 0.9; + for (int i = 0; i < num_ones; i++) { + uint64_t pos = rng() % n; + bits[pos / 64] |= (1ULL << pos % 64); + } + + pixie::BitVector bv(bits, n); + + auto max_rank0 = bv.rank0(bv.size()) + 1; + for (int i = 0; i < 100000; i++) { + uint64_t rank0 = rng() % max_rank0; + bv.select0(rank0); + } + } +} + TEST(BitVectorBenchmarkTest, SelectNonInterleaved) { - for (size_t n = 4; n <= (1ull << 34); n <<= 2) { + for (size_t n = 8; n <= (1ull << 34); n <<= 2) { std::mt19937_64 rng(42); std::vector bits(((8 + n / 64) / 8) * 8); @@ -69,3 +111,22 @@ TEST(BitVectorBenchmarkTest, SelectNonInterleaved) { } } } + +TEST(BitVectorBenchmarkTest, SelectZeroNonInterleaved) { + for (size_t n = 8; n <= (1ull << 34); n <<= 2) { + std::mt19937_64 rng(42); + + std::vector bits(((8 + n / 64) / 8) * 8); + for (auto& x : bits) { + x = rng(); + } + pixie::BitVector bv(bits, n); + + auto max_rank0 = bv.rank0(bv.size()) + 1; + + for (int i = 0; i < 100000; i++) { + uint64_t rank0 = rng() % max_rank0; + bv.select0(rank0); + } + } +} diff --git a/src/benchmarks.cpp b/src/benchmarks.cpp index e2586fa..54a0ea6 100644 --- a/src/benchmarks.cpp +++ b/src/benchmarks.cpp @@ -24,6 +24,22 @@ static void BM_RankNonInterleaved(benchmark::State& state) { } } +static void BM_RankZeroNonInterleaved(benchmark::State& state) { + size_t n = state.range(0); + std::mt19937_64 rng(42); + + std::vector bits(((8 + n / 64) / 8) * 8); + for (auto& x : bits) { + x = rng(); + } + pixie::BitVector bv(bits, n); + + for (auto _ : state) { + uint64_t pos = rng() % n; + benchmark::DoNotOptimize(bv.rank0(pos)); + } +} + static void BM_RankInterleaved(benchmark::State& state) { size_t n = state.range(0); std::mt19937_64 rng(42); @@ -58,6 +74,24 @@ static void BM_SelectNonInterleaved(benchmark::State& state) { } } +static void BM_SelectZeroNonInterleaved(benchmark::State& state) { + size_t n = state.range(0); + std::mt19937_64 rng(42); + + std::vector bits(((8 + n / 64) / 8) * 8); + for (auto& x : bits) { + x = rng(); + } + pixie::BitVector bv(bits, n); + + auto max_rank0 = bv.rank0(bv.size()) + 1; + + for (auto _ : state) { + uint64_t rank0 = rng() % max_rank0; + benchmark::DoNotOptimize(bv.select0(rank0)); + } +} + static void BM_RankNonInterleaved10PercentFill(benchmark::State& state) { size_t n = state.range(0); std::mt19937_64 rng(42); @@ -77,6 +111,25 @@ static void BM_RankNonInterleaved10PercentFill(benchmark::State& state) { } } +static void BM_RankZeroNonInterleaved10PercentFill(benchmark::State& state) { + size_t n = state.range(0); + std::mt19937_64 rng(42); + + std::vector bits(((8 + n / 64) / 8) * 8); + size_t num_ones = n * 0.1; + for (int i = 0; i < num_ones; i++) { + uint64_t pos = rng() % n; + bits[pos / 64] |= (1ULL << pos % 64); + } + + pixie::BitVector bv(bits, n); + + for (auto _ : state) { + uint64_t pos = rng() % n; + benchmark::DoNotOptimize(bv.rank0(pos)); + } +} + static void BM_SelectNonInterleaved10PercentFill(benchmark::State& state) { size_t n = state.range(0); std::mt19937_64 rng(42); @@ -98,6 +151,27 @@ static void BM_SelectNonInterleaved10PercentFill(benchmark::State& state) { } } +static void BM_SelectZeroNonInterleaved10PercentFill(benchmark::State& state) { + size_t n = state.range(0); + std::mt19937_64 rng(42); + + std::vector bits(((8 + n / 64) / 8) * 8); + size_t num_ones = n * 0.1; + for (int i = 0; i < num_ones; i++) { + uint64_t pos = rng() % n; + bits[pos / 64] |= (1ULL << pos % 64); + } + + pixie::BitVector bv(bits, n); + + auto max_rank0 = bv.rank0(bv.size()) + 1; + + for (auto _ : state) { + uint64_t rank0 = rng() % max_rank0; + benchmark::DoNotOptimize(bv.select0(rank0)); + } +} + static void BM_RankNonInterleaved90PercentFill(benchmark::State& state) { size_t n = state.range(0); std::mt19937_64 rng(42); @@ -117,6 +191,25 @@ static void BM_RankNonInterleaved90PercentFill(benchmark::State& state) { } } +static void BM_RankZeroNonInterleaved90PercentFill(benchmark::State& state) { + size_t n = state.range(0); + std::mt19937_64 rng(42); + + std::vector bits(((8 + n / 64) / 8) * 8); + size_t num_ones = n * 0.9; + for (int i = 0; i < num_ones; i++) { + uint64_t pos = rng() % n; + bits[pos / 64] |= (1ULL << pos % 64); + } + + pixie::BitVector bv(bits, n); + + for (auto _ : state) { + uint64_t pos = rng() % n; + benchmark::DoNotOptimize(bv.rank0(pos)); + } +} + static void BM_SelectNonInterleaved90PercentFill(benchmark::State& state) { size_t n = state.range(0); std::mt19937_64 rng(42); @@ -138,13 +231,40 @@ static void BM_SelectNonInterleaved90PercentFill(benchmark::State& state) { } } +static void BM_SelectZeroNonInterleaved90PercentFill(benchmark::State& state) { + size_t n = state.range(0); + std::mt19937_64 rng(42); + + std::vector bits(((8 + n / 64) / 8) * 8); + size_t num_ones = n * 0.9; + for (int i = 0; i < num_ones; i++) { + uint64_t pos = rng() % n; + bits[pos / 64] |= (1ULL << pos % 64); + } + + pixie::BitVector bv(bits, n); + + auto max_rank0 = bv.rank0(bv.size()) + 1; + + for (auto _ : state) { + uint64_t rank0 = rng() % max_rank0; + benchmark::DoNotOptimize(bv.select(rank0)); + } +} + +BENCHMARK(BM_RankInterleaved) + ->ArgNames({"n"}) + ->RangeMultiplier(4) + ->Range(8, 1ull << 34) + ->Iterations(100000); + BENCHMARK(BM_RankNonInterleaved) ->ArgNames({"n"}) ->RangeMultiplier(4) ->Range(8, 1ull << 34) ->Iterations(100000); -BENCHMARK(BM_RankInterleaved) +BENCHMARK(BM_RankZeroNonInterleaved) ->ArgNames({"n"}) ->RangeMultiplier(4) ->Range(8, 1ull << 34) @@ -156,26 +276,56 @@ BENCHMARK(BM_SelectNonInterleaved) ->Range(8, 1ull << 34) ->Iterations(100000); +BENCHMARK(BM_SelectZeroNonInterleaved) + ->ArgNames({"n"}) + ->RangeMultiplier(4) + ->Range(8, 1ull << 34) + ->Iterations(100000); + BENCHMARK(BM_RankNonInterleaved10PercentFill) ->ArgNames({"n"}) ->RangeMultiplier(4) ->Range(8, 1ull << 34) ->Iterations(100000); +BENCHMARK(BM_RankZeroNonInterleaved10PercentFill) + ->ArgNames({"n"}) + ->RangeMultiplier(4) + ->Range(8, 1ull << 34) + ->Iterations(100000); + BENCHMARK(BM_SelectNonInterleaved10PercentFill) ->ArgNames({"n"}) ->RangeMultiplier(4) ->Range(8, 1ull << 34) ->Iterations(100000); +BENCHMARK(BM_SelectZeroNonInterleaved10PercentFill) + ->ArgNames({"n"}) + ->RangeMultiplier(4) + ->Range(8, 1ull << 34) + ->Iterations(100000); + BENCHMARK(BM_RankNonInterleaved90PercentFill) ->ArgNames({"n"}) ->RangeMultiplier(4) ->Range(8, 1ull << 34) ->Iterations(100000); +BENCHMARK(BM_RankZeroNonInterleaved90PercentFill) + ->ArgNames({"n"}) + ->RangeMultiplier(4) + ->Range(8, 1ull << 34) + ->Iterations(100000); + BENCHMARK(BM_SelectNonInterleaved90PercentFill) ->ArgNames({"n"}) ->RangeMultiplier(4) ->Range(8, 1ull << 34) ->Iterations(100000); + +BENCHMARK(BM_SelectZeroNonInterleaved90PercentFill) + ->ArgNames({"n"}) + ->RangeMultiplier(4) + ->Range(8, 1ull << 34) + ->Iterations(100000); diff --git a/src/louds_tree_benchmarks.cpp b/src/louds_tree_benchmarks.cpp new file mode 100644 index 0000000..4091aa3 --- /dev/null +++ b/src/louds_tree_benchmarks.cpp @@ -0,0 +1,116 @@ +#include + +#include +#include +#include + +#include "louds_tree.h" +#include "utils.h" + +using pixie::LoudsNode; +using pixie::LoudsTree; + +/** + * DFS with O(1) extra memory + */ +static void BM_LoudsTreeDFS(benchmark::State& state) { + size_t tree_size = state.range(0); + std::mt19937_64 rng(42); + + for (auto _ : state) { + state.PauseTiming(); + + std::vector> adj = generate_random_tree(tree_size, rng); + adj = bfs_order(tree_size, adj); + std::vector louds = adj_to_louds(tree_size, adj); + LoudsTree tree(louds, tree_size); + + LoudsNode cur = tree.root(); + bool above = 1; + + state.ResumeTiming(); + + benchmark::DoNotOptimize(cur); + + while (true) { + if (above) { + if (tree.is_leaf(cur)) { + above = 0; + } else { + cur = tree.first_child(cur); + } + benchmark::DoNotOptimize(cur); + } else { + if (tree.is_last_child(cur)) { + cur = tree.parent(cur); + if (tree.is_root(cur)) { + break; + } + benchmark::DoNotOptimize(cur); + } else { + cur = tree.next_sibling(cur); + above = 1; + benchmark::DoNotOptimize(cur); + } + } + } + } +} + +/** + * DFS with O(1) extra memory + */ +static void BM_AdjListTreeDFS(benchmark::State& state) { + size_t tree_size = state.range(0); + std::mt19937_64 rng(42); + + for (auto _ : state) { + state.PauseTiming(); + + std::vector> adj = generate_random_tree(tree_size, rng); + adj = bfs_order(tree_size, adj); + AdjListTree tree(adj); + + AdjListNode cur = tree.root(); + bool above = 1; + + state.ResumeTiming(); + + benchmark::DoNotOptimize(cur); + + while (true) { + if (above) { + if (tree.is_leaf(cur)) { + above = 0; + } else { + cur = tree.first_child(cur); + } + benchmark::DoNotOptimize(cur); + } else { + if (tree.is_last_child(cur)) { + cur = tree.parent(cur); + if (tree.is_root(cur)) { + break; + } + benchmark::DoNotOptimize(cur); + } else { + cur = tree.next_sibling(cur); + above = 1; + benchmark::DoNotOptimize(cur); + } + } + } + } +} + +BENCHMARK(BM_LoudsTreeDFS) + ->ArgNames({"tree_size"}) + ->RangeMultiplier(2) + ->Range(1ull << 8, 1ull << 22) + ->Iterations(10); + +BENCHMARK(BM_AdjListTreeDFS) + ->ArgNames({"tree_size"}) + ->RangeMultiplier(2) + ->Range(1ull << 8, 1ull << 22) + ->Iterations(10); diff --git a/src/louds_tree_tests.cpp b/src/louds_tree_tests.cpp new file mode 100644 index 0000000..9744984 --- /dev/null +++ b/src/louds_tree_tests.cpp @@ -0,0 +1,115 @@ +#include "louds_tree.h" + +#include + +#include +#include +#include +#include + +#include "utils.h" + +using pixie::LoudsNode; +using pixie::LoudsTree; + +TEST(LoudsTreeTest, Basic) { + std::vector> adj = {{0, 1}, {0, 2}, {1, 3}, {2, 4}, {3}}; + size_t tree_size = 5; + + std::vector louds = adj_to_louds(tree_size, adj); + + LoudsTree louds_tree(louds, 5); + AdjListTree debug_tree(adj); + + LoudsNode cur = louds_tree.root(); + AdjListNode debug = debug_tree.root(); + for (size_t i = 0; i < tree_size - 1; i++) { + EXPECT_EQ(cur, debug); + cur = louds_tree.child(cur, 0); + debug = debug_tree.child(debug, 0); + } + EXPECT_EQ(cur, debug); +} + +TEST(LoudsTreeTest, RandomTreeDFS) { + for (size_t tree_size = 8; tree_size < (1 << 22); tree_size <<= 1) { + std::mt19937_64 rng(42); + std::vector> adj = generate_random_tree(tree_size, rng); + adj = bfs_order(tree_size, adj); + std::vector louds = adj_to_louds(tree_size, adj); + LoudsTree louds_tree(louds, tree_size); + AdjListTree debug_tree(adj); + + std::stack> st; + + st.push({louds_tree.root(), debug_tree.root()}); + + while (!st.empty()) { + auto cur = st.top().first; + auto debug = st.top().second; + st.pop(); + EXPECT_EQ(cur, debug); + EXPECT_EQ(louds_tree.parent(cur), debug_tree.parent(debug)); + EXPECT_EQ(louds_tree.is_last_child(cur), debug_tree.is_last_child(debug)); + + if (!debug_tree.is_last_child(debug)) { + EXPECT_EQ(louds_tree.next_sibling(cur), debug_tree.next_sibling(debug)); + } + + size_t deg = louds_tree.degree(cur); + EXPECT_EQ(deg, debug_tree.degree(debug)); + + for (size_t i = 0; i < deg; i++) { + st.push({louds_tree.child(cur, i), debug_tree.child(debug, i)}); + } + } + } +} + +TEST(LoudsTreeTest, RandomTreeDFSWithZeroExtraMemory) { + for (size_t tree_size = 8; tree_size < (1 << 22); tree_size <<= 1) { + std::mt19937_64 rng(42); + std::vector> adj = generate_random_tree(tree_size, rng); + adj = bfs_order(tree_size, adj); + std::vector louds = adj_to_louds(tree_size, adj); + LoudsTree louds_tree(louds, tree_size); + AdjListTree debug_tree(adj); + + LoudsNode cur = louds_tree.root(); + AdjListNode debug = debug_tree.root(); + + bool above = 1; + size_t cnt_visited = 1; + while (true) { + if (above) { + ASSERT_EQ(louds_tree.is_leaf(cur), debug_tree.is_leaf(debug)); + if (louds_tree.is_leaf(cur)) { + above = 0; + } else { + cur = louds_tree.first_child(cur); + debug = debug_tree.first_child(debug); + ASSERT_EQ(cur, debug); + cnt_visited++; + } + } else { + ASSERT_EQ(louds_tree.is_last_child(cur), + debug_tree.is_last_child(debug)); + if (louds_tree.is_last_child(cur)) { + cur = louds_tree.parent(cur); + debug = debug_tree.parent(debug); + ASSERT_EQ(cur, debug); + if (louds_tree.is_root(cur)) { + break; + } + } else { + cur = louds_tree.next_sibling(cur); + debug = debug_tree.next_sibling(debug); + ASSERT_EQ(cur, debug); + above = 1; + cnt_visited++; + } + } + } + ASSERT_EQ(tree_size, cnt_visited); + } +} diff --git a/src/unittests.cpp b/src/unittests.cpp index dddb0b7..27d037c 100644 --- a/src/unittests.cpp +++ b/src/unittests.cpp @@ -89,6 +89,14 @@ TEST(Select512, Ones) { } } +TEST(SelectZero512, Zeros) { + std::array a{0, 0, 0, 0, 0, 0, 0, 0}; + for (size_t i = 0; i < 512; ++i) { + auto p = select0_512(a.data(), i); + EXPECT_EQ(p, i); + } +} + TEST(Select512, Random) { std::array a{std::numeric_limits::max(), std::numeric_limits::max(), @@ -114,6 +122,31 @@ TEST(Select512, Random) { } } +TEST(SelectZero512, Random) { + std::array a{std::numeric_limits::max(), + std::numeric_limits::max(), + std::numeric_limits::max(), + std::numeric_limits::max(), + std::numeric_limits::max(), + std::numeric_limits::max(), + std::numeric_limits::max(), + std::numeric_limits::max()}; + + std::mt19937_64 rng(42); + for (size_t t = 0; t < 1000; ++t) { + for (auto& x : a) { + x = rng(); + } + size_t rank = 0; + for (size_t i = 0; i < 512; ++i) { + if ((1 & (a[i >> 6] >> (i & 63))) == 0) { + auto p = select0_512(a.data(), rank++); + ASSERT_EQ(p, i); + } + } + } +} + TEST(Select512, RankCompativility) { std::array a{std::numeric_limits::max(), std::numeric_limits::max(), @@ -209,7 +242,7 @@ TEST(BitVectorTest, RankWithZeros) { TEST(BitVectorTest, SelectBasic) { std::vector bits(8, 0); bits[0] = 0b1100010110010110; - BitVector bv(bits, 5); + BitVector bv(bits, 16); EXPECT_EQ(bv.select(1), 1); EXPECT_EQ(bv.select(2), 2); @@ -255,6 +288,78 @@ TEST(BitVectorTest, MainSelectTest) { } } +TEST(BitVectorTest, RankZeroBasic) { + std::vector bits(8, 0); + bits[0] = 0b10110; + BitVector bv(bits, 10); + + EXPECT_EQ(bv.rank0(0), 0); + EXPECT_EQ(bv.rank0(1), 1); + EXPECT_EQ(bv.rank0(2), 1); + EXPECT_EQ(bv.rank0(3), 1); + EXPECT_EQ(bv.rank0(4), 2); + EXPECT_EQ(bv.rank0(5), 2); +} + +TEST(BitVectorTest, RankZeroWithOnes) { + std::vector bits(8, 0); + BitVector bv(bits, 5); + + for (size_t i = 0; i <= 5; i++) { + EXPECT_EQ(bv.rank0(i), i); + } +} + +TEST(BitVectorTest, SelectZeroBasic) { + std::vector bits(8, 0); + bits[0] = 0b1100010110010110; + BitVector bv(bits, 16); + + EXPECT_EQ(bv.select0(1), 0); + EXPECT_EQ(bv.select0(2), 3); + EXPECT_EQ(bv.select0(3), 5); + EXPECT_EQ(bv.select0(4), 6); + EXPECT_EQ(bv.select0(5), 9); + EXPECT_EQ(bv.select0(6), 11); + EXPECT_EQ(bv.select0(7), 12); + EXPECT_EQ(bv.select0(8), 13); +} + +TEST(BitVectorTest, MainRankZeroTest) { + std::mt19937_64 rng(42); + std::vector bits(65536 * 32); + for (size_t i = 0; i < bits.size(); i++) { + bits[i] = rng(); + } + + BitVector bv(bits, bits.size() * 64); + uint64_t zero_count = 0; + for (size_t i = 0; i < bv.size(); ++i) { + ASSERT_EQ(zero_count, bv.rank0(i)); + zero_count += (bv[i] == 0) ? 1 : 0; + } +} + +TEST(BitVectorTest, MainSelectZeroTest) { + std::mt19937_64 rng(42); + std::vector bits(65536 * 32); + for (size_t i = 0; i < bits.size(); i++) { + bits[i] = rng(); + } + + BitVector bv(bits, bits.size() * 64); + uint64_t zero_rank = 0; + + for (size_t i = 0; i < bv.size(); ++i) { + if (bv[i] == 0) { + zero_rank++; + EXPECT_EQ(bv.select0(zero_rank), i); + EXPECT_EQ(bv.rank0(i), zero_rank - 1); + EXPECT_EQ(bv.rank0(i + 1), zero_rank); + } + } +} + TEST(BitVectorInterleavedTest, AtTest) { std::mt19937_64 rng(42); /* @@ -330,6 +435,52 @@ TEST(LowerBound8x64, Random) { } } +TEST(LowerBoundDlt4x64, Random) { + std::vector x(4); + uint64_t dlt_array[4]; + std::mt19937_64 rng(42); + for (size_t i = 0; i < 100000; i++) { + uint64_t y = rng(); + uint64_t dlt_scalar = rng(); + uint16_t cnt = 0; + bool fl = 1; + for (size_t j = 0; j < 4; j++) { + dlt_array[j] = rng(); + x[j] = rng(); + fl &= dlt_scalar + dlt_array[j] - x[j] < y; + cnt += fl; + } + if (cnt < 4) { + ASSERT_EQ(lower_bound_dlt_4x64(x.data(), y, dlt_array, dlt_scalar), cnt); + } else { + ASSERT_GE(lower_bound_dlt_4x64(x.data(), y, dlt_array, dlt_scalar), cnt); + } + } +} + +TEST(LowerBoundDlt8x64, Random) { + std::vector x(8); + uint64_t dlt_array[8]; + std::mt19937_64 rng(42); + for (size_t i = 0; i < 100000; i++) { + uint64_t y = rng(); + uint64_t dlt_scalar = rng(); + uint16_t cnt = 0; + bool fl = 1; + for (size_t j = 0; j < 8; j++) { + dlt_array[j] = rng(); + x[j] = rng(); + fl &= dlt_scalar + dlt_array[j] - x[j] < y; + cnt += fl; + } + if (cnt < 8) { + ASSERT_EQ(lower_bound_dlt_8x64(x.data(), y, dlt_array, dlt_scalar), cnt); + } else { + ASSERT_GE(lower_bound_dlt_8x64(x.data(), y, dlt_array, dlt_scalar), cnt); + } + } +} + TEST(LowerBound32x16, Random) { std::vector x(32); std::mt19937 rng(42); @@ -344,6 +495,28 @@ TEST(LowerBound32x16, Random) { } } +TEST(LowerBoundDlt32x16, Random) { + std::vector x(32); + uint16_t dlt_array[32]; + std::mt19937 rng(42); + for (size_t i = 0; i < 100000; i++) { + uint16_t y = rng(); + uint16_t dlt_scalar = rng(); + uint16_t cnt = 0; + for (size_t j = 0; j < 32; j++) { + x[j] = rng(); + if (dlt_scalar < x[j]) { + dlt_array[j] = + x[j] - dlt_scalar + rng() % ((1 << 16) - x[j] + dlt_scalar); + } else { + dlt_array[j] = rng() % ((1 << 16) - dlt_scalar); + } + cnt += dlt_array[j] + dlt_scalar - x[j] < y; + } + ASSERT_EQ(lower_bound_dlt_32x16(x.data(), y, dlt_array, dlt_scalar), cnt); + } +} + TEST(Popcount64x4, Random) { std::vector x(32); std::vector y(32);