-
Notifications
You must be signed in to change notification settings - Fork 1.8k
feat: Implement the statistics_cache function
#19054
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 2 commits
ab30e28
1a4596d
fd4748a
3ecbd51
03bc0a7
c1c0fb4
57c0dda
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,6 +17,7 @@ | |
|
|
||
| use crate::cache::cache_unit::DefaultFilesMetadataCache; | ||
| use crate::cache::CacheAccessor; | ||
| use datafusion_common::stats::Precision; | ||
| use datafusion_common::{Result, Statistics}; | ||
| use object_store::path::Path; | ||
| use object_store::ObjectMeta; | ||
|
|
@@ -32,8 +33,27 @@ use std::sync::Arc; | |
| /// session lifetime. | ||
| /// | ||
| /// See [`crate::runtime_env::RuntimeEnv`] for more details | ||
| pub type FileStatisticsCache = | ||
| Arc<dyn CacheAccessor<Path, Arc<Statistics>, Extra = ObjectMeta>>; | ||
| pub trait FileStatisticsCache: | ||
| CacheAccessor<Path, Arc<Statistics>, Extra = ObjectMeta> | ||
| { | ||
| /// Retrieves the information about the entries currently cached. | ||
| fn list_entries(&self) -> HashMap<Path, FileStatisticsCacheEntry>; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we want to make this thread safe? When we bring in size limit, we might want to evict the cache object as well
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not sure I understand, this should be thread safe as is right?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sorry, I got confused with an inner object created as HashMap, this makes sense here. |
||
| } | ||
|
|
||
| #[derive(Debug, Clone, PartialEq, Eq)] | ||
nuno-faria marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| /// Represents information about a cached statistics entry. | ||
| /// This is used to expose the statistics cache contents to outside modules. | ||
| pub struct FileStatisticsCacheEntry { | ||
| pub object_meta: ObjectMeta, | ||
| /// Number of table rows. | ||
| pub num_rows: Precision<usize>, | ||
| /// Number of table columns. | ||
| pub num_columns: usize, | ||
| /// Total table size, in bytes. | ||
| pub table_size_bytes: Precision<usize>, | ||
| /// Size of the statistics entry, in bytes. | ||
| pub statistics_size_bytes: usize, | ||
| } | ||
|
|
||
| /// Cache for storing the [`ObjectMeta`]s that result from listing a path | ||
| /// | ||
|
|
@@ -103,7 +123,7 @@ pub struct FileMetadataCacheEntry { | |
| pub extra: HashMap<String, String>, | ||
| } | ||
|
|
||
| impl Debug for dyn CacheAccessor<Path, Arc<Statistics>, Extra = ObjectMeta> { | ||
| impl Debug for dyn FileStatisticsCache { | ||
| fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { | ||
| write!(f, "Cache name: {} with length: {}", self.name(), self.len()) | ||
| } | ||
|
|
@@ -130,7 +150,7 @@ impl Debug for dyn FileMetadataCache { | |
| /// See [`CacheManagerConfig`] for configuration options. | ||
| #[derive(Debug)] | ||
| pub struct CacheManager { | ||
| file_statistic_cache: Option<FileStatisticsCache>, | ||
| file_statistic_cache: Option<Arc<dyn FileStatisticsCache>>, | ||
| list_files_cache: Option<ListFilesCache>, | ||
| file_metadata_cache: Arc<dyn FileMetadataCache>, | ||
| } | ||
|
|
@@ -161,7 +181,7 @@ impl CacheManager { | |
| } | ||
|
|
||
| /// Get the cache of listing files statistics. | ||
| pub fn get_file_statistic_cache(&self) -> Option<FileStatisticsCache> { | ||
| pub fn get_file_statistic_cache(&self) -> Option<Arc<dyn FileStatisticsCache>> { | ||
| self.file_statistic_cache.clone() | ||
| } | ||
|
|
||
|
|
@@ -188,7 +208,7 @@ pub struct CacheManagerConfig { | |
| /// Enable cache of files statistics when listing files. | ||
| /// Avoid get same file statistics repeatedly in same datafusion session. | ||
| /// Default is disable. Fow now only supports Parquet files. | ||
| pub table_files_statistics_cache: Option<FileStatisticsCache>, | ||
| pub table_files_statistics_cache: Option<Arc<dyn FileStatisticsCache>>, | ||
| /// Enable cache of file metadata when listing files. | ||
| /// This setting avoids listing file meta of the same path repeatedly | ||
| /// in same session, which may be expensive in certain situations (e.g. remote object storage). | ||
|
|
@@ -221,7 +241,7 @@ impl CacheManagerConfig { | |
| /// Default is `None` (disabled). | ||
| pub fn with_files_statistics_cache( | ||
| mut self, | ||
| cache: Option<FileStatisticsCache>, | ||
| cache: Option<Arc<dyn FileStatisticsCache>>, | ||
| ) -> Self { | ||
| self.table_files_statistics_cache = cache; | ||
| self | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This confirms that the file statistics cache is not populated when the table is created, only after accessing it once.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we want to provide a pre-warming option to it? Similar to metadataCache?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think so, it's being implemented here: #18971.