11use crate :: error:: { CudaResult , DropResult , ToResult } ;
22use crate :: memory:: device:: { AsyncCopyDestination , CopyDestination , DeviceSlice } ;
33use crate :: memory:: malloc:: { cuda_free, cuda_malloc} ;
4- use crate :: memory:: DeviceCopy ;
5- use crate :: memory:: DevicePointer ;
4+ use crate :: memory:: { cuda_free_async , DevicePointer } ;
5+ use crate :: memory:: { cuda_malloc_async , DeviceCopy } ;
66use crate :: stream:: Stream ;
77use crate :: sys as cuda;
8- use std:: mem;
8+ #[ cfg_attr( docsrs, doc( cfg( feature = "bytemuck" ) ) ) ]
9+ pub use bytemuck;
10+ #[ cfg( feature = "bytemuck" ) ]
11+ use bytemuck:: { Pod , PodCastError , Zeroable } ;
12+ use std:: mem:: { self , align_of, size_of, transmute, ManuallyDrop } ;
913use std:: ops:: { Deref , DerefMut } ;
1014
1115/// Fixed-size device-side buffer. Provides basic access to device memory.
1216#[ derive( Debug ) ]
1317#[ repr( C ) ]
1418pub struct DeviceBuffer < T : DeviceCopy > {
1519 buf : DevicePointer < T > ,
16- capacity : usize ,
20+ len : usize ,
1721}
1822
1923unsafe impl < T : Send + DeviceCopy > Send for DeviceBuffer < T > { }
@@ -42,57 +46,84 @@ impl<T: DeviceCopy> DeviceBuffer<T> {
4246 /// buffer.copy_from(&[0u64, 1, 2, 3, 4]).unwrap();
4347 /// ```
4448 pub unsafe fn uninitialized ( size : usize ) -> CudaResult < Self > {
45- let ptr = if size > 0 && mem :: size_of :: < T > ( ) > 0 {
49+ let ptr = if size > 0 && size_of :: < T > ( ) > 0 {
4650 cuda_malloc ( size) ?
4751 } else {
4852 // FIXME (AL): Do we /really/ want to allow creating an invalid buffer?
4953 DevicePointer :: null ( )
5054 } ;
5155 Ok ( DeviceBuffer {
5256 buf : ptr,
53- capacity : size,
57+ len : size,
5458 } )
5559 }
5660
57- /// Allocate a new device buffer large enough to hold `size` `T`'s and fill the contents with
58- /// zeroes (`0u8`).
61+ /// Allocates device memory asynchronously on a stream, without initializing it.
5962 ///
60- /// # Errors
61- ///
62- /// If the allocation fails, returns the error from CUDA. If `size` is large enough that
63- /// `size * mem::sizeof::<T>()` overflows usize, then returns InvalidMemoryAllocation.
63+ /// This doesn't actually allocate if `T` is zero sized.
6464 ///
6565 /// # Safety
6666 ///
67- /// The backing memory is zeroed, which may not be a valid bit-pattern for type `T`. The caller
68- /// must ensure either that all-zeroes is a valid bit-pattern for type `T` or that the backing
69- /// memory is set to a valid value before it is read.
67+ /// The allocated memory retains all of the unsafety of [`DeviceBuffer::uninitialized`], with
68+ /// the additional consideration that the memory cannot be used until it is actually allocated
69+ /// on the stream. This means proper stream ordering semantics must be followed, such as
70+ /// only enqueing kernel launches that use the memory AFTER the allocation call.
7071 ///
71- /// # Examples
72- ///
73- /// ```
74- /// # let _context = cust::quick_init().unwrap();
75- /// use cust::memory::*;
76- /// let buffer = unsafe { DeviceBuffer::zeroed(5).unwrap() };
77- /// let mut host_values = [1u64, 2, 3, 4, 5];
78- /// buffer.copy_to(&mut host_values).unwrap();
79- /// assert_eq!([0u64, 0, 0, 0, 0], host_values);
80- /// ```
81- pub unsafe fn zeroed ( size : usize ) -> CudaResult < Self > {
82- let ptr = if size > 0 && mem:: size_of :: < T > ( ) > 0 {
83- let ptr = cuda_malloc ( size) ?;
84- cuda:: cuMemsetD8_v2 ( ptr. as_raw ( ) , 0 , size * mem:: size_of :: < T > ( ) ) . to_result ( ) ?;
85- ptr
72+ /// You can synchronize the stream to ensure the memory allocation operation is complete.
73+ pub unsafe fn uninitialized_async ( size : usize , stream : & Stream ) -> CudaResult < Self > {
74+ let ptr = if size > 0 && size_of :: < T > ( ) > 0 {
75+ cuda_malloc_async ( stream, size) ?
8676 } else {
87- // FIXME (AL): Do we /really/ want to allow creating an invalid buffer?
8877 DevicePointer :: null ( )
8978 } ;
9079 Ok ( DeviceBuffer {
9180 buf : ptr,
92- capacity : size,
81+ len : size,
9382 } )
9483 }
9584
85+ /// Enqueues an operation to free the memory backed by this [`DeviceBuffer`] on a
86+ /// particular stream. The stream will free the allocation as soon as it reaches
87+ /// the operation in the stream. You can ensure the memory is freed by synchronizing
88+ /// the stream.
89+ ///
90+ /// This function uses internal memory pool semantics. Async allocations will reserve memory
91+ /// in the default memory pool in the stream, and async frees will release the memory back to the pool
92+ /// for further use by async allocations.
93+ ///
94+ /// The memory inside of the pool is all freed back to the OS once the stream is synchronized unless
95+ /// a custom pool is configured to not do so.
96+ ///
97+ /// # Examples
98+ ///
99+ /// ```
100+ /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
101+ /// # let _context = cust::quick_init().unwrap();
102+ /// use cust::{memory::*, stream::*};
103+ /// let stream = Stream::new(StreamFlags::DEFAULT, None)?;
104+ /// let mut host_vals = [1, 2, 3];
105+ /// unsafe {
106+ /// let mut allocated = DeviceBuffer::from_slice_async(&[4u8, 5, 6], &stream)?;
107+ /// allocated.async_copy_to(&mut host_vals, &stream)?;
108+ /// allocated.drop_async(&stream)?;
109+ /// }
110+ /// // ensure all async ops are done before trying to access the value
111+ /// stream.synchronize()?;
112+ /// assert_eq!(host_vals, [4, 5, 6]);
113+ /// # Ok(())
114+ /// # }
115+ /// ```
116+ pub fn drop_async ( self , stream : & Stream ) -> CudaResult < ( ) > {
117+ if self . buf . is_null ( ) {
118+ return Ok ( ( ) ) ;
119+ }
120+ // make sure we dont run the normal destructor, otherwise a double drop will happen
121+ let me = ManuallyDrop :: new ( self ) ;
122+ // SAFETY: we consume the box so its not possible to use the box past its drop point unless
123+ // you keep around a pointer, but in that case, we cannot guarantee safety.
124+ unsafe { cuda_free_async ( stream, me. buf ) }
125+ }
126+
96127 /// Creates a `DeviceBuffer<T>` directly from the raw components of another device buffer.
97128 ///
98129 /// # Safety
@@ -130,7 +161,10 @@ impl<T: DeviceCopy> DeviceBuffer<T> {
130161 /// let buffer = unsafe { DeviceBuffer::from_raw_parts(ptr, size) };
131162 /// ```
132163 pub unsafe fn from_raw_parts ( ptr : DevicePointer < T > , capacity : usize ) -> DeviceBuffer < T > {
133- DeviceBuffer { buf : ptr, capacity }
164+ DeviceBuffer {
165+ buf : ptr,
166+ len : capacity,
167+ }
134168 }
135169
136170 /// Destroy a `DeviceBuffer`, returning an error.
@@ -157,8 +191,8 @@ impl<T: DeviceCopy> DeviceBuffer<T> {
157191 return Ok ( ( ) ) ;
158192 }
159193
160- if dev_buf. capacity > 0 && mem :: size_of :: < T > ( ) > 0 {
161- let capacity = dev_buf. capacity ;
194+ if dev_buf. len > 0 && size_of :: < T > ( ) > 0 {
195+ let capacity = dev_buf. len ;
162196 let ptr = mem:: replace ( & mut dev_buf. buf , DevicePointer :: null ( ) ) ;
163197 unsafe {
164198 match cuda_free ( ptr) {
@@ -174,6 +208,132 @@ impl<T: DeviceCopy> DeviceBuffer<T> {
174208 }
175209 }
176210}
211+
#[cfg(feature = "bytemuck")]
impl<T: DeviceCopy + Zeroable> DeviceBuffer<T> {
    /// Allocate device memory and fill it with zeroes (`0u8`).
    ///
    /// This doesn't actually allocate (or issue a memset) if `size` is zero or `T` is
    /// zero-sized.
    ///
    /// # Errors
    ///
    /// Returns the error from CUDA if the allocation or the memset fails.
    ///
    /// # Examples
    ///
    /// ```
    /// # let _context = cust::quick_init().unwrap();
    /// use cust::memory::*;
    /// let mut zero = DeviceBuffer::zeroed(4).unwrap();
    /// let mut values = [1u8, 2, 3, 4];
    /// zero.copy_to(&mut values).unwrap();
    /// assert_eq!(values, [0; 4]);
    /// ```
    #[cfg_attr(docsrs, doc(cfg(feature = "bytemuck")))]
    pub fn zeroed(size: usize) -> CudaResult<Self> {
        // SAFETY: `T: Zeroable` guarantees that all-zero bytes are a valid `T`, and the whole
        // allocation is zeroed before the buffer is handed to the caller.
        unsafe {
            let new_buf = DeviceBuffer::uninitialized(size)?;
            let byte_len = size_of::<T>() * size;
            // An empty buffer wraps a null device pointer; never hand that to the driver.
            if byte_len != 0 {
                cuda::cuMemsetD8_v2(new_buf.as_device_ptr().as_raw(), 0, byte_len).to_result()?;
            }
            Ok(new_buf)
        }
    }

    /// Allocates device memory asynchronously and asynchronously fills it with zeroes (`0u8`).
    ///
    /// This doesn't actually allocate (or enqueue a memset) if `size` is zero or `T` is
    /// zero-sized.
    ///
    /// # Errors
    ///
    /// Returns the error from CUDA if enqueuing the allocation or the memset fails.
    ///
    /// # Safety
    ///
    /// This method enqueues two operations on the stream: An async allocation
    /// and an async memset. Because of this, you must ensure that:
    /// - The memory is not used in any way before it is actually allocated on the stream. You
    ///   can ensure this happens by synchronizing the stream explicitly or using events.
    ///
    /// # Examples
    ///
    /// ```
    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
    /// # let _context = cust::quick_init().unwrap();
    /// use cust::{memory::*, stream::*};
    /// let stream = Stream::new(StreamFlags::DEFAULT, None)?;
    /// let mut values = [1u8, 2, 3, 4];
    /// unsafe {
    ///     let mut zero = DeviceBuffer::zeroed_async(4, &stream)?;
    ///     zero.async_copy_to(&mut values, &stream)?;
    ///     zero.drop_async(&stream)?;
    /// }
    /// stream.synchronize()?;
    /// assert_eq!(values, [0; 4]);
    /// # Ok(())
    /// # }
    /// ```
    #[cfg_attr(docsrs, doc(cfg(feature = "bytemuck")))]
    pub unsafe fn zeroed_async(size: usize, stream: &Stream) -> CudaResult<Self> {
        let new_buf = DeviceBuffer::uninitialized_async(size, stream)?;
        let byte_len = size_of::<T>() * size;
        // An empty buffer wraps a null device pointer; never hand that to the driver.
        if byte_len != 0 {
            cuda::cuMemsetD8Async(
                new_buf.as_device_ptr().as_raw(),
                0,
                byte_len,
                stream.as_inner(),
            )
            .to_result()?;
        }
        Ok(new_buf)
    }
}
284+
/// Panics with a message naming the cast entry point (`src`) and the reason it failed (`err`).
///
/// Only compiled with the `bytemuck` feature: `PodCastError` is imported solely under that
/// feature, and the only callers are the feature-gated cast methods.
// Kept out of line and marked cold so the panic machinery stays off the successful cast path.
#[cfg(feature = "bytemuck")]
#[cold]
#[inline(never)]
fn casting_went_wrong(src: &str, err: PodCastError) -> ! {
    panic!("{}>{:?}", src, err);
}
288+
#[cfg(feature = "bytemuck")]
impl<A: DeviceCopy + Pod> DeviceBuffer<A> {
    /// Same as [`DeviceBuffer::try_cast`] but panics if the cast fails.
    ///
    /// # Panics
    ///
    /// See [`DeviceBuffer::try_cast`].
    #[cfg_attr(docsrs, doc(cfg(feature = "bytemuck")))]
    pub fn cast<B: Pod + DeviceCopy>(self) -> DeviceBuffer<B> {
        match Self::try_cast(self) {
            Ok(b) => b,
            Err(e) => casting_went_wrong("cast", e),
        }
    }

    /// Tries to convert a [`DeviceBuffer`] of type `A` to a [`DeviceBuffer`] of type `B`. Returning
    /// an error if it failed.
    ///
    /// The length of the buffer after the conversion may have changed. On success the returned
    /// buffer takes over the same device allocation; no memory is copied.
    ///
    /// # Failure
    ///
    /// - If the target type has a greater alignment requirement and the device pointer is not
    ///   aligned to it.
    /// - If the target element type is a different size and the output buffer wouldn't have a
    ///   whole number of elements. Such as `3` x [`u16`] -> `1.5` x [`u32`].
    /// - If either type is a ZST (but not both).
    ///
    /// The buffer is consumed either way: when the cast fails, `self` is dropped.
    #[cfg_attr(docsrs, doc(cfg(feature = "bytemuck")))]
    pub fn try_cast<B: Pod + DeviceCopy>(self) -> Result<DeviceBuffer<B>, PodCastError> {
        if align_of::<B>() > align_of::<A>() && (self.buf.as_raw() as usize) % align_of::<B>() != 0
        {
            Err(PodCastError::TargetAlignmentGreaterAndInputNotAligned)
        } else if size_of::<B>() == size_of::<A>() {
            // SAFETY: we made sure sizes were compatible, and DeviceBuffer is repr(C)
            Ok(unsafe { transmute::<_, DeviceBuffer<B>>(self) })
        } else if size_of::<A>() == 0 || size_of::<B>() == 0 {
            Err(PodCastError::SizeMismatch)
        } else if (size_of::<A>() * self.len) % size_of::<B>() == 0 {
            let new_len = (size_of::<A>() * self.len) / size_of::<B>();
            // Ownership of the allocation moves to the returned buffer. Without suppressing
            // the destructor of `self`, the memory would be freed when this function returns,
            // leaving the new buffer dangling and causing a second free when it is dropped.
            let this = ManuallyDrop::new(self);
            Ok(DeviceBuffer {
                buf: this.buf.cast(),
                len: new_len,
            })
        } else {
            Err(PodCastError::OutputSliceWouldHaveSlop)
        }
    }
}
336+
177337impl < T : DeviceCopy > DeviceBuffer < T > {
178338 /// Allocate a new device buffer of the same size as `slice`, initialized with a clone of
179339 /// the data in `slice`.
@@ -225,7 +385,7 @@ impl<T: DeviceCopy> DeviceBuffer<T> {
225385 /// }
226386 /// ```
227387 pub unsafe fn from_slice_async ( slice : & [ T ] , stream : & Stream ) -> CudaResult < Self > {
228- let mut uninit = DeviceBuffer :: uninitialized ( slice. len ( ) ) ?;
388+ let mut uninit = DeviceBuffer :: uninitialized_async ( slice. len ( ) , stream ) ?;
229389 uninit. async_copy_from ( slice, stream) ?;
230390 Ok ( uninit)
231391 }
@@ -256,13 +416,13 @@ impl<T: DeviceCopy> Drop for DeviceBuffer<T> {
256416 return ;
257417 }
258418
259- if self . capacity > 0 && mem :: size_of :: < T > ( ) > 0 {
419+ if self . len > 0 && size_of :: < T > ( ) > 0 {
260420 let ptr = mem:: replace ( & mut self . buf , DevicePointer :: null ( ) ) ;
261421 unsafe {
262422 let _ = cuda_free ( ptr) ;
263423 }
264424 }
265- self . capacity = 0 ;
425+ self . len = 0 ;
266426 }
267427}
268428
0 commit comments