Commit 68cdc43

guangyey and gujinghui authored
[doc] add xpu-memory-management (#2504) (#2560)

add usm python API deprecated warning

Co-authored-by: Jinghui <jinghui.gu@intel.com>
(cherry picked from commit 1344b88)
1 parent c6c1c1d commit 68cdc43

5 files changed: +94 −4 lines changed

csrc/include/xpu/Utils.h

Lines changed: 4 additions & 0 deletions
@@ -30,6 +30,8 @@ namespace dpcpp {
 /// @param strides: strides.
 /// @param device_id: device id.
 /// @returns: Tensor.
+C10_DEPRECATED_MESSAGE(
+    "fromUSM is deprecated. Please use the USM-based DLPack solution instead.")
 IPEX_API at::Tensor fromUSM(
     void* src,
     const at::ScalarType stype,
@@ -40,6 +42,8 @@ IPEX_API at::Tensor fromUSM(
 /// Get a pointer of unified shared memory from a tensor.
 /// @param src: Tensor.
 /// @returns: a pointer of unified shared memory.
+C10_DEPRECATED_MESSAGE(
+    "toUSM is deprecated. Please use the USM-based DLPack solution instead.")
 IPEX_API void* toUSM(const at::Tensor& src);
 
 } // namespace dpcpp

docs/tutorials/technical_details.rst

Lines changed: 21 additions & 2 deletions
@@ -39,12 +39,31 @@ Optimizers are a key part of the training workloads. Intel® Extension for PyTor
 2. SplitSGD for BF16 training, which reduces the memory footprint of the master weights by half. **[CPU]**
 
 
-For more detailed information, check `Optimizer Fusion on CPU <technical_details/optimizer_fusion_cpu.md>`_, `Optimizer Fusion on GPU <technical_details/optimizer_fusion_gpu.md>`_ and `Split SGD <technical_details/split_sgd.html>`_
-
 .. toctree::
    :hidden:
    :maxdepth: 1
 
    technical_details/optimizer_fusion_cpu
    technical_details/optimizer_fusion_gpu
    technical_details/split_sgd
+
+
+.. _xpu-memory-management:
+
+Memory Management [GPU]
+---------------------------------
+
+Intel® Extension for PyTorch* uses a caching memory allocator to speed up memory allocations. This allows fast memory deallocation without any overhead.
+Allocations are associated with a SYCL device. The allocator attempts to find the smallest cached block in the reserved block pool that fits the requested size.
+If it is unable to find an appropriate memory block among the already allocated areas, the allocator allocates a new block of memory.
+
+For more detailed information, check `Memory Management <technical_details/memory_management.html>`_.
+
+.. toctree::
+   :hidden:
+   :maxdepth: 1
+
+   technical_details/memory_management
+
+
+For more detailed information, check `Optimizer Fusion on CPU <technical_details/optimizer_fusion_cpu.md>`_, `Optimizer Fusion on GPU <technical_details/optimizer_fusion_gpu.md>`_, `Split SGD <technical_details/split_sgd.html>`_ and `Memory Management <technical_details/memory_management.html>`_
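
The caching behavior described in the new section can be observed directly from Python. A minimal sketch, not part of this commit, assuming an XPU device is present and that `import intel_extension_for_pytorch` populates the `torch.xpu` namespace referenced by the docs above:

    import torch
    import intel_extension_for_pytorch  # noqa: F401  # assumed to register the torch.xpu.* memory APIs

    # Allocating a tensor makes the caching allocator reserve a block on the device.
    x = torch.empty(64 * 1024 * 1024, dtype=torch.float32, device="xpu")
    print("allocated:", torch.xpu.memory_allocated())  # bytes currently owned by tensors
    print("reserved: ", torch.xpu.memory_reserved())   # bytes held in the allocator's pool

    # Deleting the tensor returns its block to the cache: "allocated" drops,
    # while "reserved" typically stays the same because the block is kept for reuse.
    del x
    print("after del:", torch.xpu.memory_allocated(), torch.xpu.memory_reserved())

    # empty_cache() releases the unused cached blocks back to the device so other
    # GPU applications can use them; it does not free memory still owned by tensors.
    torch.xpu.empty_cache()
    print("after empty_cache:", torch.xpu.memory_reserved())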
docs/tutorials/technical_details/memory_management.rst

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+Memory Management
+=================
+
+You can use :meth:`~torch.xpu.memory_allocated` and
+:meth:`~torch.xpu.max_memory_allocated` to monitor memory occupied by
+tensors, and use :meth:`~torch.xpu.memory_reserved` and
+:meth:`~torch.xpu.max_memory_reserved` to monitor the total amount of memory
+managed by the caching allocator. Calling :meth:`~torch.xpu.empty_cache`
+releases all **unused** cached memory from PyTorch so that it can be used
+by other GPU applications. However, GPU memory occupied by tensors will not
+be freed, so it cannot increase the amount of GPU memory available for PyTorch.
+
+For more advanced users, we offer more comprehensive memory benchmarking via
+:meth:`~torch.xpu.memory_stats`. We also offer the capability to capture a
+complete snapshot of the memory allocator state via
+:meth:`~torch.xpu.memory_snapshot`, which can help you understand the
+underlying allocation patterns produced by your code.
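
To make the monitoring calls in this new document concrete, here is a short, hedged usage sketch (again assuming an XPU device and the `torch.xpu` namespace provided via `intel_extension_for_pytorch`):

    import torch
    import intel_extension_for_pytorch  # noqa: F401  # assumption: exposes the torch.xpu.* memory APIs

    def report(tag: str) -> None:
        # Current and peak usage for tensor-owned memory (allocated)
        # and for the caching allocator's pool (reserved).
        print(
            f"{tag}: allocated={torch.xpu.memory_allocated()}"
            f" max_allocated={torch.xpu.max_memory_allocated()}"
            f" reserved={torch.xpu.memory_reserved()}"
            f" max_reserved={torch.xpu.max_memory_reserved()}"
        )

    report("start")
    a = torch.randn(1024, 1024, device="xpu")
    b = a @ a                  # the matmul may push the peak counters up
    report("after matmul")
    del a, b
    torch.xpu.empty_cache()    # releases unused cached blocks only
    report("after cleanup")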

intel_extension_for_pytorch/xpu/memory.py

Lines changed: 36 additions & 2 deletions
@@ -14,7 +14,8 @@ def empty_cache() -> None:
     .. note::
         :func:`~torch.xpu.empty_cache` doesn't increase the amount of GPU
         memory available for PyTorch. However, it may help reduce fragmentation
-        of GPU memory in certain cases.
+        of GPU memory in certain cases. See :ref:`xpu-memory-management` for
+        more details about GPU memory management.
     """
     intel_extension_for_pytorch._C._emptyCache()
 
@@ -73,6 +74,10 @@ def memory_stats(device: Union[Device, int] = None) -> Dict[str, Any]:
         device (torch.device or int, optional): selected device. Returns
             statistics for the current device, given by :func:`~torch.xpu.current_device`,
             if :attr:`device` is ``None`` (default).
+
+    .. note::
+        See :ref:`xpu-memory-management` for more details about GPU memory
+        management.
     """
     result = []
 
@@ -109,6 +114,10 @@ def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None:
         device (torch.device or int, optional): selected device. Returns
            statistic for the current device, given by :func:`~torch.xpu.current_device`,
            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        See :ref:`xpu-memory-management` for more details about GPU memory
+        management.
     """
     device = _get_device_index(device, optional=True)
     return intel_extension_for_pytorch._C._resetAccumulatedMemoryStats(device)
@@ -124,6 +133,10 @@ def reset_peak_memory_stats(device: Union[Device, int] = None) -> None:
         device (torch.device or int, optional): selected device. Returns
            statistic for the current device, given by :func:`~torch.xpu.current_device`,
            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        See :ref:`xpu-memory-management` for more details about GPU memory
+        management.
     """
     device = _get_device_index(device, optional=True)
     return intel_extension_for_pytorch._C._resetPeakMemoryStats(device)
@@ -141,7 +154,8 @@ def memory_allocated(device: Union[Device, int] = None) -> int:
     .. note::
         This is likely less than the amount shown in sysman toolkit since some
        unused memory can be held by the caching allocator and some context
-        needs to be created on GPU.
+        needs to be created on GPU. See :ref:`xpu-memory-management` for more
+        details about GPU memory management.
     """
     return memory_stats(device=device)["allocated_bytes.all.current"]
 
@@ -160,6 +174,10 @@ def max_memory_allocated(device: Union[Device, int] = None) -> int:
         device (torch.device or int, optional): selected device. Returns
            statistic for the current device, given by :func:`~torch.xpu.current_device`,
            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        See :ref:`xpu-memory-management` for more details about GPU memory
+        management.
     """
     return memory_stats(device=device)["allocated_bytes.all.peak"]
 
@@ -172,6 +190,10 @@ def memory_reserved(device: Union[Device, int] = None) -> int:
         device (torch.device or int, optional): selected device. Returns
            statistic for the current device, given by :func:`~torch.xpu.current_device`,
            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        See :ref:`xpu-memory-management` for more details about GPU memory
+        management.
     """
     return memory_stats(device=device)["reserved_bytes.all.current"]
 
@@ -190,6 +212,10 @@ def max_memory_reserved(device: Union[Device, int] = None) -> int:
         device (torch.device or int, optional): selected device. Returns
            statistic for the current device, given by :func:`~torch.xpu.current_device`,
            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        See :ref:`xpu-memory-management` for more details about GPU memory
+        management.
     """
     return memory_stats(device=device)["reserved_bytes.all.peak"]
 
@@ -199,6 +225,10 @@ def memory_snapshot():
 
     Interpreting the output of this function requires familiarity with the
     memory allocator internals.
+
+    .. note::
+        See :ref:`xpu-memory-management` for more details about GPU memory
+        management.
     """
     return intel_extension_for_pytorch._C._memorySnapshot()
 
@@ -216,6 +246,10 @@ def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False)
            if :attr:`device` is ``None`` (default).
         abbreviated (bool, optional): whether to return an abbreviated summary
            (default: False).
+
+    .. note::
+        See :ref:`xpu-memory-management` for more details about GPU memory
+        management.
     """
     device = _get_device_index(device, optional=True)
     stats = memory_stats(device=device)
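
The statistics keys returned above (`allocated_bytes.all.current`, `allocated_bytes.all.peak`, `reserved_bytes.all.current`, `reserved_bytes.all.peak`) can also be read directly from `memory_stats`. A rough sketch of how these helpers fit together, assuming the same `torch.xpu` setup as in the earlier examples; the tensor shapes are arbitrary:

    import torch
    import intel_extension_for_pytorch  # noqa: F401  # assumption: provides the torch.xpu memory helpers

    x = torch.randn(4096, 4096, device="xpu")

    stats = torch.xpu.memory_stats()  # flat {key: value} dictionary of counters
    print(stats["allocated_bytes.all.current"])
    print(stats["allocated_bytes.all.peak"])
    print(stats["reserved_bytes.all.current"])

    # Human-readable roll-up of the same counters.
    print(torch.xpu.memory_summary(abbreviated=True))

    # Reset the peak trackers before measuring a specific region of code.
    torch.xpu.reset_peak_memory_stats()
    y = x @ x
    print("peak during matmul:", torch.xpu.max_memory_allocated())

    # Full allocator snapshot; interpreting it requires familiarity with the
    # allocator internals, as the docstring above notes.
    snapshot = torch.xpu.memory_snapshot()
    print(type(snapshot))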

intel_extension_for_pytorch/xpu/utils.py

Lines changed: 16 additions & 0 deletions
@@ -2,6 +2,7 @@
 import torch
 from .. import _C
 from enum import Enum
+import warnings
 from .. import frontend
 import intel_extension_for_pytorch  # noqa
 
@@ -25,12 +26,27 @@ def from_usm(src, dtype, shape, stride = None, device_id: int = -1) -> torch.Ten
            returned tensor is contiguous.
        device_id: the root device id where the USM pointer is allocated. Default: -1,
            if the user is not sure.
+
+    Warning: This is deprecated. Please use torch.from_dlpack instead.
     """
 
+    warnings.warn("from_usm is deprecated. Please use torch.from_dlpack instead.")
     return _C._from_usm(src, dtype, shape, stride, device_id)
 
 
 def to_usm(src: torch.Tensor):
+    """to_usm(src: torch.Tensor) -> PyCapsule
+
+    Converts a torch tensor allocated in USM (Unified Shared Memory) into a ``PyCapsule``,
+    which encapsulates a USM data pointer address.
+
+    Args:
+        src: a torch tensor.
+
+    Warning: This is deprecated. Please use torch.to_dlpack instead.
+    """
+
+    warnings.warn("to_usm is deprecated. Please use torch.to_dlpack instead.")
     return _C._to_usm(src)
 
 
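Since the warnings added here point users at DLPack, a minimal migration sketch may help. It assumes an XPU tensor, the standard `torch.from_dlpack` / `torch.utils.dlpack.to_dlpack` helpers, and that the deprecated functions are reachable as `ipex.xpu.from_usm` / `ipex.xpu.to_usm` (shown only in comments for contrast):

    import torch
    from torch.utils.dlpack import to_dlpack
    import intel_extension_for_pytorch as ipex  # noqa: F401

    x = torch.arange(16, dtype=torch.float32, device="xpu")

    # Deprecated path, which now emits a warning per this commit (illustrative):
    #   capsule = ipex.xpu.to_usm(x)
    #   y = ipex.xpu.from_usm(capsule, torch.float32, (16,))

    # Recommended USM-based DLPack path:
    capsule = to_dlpack(x)           # wrap the tensor's memory in a DLPack capsule
    y = torch.from_dlpack(capsule)   # rebuild a tensor sharing the same memory

    y[0] = 42.0
    print(x[0].item())               # 42.0 -- x and y share the same underlying storage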
