
Commit 7768124

update document (#851)
1 parent 500fc79 commit 7768124

File tree: 2 files changed (+9, -3 lines changed)


docs/tutorials/features/runtime_extension.md

Lines changed: 4 additions & 2 deletions
@@ -7,7 +7,7 @@ Intel® Extension for PyTorch\* Runtime Extension provides a couple of PyTorch f
 2. Spawn asynchronous tasks via the Python frontend module `intel_extension_for_pytorch.cpu.runtime.Task`.
 3. Configure core bindings for OpenMP threads via the Python frontend `intel_extension_for_pytorch.cpu.runtime.pin`.

-Please **note**: Intel® Extension for PyTorch\* Runtime extension is still in the **POC** stage. The API is subject to change. More detailed descriptions are available at [API Documentation page](../api_doc.html).
+Please **note**: Intel® Extension for PyTorch\* Runtime extension is still in the **Experimental** stage. The API is subject to change. More detailed descriptions are available at [API Documentation page](../api_doc.html).

 ## Requirements

@@ -17,7 +17,9 @@ Intel® Extension for PyTorch\* Runtime Extension relies on `intel omp` to bind

 ### Example of Multi Stream Module

-Runtime extension supports weight-sharing multi-stream inference for throughput mode on CPU. You just need to convert the original model into multi stream model and run the new multi stream model as normal. The detailed description of parameters to create `MultiStreamModule` is available at [API Documentation page](../api_doc.html)
+Runtime extension supports weight-sharing multi-stream inference for throughput mode on CPU. You just need to convert the original model into a multi-stream model and run the new multi-stream model as normal. The detailed description of the parameters used to create `MultiStreamModule` is available at [API Documentation page](../api_doc.html).
+
+`MultiStreamModule` aims to improve inference performance in throughput mode. We recommend creating a `MultiStreamModule` object with the `num_streams` parameter set to "AUTO", which heuristically decides the number of streams; this usually provides reasonable performance. However, it may still not be optimal in some cases (refer to the section [Performance recipes](#performance-recipes) for details), where manual tuning of the number of streams is needed.

 The `MultiStreamModule` creates a number of streams based on the input parameter `num_streams` and binds cores to the streams based on the input parameter `cpu_pool`. If the number of cores inside `cpu_pool` is divisible by `num_streams`, the cores are allocated equally to each stream. If it is not divisible, with remainder N, one extra core is allocated to each of the first N streams. We suggest setting `num_streams` to a divisor of the number of cores inside `cpu_pool`.
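For readers skimming this commit, below is a minimal usage sketch of the multi-stream flow described in the documentation hunk above. It assumes the `intel_extension_for_pytorch.cpu.runtime` frontend the doc refers to (`CPUPool`, `MultiStreamModule`); the model, input shape, and core IDs are purely illustrative.

```python
import torch
import intel_extension_for_pytorch as ipex

# Illustrative eager-mode model; converting it to TorchScript also avoids
# the GIL warning this commit adds in multi_stream.py.
model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.ReLU()).eval()
example_input = torch.randn(16, 64)
traced_model = torch.jit.freeze(torch.jit.trace(model, example_input).eval())

# Bind the streams to a set of physical cores (core IDs are illustrative).
cpu_pool = ipex.cpu.runtime.CPUPool(core_ids=list(range(8)))

# "AUTO" lets the runtime choose the number of streams heuristically;
# pass an integer (e.g. num_streams=4) when tuning manually.
multi_stream_model = ipex.cpu.runtime.MultiStreamModule(
    traced_model, num_streams="AUTO", cpu_pool=cpu_pool)

# Run it like a normal module: the batch is split along dim 0 across the
# streams and the per-stream outputs are concatenated back together.
output = multi_stream_model(example_input)
```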

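The core-allocation rule in the last paragraph of the changed section can be illustrated with a small stand-alone sketch; `split_cores` is a hypothetical helper written here only to show the arithmetic, not a function from the extension.

```python
# 14 cores across 4 streams: 14 = 4 * 3 + 2, so the first 2 streams get
# one extra core (4 cores each) and the remaining 2 streams get 3 cores each.
def split_cores(core_ids, num_streams):
    base, extra = divmod(len(core_ids), num_streams)
    groups, start = [], 0
    for stream_id in range(num_streams):
        size = base + (1 if stream_id < extra else 0)
        groups.append(core_ids[start:start + size])
        start += size
    return groups

print(split_cores(list(range(14)), 4))
# [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10], [11, 12, 13]]
```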
intel_extension_for_pytorch/cpu/runtime/multi_stream.py

Lines changed: 5 additions & 1 deletion
@@ -5,6 +5,7 @@
 from .cpupool import CPUPool
 from .task import Task
 import copy
+import warnings

 class MultiStreamModuleHint(object):
     def __init__(self, *args, **kwargs):
@@ -91,6 +92,9 @@ def __init__(self,
                  output_concat_hint: MultiStreamModuleHint = default_multi_stream_module_concat_hint):
         super(MultiStreamModule, self).__init__()
         assert type(cpu_pool) is CPUPool, "Input of cpu_pool must be provided with type of ipex.cpu.runtime.CPUPool"
+        if not isinstance(model, torch.jit.ScriptModule):
+            warnings.warn("Creating MultiStreamModule on an nn.Module. This can be slow due "
+                          "to Python Global Interpreter Lock (GIL). Suggest to use JIT ScriptModule for better performance.")
         self.core_list = cpu_pool.core_ids
         if isinstance(num_streams, str):
             # For str input of num_streams, it must be "auto"
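The warning added in the hunk above exists because an eager-mode `nn.Module` executed from several worker threads keeps re-entering the Python interpreter and serializes on the GIL, while a TorchScript module largely runs outside it. A minimal way to satisfy the `isinstance(model, torch.jit.ScriptModule)` check (the module below is illustrative):

```python
import torch

class Net(torch.nn.Module):
    def forward(self, x):
        return torch.relu(x)

# Scripting (or tracing) the module produces a torch.jit.ScriptModule,
# so the warning introduced above is not emitted when wrapping it.
scripted_model = torch.jit.script(Net().eval())
```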
@@ -215,7 +219,7 @@ def _do_get_input_for_each_stream(self, hint_object, input_object, stream_input_
             self.init_forward_status(input_object[idx_or_key].size(hint_object[idx_or_key]), stream_id)
             # Get the split input for each stream
             # Here we assume split along the outside dim, otherwise memory copy happens and obviously hurt multi stream module's performance.
-            if hint_object[idx_or_key] is 0:
+            if hint_object[idx_or_key] == 0:
                 # Split along dim 0, the slice will not create new tensor
                 stream_input_object[idx_or_key] = input_object[idx_or_key][self.current_split_start_idx:self.current_split_end_idx]
             else:
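On the last hunk: `is` tests object identity rather than numeric equality, so `hint_object[idx_or_key] is 0` only works when CPython happens to reuse the cached small-integer object, and Python 3.8+ flags the pattern with a `SyntaxWarning`. A quick stand-alone illustration (values are arbitrary):

```python
x = 1000
y = int("1000")     # built at runtime, so it is a distinct int object
print(x == y)       # True: value equality, which the split logic needs
print(x is y)       # usually False in CPython: identity, not equality
```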
