
Commit c0ca047

Add the function specialization for promote with ITensorListRef (#1230)
* add the function specialization for `promote` with `ITensorListRef`
* update autocast doc
* add comments for `test_cat_promote`
1 parent: cf7859f
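For orientation, the user-visible behavior this commit targets, mirroring the new `test_cat_promote` test below: under CPU autocast, `torch.cat` uses the promote cast policy, so mixed `float32`/`bfloat16` inputs should be cast to the widest dtype. A minimal sketch, assuming `intel_extension_for_pytorch` is installed so its CPU autocast rules are registered:

```python
import torch
import intel_extension_for_pytorch  # noqa: F401  # assumed installed; registers the CPU autocast rules

a = torch.rand(24, 128, 128)                        # float32
b = torch.rand(24, 128, 128, dtype=torch.bfloat16)  # bfloat16

# cat uses the "promote" cast policy: mixed-dtype inputs are cast to the
# widest dtype before the call, so the result should be float32.
with torch.cpu.amp.autocast(cache_enabled=False, dtype=torch.bfloat16), torch.no_grad():
    c = torch.cat([a, b], 0)

print(c.dtype)  # expected: torch.float32
```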

File tree

4 files changed (+50, -3 lines)


csrc/cpu/autocast/autocast_mode.cpp

Lines changed: 1 addition & 2 deletions

@@ -127,8 +127,7 @@ struct CPU_WrapFunction_<
       return (*F)(cpu_cached_cast(at::kFloat, args)...);
     case DtypeCastPolicy::promote:
       return (*F)(cpu_cached_cast(
-          promote_type(get_autocast_dtype(), DeviceType::CPU, args...),
-          args)...);
+          promote_type(get_autocast_dtype(), args...), args)...);
     default:
       return (*F)(args...);
   }
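For readers unfamiliar with the cast policies: `DtypeCastPolicy::promote` computes the widest dtype among the op's floating-point tensor arguments (starting from the autocast dtype) and casts every argument to it before dispatching. A rough Python sketch of that flow, using hypothetical helper names that are not part of the extension's API:

```python
import torch

def widest_dtype(autocast_dtype, *tensors):
    # Hypothetical analog of promote_type/prioritize: start from the autocast
    # dtype (e.g. bfloat16) and widen to float32 if any floating-point
    # argument is already float32.
    dtype = autocast_dtype
    for t in tensors:
        if t.is_floating_point() and t.dtype == torch.float32:
            dtype = torch.float32
    return dtype

def call_with_promote(op, *tensors, autocast_dtype=torch.bfloat16):
    # Cast every floating-point argument to the widest dtype, then call the op.
    dtype = widest_dtype(autocast_dtype, *tensors)
    return op(*(t.to(dtype) if t.is_floating_point() else t for t in tensors))

out = call_with_promote(torch.add, torch.rand(4), torch.rand(4, dtype=torch.bfloat16))
print(out.dtype)  # torch.float32
```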

csrc/cpu/autocast/autocast_mode.h

Lines changed: 20 additions & 0 deletions

@@ -54,6 +54,17 @@ inline c10::optional<Tensor> cpu_cached_cast(
   }
 }
 
+inline std::vector<Tensor> cpu_cached_cast(
+    at::ScalarType to_type,
+    const at::ITensorListRef& arg) {
+  std::vector<Tensor> vec;
+  vec.reserve(arg.size());
+  for (const auto& t : arg) {
+    vec.push_back(cpu_cached_cast(to_type, t));
+  }
+  return vec;
+}
+
 inline std::vector<Tensor> cpu_cached_cast(
     at::ScalarType to_type,
     const TensorList& arg) {
@@ -137,6 +148,15 @@ inline at::ScalarType prioritize(
   return current;
 }
 
+inline at::ScalarType prioritize(
+    at::ScalarType current,
+    const at::ITensorListRef& list) {
+  for (const auto& tensor : list) {
+    current = prioritize(current, tensor);
+  }
+  return current;
+}
+
 // Template to catch non-Tensor args (no-op that returns current best guess)
 template <typename T>
 inline at::ScalarType prioritize(at::ScalarType current, T nextArg) {
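The two new overloads extend the existing Tensor/TensorList handling to `ITensorListRef`: one casts every tensor in the list, the other folds `prioritize` over the list to find the widest dtype. A Python analog, illustrative only (the actual implementation is the C++ above, and these helper names are made up):

```python
import torch

def cast_tensor_list(to_dtype, tensors):
    # Analog of the new cpu_cached_cast(ScalarType, ITensorListRef) overload:
    # cast every tensor in the list and return the results as a new list.
    return [t.to(to_dtype) if t.is_floating_point() else t for t in tensors]

def prioritize_tensor_list(current, tensors):
    # Analog of the new prioritize(ScalarType, ITensorListRef) overload:
    # widen the running dtype with each tensor in the list.
    for t in tensors:
        if t.is_floating_point() and t.dtype == torch.float32:
            current = torch.float32
    return current

inputs = [torch.rand(2, 2), torch.rand(2, 2, dtype=torch.bfloat16)]
dtype = prioritize_tensor_list(torch.bfloat16, inputs)      # -> torch.float32
print([t.dtype for t in cast_tensor_list(dtype, inputs)])   # [torch.float32, torch.float32]
```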

docs/tutorials/features/amp.md

Lines changed: 1 addition & 1 deletion

@@ -91,7 +91,7 @@ If an op is unlisted, we assume it's numerically stable in `bfloat16`. If you be
 
 #### Ops that can autocast to `float32`
 
-`conv_transpose1d`, `conv_transpose2d`, `conv_transpose3d`, `mish`, `avg_pool3d`, `binary_cross_entropy`, `grid_sampler`, `polar`, `prod`, `quantile`, `nanquantile`, `stft`, `cdist`, `trace`, `view_as_complex`, `cholesky`, `cholesky_inverse`, `cholesky_solve`, `inverse`, `lu_solve`, `orgqr`, `ormqr`, `pinverse`, `max_unpool2d`, `max_unpool3d`, `adaptive_avg_pool3d`, `reflection_pad1d`, `reflection_pad2d`, `replication_pad1d`, `replication_pad2d`, `replication_pad3d`, `mse_loss`, `cosine_embedding_loss`, `nll_loss`, `nll_loss2d`, `hinge_embedding_loss`, `poisson_nll_loss`, `smooth_l1_loss`, `cross_entropy_loss`, `l1_loss`, `huber_loss`, `margin_ranking_loss`, `soft_margin_loss`, `triplet_margin_loss`, `multi_margin_loss`, `ctc_loss`, `kl_div`, `multilabel_margin_loss`, `binary_cross_entropy_with_logits`, `fft_fft`, `fft_ifft`, `fft_fft2`, `fft_ifft2`, `fft_fftn`, `fft_ifftn`, `fft_rfft`, `fft_irfft`, `fft_rfft2`, `fft_irfft2`, `fft_rfftn`, `fft_irfftn`, `fft_hfft`, `fft_ihfft`, `linalg_matrix_norm`, `linalg_cond`, `linalg_matrix_rank`, `linalg_solve`, `linalg_cholesky`, `linalg_svdvals`, `linalg_eigvals`, `linalg_eigvalsh`, `linalg_inv`, `linalg_householder_product`, `linalg_tensorinv`, `linalg_tensorsolve`, `fake_quantize_per_tensor_affine`, `geqrf`, `_lu_with_info`, `qr`, `svd`, `symeig`, `triangular_solve`, `fractional_max_pool2d`, `fractional_max_pool3d`, `adaptive_max_pool3d`, `multilabel_margin_loss_forward`, `linalg_qr`, `linalg_cholesky_ex`, `linalg_svd`, `linalg_eig`, `linalg_eigh`, `linalg_lstsq`, `linalg_inv_ex`
+`conv_transpose1d`, `conv_transpose2d`, `conv_transpose3d`, `mish`, `avg_pool3d`, `max_pool3d`, `binary_cross_entropy`, `grid_sampler`, `polar`, `prod`, `quantile`, `nanquantile`, `stft`, `cdist`, `trace`, `view_as_complex`, `cholesky`, `cholesky_inverse`, `cholesky_solve`, `inverse`, `lu_solve`, `orgqr`, `ormqr`, `pinverse`, `max_unpool2d`, `max_unpool3d`, `adaptive_avg_pool3d`, `reflection_pad1d`, `reflection_pad2d`, `replication_pad1d`, `replication_pad2d`, `replication_pad3d`, `mse_loss`, `cosine_embedding_loss`, `nll_loss`, `nll_loss2d`, `hinge_embedding_loss`, `poisson_nll_loss`, `smooth_l1_loss`, `cross_entropy_loss`, `l1_loss`, `huber_loss`, `margin_ranking_loss`, `soft_margin_loss`, `triplet_margin_loss`, `multi_margin_loss`, `ctc_loss`, `kl_div`, `multilabel_margin_loss`, `binary_cross_entropy_with_logits`, `fft_fft`, `fft_ifft`, `fft_fft2`, `fft_ifft2`, `fft_fftn`, `fft_ifftn`, `fft_rfft`, `fft_irfft`, `fft_rfft2`, `fft_irfft2`, `fft_rfftn`, `fft_irfftn`, `fft_hfft`, `fft_ihfft`, `linalg_matrix_norm`, `linalg_cond`, `linalg_matrix_rank`, `linalg_solve`, `linalg_cholesky`, `linalg_svdvals`, `linalg_eigvals`, `linalg_eigvalsh`, `linalg_inv`, `linalg_householder_product`, `linalg_tensorinv`, `linalg_tensorsolve`, `fake_quantize_per_tensor_affine`, `geqrf`, `_lu_with_info`, `qr`, `svd`, `symeig`, `triangular_solve`, `fractional_max_pool2d`, `fractional_max_pool3d`, `adaptive_max_pool3d`, `multilabel_margin_loss_forward`, `linalg_qr`, `linalg_cholesky_ex`, `linalg_svd`, `linalg_eig`, `linalg_eigh`, `linalg_lstsq`, `linalg_inv_ex`
 
 #### Ops that promote to the widest input type
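As a quick illustration of the fp32 list above (a sketch, assuming the documented IPEX CPU autocast policy and that `intel_extension_for_pytorch` is installed): `mse_loss` is in the list, so even `bfloat16` inputs should yield a `float32` result inside the autocast region.

```python
import torch
import intel_extension_for_pytorch  # noqa: F401  # assumed installed

x = torch.rand(8, 8, dtype=torch.bfloat16)
y = torch.rand(8, 8, dtype=torch.bfloat16)

# mse_loss is in the "autocast to float32" list, so its inputs are cast up
# to float32 inside the autocast region.
with torch.cpu.amp.autocast(dtype=torch.bfloat16):
    loss = torch.nn.functional.mse_loss(x, y)

print(loss.dtype)  # expected: torch.float32 per the list above
```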

tests/cpu/test_autocast.py

Lines changed: 28 additions & 0 deletions

@@ -187,6 +187,34 @@ def test_nhwc_autocast_jit_trace_model(model, x):
                 continue
             test_nhwc_autocast_jit_trace_model(self.models[i], self.inputs[i])
 
+    # Check whether cat has done the promotion in AMP with mixed dtype inputs
+    # since input type of cat is changed to ITensorListRef
+    def test_cat_promote(self):
+        class TestModel(torch.nn.Module):
+            def __init__(self):
+                super(TestModel, self).__init__()
+
+            def forward(self, a, b):
+                return torch.cat([a, b], 0)
+        with torch.jit.fuser("none"):
+            # In this testcase, we will check whether cat has done the promotion in AMP with mixed dtype inputs.
+            # To avoid the fusion group from TE, we will disable the fuser here.
+            for jit_freeze_or_not in [False, True]:
+                test_model = TestModel().eval()
+                with torch.cpu.amp.autocast(cache_enabled=False, dtype=torch.bfloat16), torch.no_grad():
+                    a = torch.rand(24, 128, 128)
+                    b = torch.rand(24, 128, 128, dtype=torch.bfloat16)
+                    c = test_model(a, b)
+                    traced = torch.jit.trace(test_model, (a, b))
+                if jit_freeze_or_not:
+                    traced = torch.jit.freeze(traced)
+                for _ in range(3):
+                    c2 = traced(a, b)
+                self.assertTrue(c.dtype, torch.float32)
+                self.assertTrue(c2.dtype, torch.float32)
+                traced_graph = traced.graph_for(a, b)
+                self.assertTrue(any(n.kind() == "aten::to" for n in traced_graph.nodes()))
+
 class TestPyTorchOps(TestCase):
     def test_bernoulli(self):
         input = torch.rand(8, 8)
