Commit c2794b1

modify Batch NMS's output to make it jit traceable (#123)

* enable jit trace of NMS for different batch sizes
* change the UT test_batch_nms_result's output check to be compatible with the new NMS output
* add an NMS jit trace test case
* clean up formatting
1 parent 0ba122e commit c2794b1
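Why the output format matters: torch.jit.trace records one graph with a fixed output structure, so the previous return value (a Python-level list of per-image tuples whose length equals the batch size) could only replay correctly for the batch size used during tracing. A fixed 4-tuple of tensors, with per-image detection counts carried in the 4th tensor, keeps the output structure independent of BS. Below is a minimal, hypothetical stand-in (FlatNMSOutput is made up for illustration, not the real kernel) showing the traceable output shape:

    import torch
    import torch.nn as nn

    class FlatNMSOutput(nn.Module):
        """Toy stand-in for batch_score_nms: a fixed 4-tuple of tensors."""
        def forward(self, boxes, scores):
            # Concatenated results plus a per-image length tensor: the tuple
            # arity never depends on the batch size, unlike a list of
            # per-image tuples, whose length the tracer would freeze.
            keep_boxes = boxes.reshape(-1, 4)
            keep_scores = scores.reshape(-1)
            keep_labels = torch.zeros_like(keep_scores, dtype=torch.long)
            lengths = torch.tensor([boxes.size(1)] * boxes.size(0), dtype=torch.int32)
            return keep_boxes, keep_labels, keep_scores, lengths

    traced = torch.jit.trace(FlatNMSOutput(),
                             (torch.randn(1, 8, 4), torch.randn(1, 8)))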


3 files changed: +98 −32 lines


tests/cpu/test_nms.py

Lines changed: 57 additions & 1 deletion
@@ -137,7 +137,63 @@ def test_batch_nms_result(self):
             bbox = bbox.squeeze(0)
             prob = prob.squeeze(0)
             output.append(self.decode_single(bbox, prob, criteria, max_output))
-        output2 = batch_score_nms(bboxes_clone, probs_clone, criteria, max_output)
+        output2_raw = batch_score_nms(bboxes_clone, probs_clone, criteria, max_output)
+
+        # Re-assemble the result
+        output2 = []
+        idx = 0
+        for i in range(output2_raw[3].size(0)):
+            output2.append((output2_raw[0][idx:idx+output2_raw[3][i]],
+                            output2_raw[1][idx:idx+output2_raw[3][i]],
+                            output2_raw[2][idx:idx+output2_raw[3][i]]))
+            idx += output2_raw[3][i]
+
+        for i in range(batch_size):
+            loc, label, prob = [r for r in output[i]]
+            loc2, label2, prob2 = [r for r in output2[i]]
+            self.assertTrue(torch.allclose(loc, loc2, rtol=1e-4, atol=1e-4))
+            self.assertEqual(label, label2)
+            self.assertTrue(torch.allclose(prob, prob2, rtol=1e-4, atol=1e-4))
+
+    def test_jit_trace_batch_nms(self):
+        class Batch_NMS(nn.Module):
+            def __init__(self, criteria, max_output):
+                super(Batch_NMS, self).__init__()
+                self.criteria = criteria
+                self.max_output = max_output
+            def forward(self, bboxes_clone, probs_clone):
+                return batch_score_nms(bboxes_clone, probs_clone, self.criteria, self.max_output)
+        batch_size = 1
+        number_boxes = 15130
+        scale_xy = 0.1
+        scale_wh = 0.2
+        criteria = 0.50
+        max_output = 200
+        predicted_loc = torch.load(os.path.join(os.path.dirname(__file__), "data/nms_ploc.pt"))  # sizes: [1, 15130, 4]
+        predicted_score = torch.load(os.path.join(os.path.dirname(__file__), "data/nms_plabel.pt"))  # sizes: [1, 15130, 81]
+        dboxes_xywh = torch.load(os.path.join(os.path.dirname(__file__), "data/nms_dboxes_xywh.pt"))
+        bboxes, probs = parallel_scale_back_batch(predicted_loc, predicted_score, dboxes_xywh, scale_xy, scale_wh)
+        bboxes_clone = bboxes.clone()
+        probs_clone = probs.clone()
+
+        output = []
+        for bbox, prob in zip(bboxes.split(1, 0), probs.split(1, 0)):
+            bbox = bbox.squeeze(0)
+            prob = prob.squeeze(0)
+            output.append(self.decode_single(bbox, prob, criteria, max_output))
+
+        batch_score_nms_module = Batch_NMS(criteria, max_output)
+        model_decode = torch.jit.trace(batch_score_nms_module, (bboxes_clone, probs_clone))
+        output2_raw = model_decode(bboxes_clone, probs_clone)
+
+        # Re-assemble the result
+        output2 = []
+        idx = 0
+        for i in range(output2_raw[3].size(0)):
+            output2.append((output2_raw[0][idx:idx+output2_raw[3][i]],
+                            output2_raw[1][idx:idx+output2_raw[3][i]],
+                            output2_raw[2][idx:idx+output2_raw[3][i]]))
+            idx += output2_raw[3][i]
 
         for i in range(batch_size):
             loc, label, prob = [r for r in output[i]]
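The re-assembly loop above recovers the old per-image tuples by slicing the three flat tensors with the counts in output2_raw[3]. An equivalent formulation (a sketch, not part of this commit) uses Tensor.split:

    bboxes_flat, labels_flat, scores_flat, lengths = output2_raw
    sizes = [int(n) for n in lengths]  # per-image detection counts
    output2 = list(zip(bboxes_flat.split(sizes),
                       labels_flat.split(sizes),
                       scores_flat.split(sizes)))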

torch_ipex/csrc/cpu/ExtendOPs.h

Lines changed: 14 additions & 10 deletions
@@ -62,21 +62,25 @@ class AtenIpexTypeExt {
   /// \brief Perform batch non-maximum suppression.
   ///
   /// C++ version of Encoder::decode_single.
-  /// Refer to https://github.com/mlcommons/inference/blob/v0.7/others/cloud/single_stage_detector/pytorch/utils.py.
+  /// Refer to
+  /// https://github.com/mlcommons/inference/blob/v0.7/others/cloud/single_stage_detector/pytorch/utils.py.
   ///
-  /// \param dets: predicted loc in ltrb format, size [BS, number_boxes, 4], for example: [1, 15130, 4].
-  /// \param scores: predicted score, size [BS, number_boxes, class_number], for example: [1, 15130, 81].
-  /// \param threshold: IOU threshold(scalar) to suppress bboxs which has the IOU val larger than the threshold.
-  /// \param max_output: the max number of output bbox.
+  /// \param dets: predicted loc in ltrb format, size [BS, number_boxes, 4], for
+  /// example: [1, 15130, 4]. \param scores: predicted score, size [BS,
+  /// number_boxes, class_number], for example: [1, 15130, 81]. \param
+  /// threshold: IOU threshold (scalar) to suppress bboxes whose IOU is larger
+  /// than the threshold. \param max_output: the max number of output
+  /// bboxes.
   ///
-  /// \return result is a list of tuple. In each tuple, there are 3 tensors:
+  /// \return result is a tuple of 4 tensors; the first 3 are concatenated
+  /// across the images of the batch and the 4th records per-image counts:
   /// bboxes_out_: the selected out bboxes coordinate, size [max_output, 4].
   /// labels_out_: the label of each selected out bboxes, size [max_output].
   /// scores_out_: the score of each selected out bboxes, size [max_output].
-  static std::vector<std::tuple<at::Tensor, at::Tensor, at::Tensor>> batch_score_nms(const at::Tensor& dets,
-                                                                                     const at::Tensor& scores,
-                                                                                     const double threshold,
-                                                                                     const int64_t max_output);
+  /// length_out_: the number of detected bboxes per image, size [BS].
+  static std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
+  batch_score_nms(const at::Tensor &dets, const at::Tensor &scores,
+                  const double threshold, const int64_t max_output);
 
   /// \brief Perform batch non-maximum suppression (NMS) for MaskRCNN RPN part.
   ///
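For reference, the shape contract implied by the new signature can be checked from Python (an illustrative sketch, with dets and scores as described in the doc comment above):

    bboxes_out, labels_out, scores_out, length_out = batch_score_nms(dets, scores, 0.5, 200)
    assert length_out.numel() == dets.size(0)           # one count per image
    assert bboxes_out.size(0) == int(length_out.sum())  # rows concatenated across images
    assert labels_out.size(0) == scores_out.size(0) == bboxes_out.size(0)
    assert bboxes_out.size(1) == 4                      # ltrb coordinates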

torch_ipex/csrc/cpu/nms.cpp

Lines changed: 27 additions & 21 deletions
@@ -292,9 +292,10 @@ std::vector<at::Tensor> remove_empty(std::vector<at::Tensor>& candidate, int64_t
 }
 
 template <typename scalar_t>
-std::vector<std::tuple<at::Tensor, at::Tensor, at::Tensor>> batch_score_nms_kernel(const at::Tensor& batch_dets,
-                                                                                   const at::Tensor& batch_scores,
-                                                                                   const float threshold, const int max_output=200) {
+std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
+batch_score_nms_kernel(const at::Tensor &batch_dets,
+                       const at::Tensor &batch_scores, const float threshold,
+                       const int max_output = 200) {
   // Reference to: https://github.com/mlcommons/inference/blob/0f096a18083c3fd529c1fbf97ebda7bc3f1fda70/others/cloud/single_stage_detector/pytorch/utils.py#L163
   // batch_dets: (batchsize, num_bbox, 4) For example: batch_dets: (1, 15130, 4)
   // batch_scores: (batchsize, num_bbox, label_num) For example: batch_scores: (1, 15130, 81)
@@ -351,7 +352,10 @@ std::vector<std::tuple<at::Tensor, at::Tensor, at::Tensor>> batch_score_nms_kern
     labels_out[index] = at::empty({keep.sizes()}).fill_(i);
   }
 
-  std::vector<std::tuple<at::Tensor, at::Tensor, at::Tensor>> output(nbatch);
+  std::vector<at::Tensor> output_bboxes_(nbatch);
+  std::vector<at::Tensor> output_labels_(nbatch);
+  std::vector<at::Tensor> output_scores_(nbatch);
+  std::vector<at::Tensor> output_length_(nbatch);
 #ifdef _OPENMP
 #if (_OPENMP >= 201307)
 # pragma omp parallel for simd schedule(static) if (omp_get_max_threads() > 1 && !omp_in_parallel())
@@ -372,11 +376,14 @@ std::vector<std::tuple<at::Tensor, at::Tensor, at::Tensor>> batch_score_nms_kern
     std::tuple<at::Tensor, at::Tensor> sort_result = scores_out_.sort(0);
     at::Tensor max_ids = std::get<1>(sort_result);
     max_ids = max_ids.slice(/*dim*/0, /*start*/std::max(max_ids.size(0) - max_output, static_cast<int64_t>(0)), /*end*/max_ids.size(0));
-    output[bs] = std::tuple<at::Tensor, at::Tensor, at::Tensor>(bboxes_out_.index_select(/*dim*/0, /*index*/max_ids),
-                                                                labels_out_.index_select(/*dim*/0, /*index*/max_ids),
-                                                                scores_out_.index_select(/*dim*/0, /*index*/max_ids));
+    output_bboxes_[bs] = bboxes_out_.index_select(/*dim*/ 0, /*index*/ max_ids);
+    output_labels_[bs] = labels_out_.index_select(/*dim*/ 0, /*index*/ max_ids);
+    output_scores_[bs] = scores_out_.index_select(/*dim*/ 0, /*index*/ max_ids);
+    output_length_[bs] = torch::tensor(max_ids.size(0), {torch::kInt32});
   }
-  return output;
+  return std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>(
+      at::cat(output_bboxes_), at::cat(output_labels_), at::cat(output_scores_),
+      at::stack(output_length_));
 }
 
 template <typename scalar_t>
@@ -526,11 +533,10 @@ at::Tensor nms_cpu(const at::Tensor& dets,
   return result;
 }
 
-std::vector<std::tuple<at::Tensor, at::Tensor, at::Tensor>> batch_score_nms_cpu(const at::Tensor& dets,
-                                                                                const at::Tensor& scores,
-                                                                                const float threshold,
-                                                                                const int max_output) {
-  std::vector<std::tuple<at::Tensor, at::Tensor, at::Tensor>> result;
+std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
+batch_score_nms_cpu(const at::Tensor &dets, const at::Tensor &scores,
+                    const float threshold, const int max_output) {
+  std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor> result;
   AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "batch_score_nms", [&] {
     result = batch_score_nms_kernel<scalar_t>(dets, scores, threshold, max_output);
   });
@@ -581,10 +587,11 @@ at::Tensor AtenIpexTypeExt::nms(const at::Tensor& dets,
   return result;
 }
 
-std::vector<std::tuple<at::Tensor, at::Tensor, at::Tensor>> AtenIpexTypeExt::batch_score_nms(const at::Tensor& dets,
-                                                                                             const at::Tensor& scores,
-                                                                                             const double threshold,
-                                                                                             const int64_t max_output) {
+std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
+AtenIpexTypeExt::batch_score_nms(const at::Tensor &dets,
+                                 const at::Tensor &scores,
+                                 const double threshold,
+                                 const int64_t max_output) {
 #if defined(IPEX_DISP_OP)
   printf("IpexExternal::batch_score_nms\n");
 #endif
@@ -758,10 +765,9 @@ at::Tensor nms(const at::Tensor& dets,
   return op.call(cpu_cached_cast(at::kFloat, dets), cpu_cached_cast(at::kFloat, scores), threshold, sorted);
 }
 
-std::vector<std::tuple<at::Tensor, at::Tensor, at::Tensor>> batch_score_nms(const at::Tensor& dets,
-                                                                            const at::Tensor& scores,
-                                                                            const double threshold,
-                                                                            const int64_t max_output) {
+std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
+batch_score_nms(const at::Tensor &dets, const at::Tensor &scores,
+                const double threshold, const int64_t max_output) {
   c10::impl::ExcludeDispatchKeyGuard no_autocastCPU(DispatchKey::AutocastCPU);
   static auto op = torch::Dispatcher::singleton()
                        .findSchemaOrThrow("torch_ipex::batch_score_nms", "")
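Note the asymmetry in the kernel's final assembly: the per-box tensors are joined with at::cat along dim 0, while the per-image lengths, which torch::tensor creates as 0-dim tensors here, are joined with at::stack, since cat cannot concatenate 0-dim tensors. In Python terms (illustrative only, with made-up counts):

    import torch
    lengths = [torch.tensor(200, dtype=torch.int32),
               torch.tensor(187, dtype=torch.int32)]  # 0-dim per-image counts
    boxes = [torch.randn(200, 4), torch.randn(187, 4)]
    flat_boxes = torch.cat(boxes)      # shape [387, 4]
    length_out = torch.stack(lengths)  # shape [2]; torch.cat(lengths) would error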
