add gomp envs (#1572)

jingxu10 · web-flow · commit 48b4eeaf2e4f · 2023-04-25T07:57:12.000+09:00
diff --git a/docs/tutorials/performance_tuning/launch_script.md b/docs/tutorials/performance_tuning/launch_script.md
@@ -25,7 +25,7 @@ Available option settings (knobs) are listed below:
 | `-h`, `--help` | - | - | show this help message and exit |
 | `-m`, `--module` | - | False | Changes each process to interpret the launch script  as a python module, executing with the same behavior as 'python -m'. |
 | `--no-python` | - | False | Avoid applying `python` to execute `program`. |
-| `--log-path` | str | '' | The log file directory. Setting it to empty ('') disables logging to files. |
+| `--log-dir` | str | '' | The log file directory. Setting it to empty ('') disables logging to files. |
 | `--log-file-prefix` | str | 'run' | log file name prefix |
 
 Launcher Common Arguments:
@@ -78,7 +78,7 @@ Distributed Training Arguments With oneCCL backend:
 
 The *launch* script respects existing environment variables when it gets launched, except for *LD_PRELOAD*. If you have preferred values for certain environment variables, you can set them before running the *launch* script. The Intel OpenMP library uses the environment variable *KMP_AFFINITY* to control its behavior. Different settings result in different performance numbers. By default, if you enable the Intel OpenMP library, the *launch* script will set *KMP_AFFINITY* to `granularity=fine,compact,1,0`. If you want to try other values, you can use the `export` command on Linux to set *KMP_AFFINITY* before you run the *launch* script. In this case, the script will not set the default value but will take the existing value of *KMP_AFFINITY* and print a message to stdout.
 
-Execution via the *launch* script can dump logs into files under a designated log directory so you can do some investigations afterward. By default, it is disabled to avoid undesired log files. You can enable logging by setting knob `--log-path` to be:
+Execution via the *launch* script can dump logs into files under a designated log directory so you can do some investigations afterward. By default, it is disabled to avoid undesired log files. You can enable logging by setting knob `--log-dir` to be:
 
 - directory to store log files. It can be an absolute path or relative path.
 - types of log files to generate. One file (`<prefix>_timestamp_instances.log`) contains the command and information from when the script was launched. Another type of file (`<prefix>_timestamp_instance_#_core#-core#....log`) contains the stdout output of each instance.
@@ -119,7 +119,7 @@ __Note:__ GIF files below illustrate CPU usage ONLY. Do NOT infer performance nu
 #### I. Use all physical cores
 
 ```
-ipexrun --log-path ./logs resnet50.py
+ipexrun --log-dir ./logs resnet50.py
 ```
 
 CPU usage is shown as below. 1 main worker thread was launched, then it launched physical core number of threads on all physical cores.
@@ -153,7 +153,7 @@ $ cat logs/run_20210712212258_instances.log
 #### II. Use all cores including logical cores
 
 ```
-ipexrun --use-logical-core --log-path ./logs resnet50.py
+ipexrun --use-logical-core --log-dir ./logs resnet50.py
 ```
 
 CPU usage is shown as below. 1 main worker thread was launched, then it launched threads on all cores, including logical cores.
@@ -187,7 +187,7 @@ $ cat logs/run_20210712223308_instances.log
 #### III. Use physical cores on designated nodes
 
 ```
-ipexrun --nodes-list 1 --log-path ./logs resnet50.py
+ipexrun --nodes-list 1 --log-dir ./logs resnet50.py
 ```
 
 CPU usage is shown as below. 1 main worker thread was launched, then it launched threads on all other cores on the same numa node.
@@ -221,7 +221,7 @@ $ cat logs/run_20210712214504_instances.log
 #### IV. Use your designated number of cores
 
 ```
-ipexrun --ninstances 1 --ncores-per-instance 10 --log-path ./logs resnet50.py
+ipexrun --ninstances 1 --ncores-per-instance 10 --log-dir ./logs resnet50.py
 ```
 
 CPU usage is shown as below. 1 main worker thread was launched, then it launched threads on other 9 physical cores.
@@ -254,7 +254,7 @@ $ cat logs/run_20210712220928_instances.log
 You can also specify the cores to be utilized using `--cores-list` argument. For example, if core id 11-20 are desired instead of the first 10 cores, the launch command would be as below.
 
 ```
-ipexrun --ncores-per-instance 10 --cores-list "11-20" --log-path ./logs resnet50.py
+ipexrun --ncores-per-instance 10 --cores-list "11-20" --log-dir ./logs resnet50.py
 ```
 
 Please note that when specifying `--cores-list`, a corresponding `--ncores-per-instance` argument is required for instance number deduction.
@@ -286,7 +286,7 @@ $ cat logs/run_20210712221615_instances.log
 #### V. Throughput mode
 
 ```
-ipexrun --throughput-mode --log-path ./logs resnet50.py
+ipexrun --throughput-mode --log-dir ./logs resnet50.py
 ```
 
 CPU usage is shown as below. 2 main worker threads were launched on 2 numa nodes respectively, then they launched threads on other physical cores.
@@ -321,7 +321,7 @@ $ cat logs/run_20210712221150_instances.log
 #### VI. Latency mode
 
 ```
-ipexrun --latency-mode --log-path ./logs resnet50.py
+ipexrun --latency-mode --log-dir ./logs resnet50.py
 ```
 
 CPU usage is shown as below. 4 cores are used for each instance.
@@ -375,7 +375,7 @@ $ cat logs/run_20210712221415_instances.log
 #### VII. Your designated number of instances
 
 ```
-ipexrun --ninstances 4 --log-path ./logs resnet50.py
+ipexrun --ninstances 4 --log-dir ./logs resnet50.py
 ```
 
 CPU usage is shown as below. 4 main worker threads were launched, then they launched threads on all other physical cores.
@@ -416,7 +416,7 @@ $ cat logs/run_20210712221305_instances.log
 Launcher by default runs all `ninstances` for multi-instance inference/training as shown above. You can specify `instance_idx` to independently run that instance only among `ninstances`
 
 ```
-ipexrun --ninstances 4 --instance-idx 0 --log-path ./logs resnet50.py
+ipexrun --ninstances 4 --instance-idx 0 --log-dir ./logs resnet50.py
 ```
 
 you can confirm usage in log file:
@@ -431,7 +431,7 @@ you can confirm usage in log file:
 ```
 
 ```
-ipexrun --ninstances 4 --instance-idx 1 --log-path ./logs resnet50.py
+ipexrun --ninstances 4 --instance-idx 1 --log-dir ./logs resnet50.py
 ```
 
 you can confirm usage in log file:
@@ -454,7 +454,7 @@ Memory allocator influences performance sometime. If users do not designate desi
 __Note:__ You can set your favorite value to *MALLOC_CONF* before running the *launch* script if you do not want to use its default setting.
 
 ```
-ipexrun --memory-allocator jemalloc --log-path ./logs resnet50.py
+ipexrun --memory-allocator jemalloc --log-dir ./logs resnet50.py
 ```
 
 you can confirm usage in log file:
@@ -474,7 +474,7 @@ you can confirm usage in log file:
 #### TCMalloc
 
 ```
-ipexrun --memory-allocator tcmalloc --log-path ./logs resnet50.py
+ipexrun --memory-allocator tcmalloc --log-dir ./logs resnet50.py
 ```
 
 you can confirm usage in log file:
@@ -493,7 +493,7 @@ you can confirm usage in log file:
 #### Default memory allocator
 
 ```
-ipexrun --memory-allocator default --log-path ./logs resnet50.py
+ipexrun --memory-allocator default --log-dir ./logs resnet50.py
 ```
 
 you can confirm usage in log file:
@@ -516,16 +516,18 @@ Generally, Intel OpenMP library brings better performance. Thus, in the *launch*
 
 #### GNU OpenMP Library
 
-It is, however, not always that Intel OpenMP library brings better performance comparing to GNU OpenMP library. In this case, you can use knob `--disable_iomp` to switch active OpenMP library to the GNU one.
+However, the Intel OpenMP library does not always bring better performance compared to the GNU OpenMP library. In this case, you can use the knob `--omp-runtime default` to switch the active OpenMP library to the GNU one. The GNU OpenMP specific environment variables, *OMP_SCHEDULE* and *OMP_PROC_BIND*, for setting CPU affinity are set automatically.
 
 ```
-ipexrun --omp-runtime default --log-path ./logs resnet50.py
+ipexrun --omp-runtime default --log-dir ./logs resnet50.py
 ```
 
 you can confirm usage in log file:
 
 ```
 2021-07-13 15:25:00,760 - __main__ - WARNING - Both TCMalloc and JeMalloc are not found in $CONDA_PREFIX/lib or $VIRTUAL_ENV/lib or /.local/lib/ or /usr/local/lib/ or /usr/local/lib64/ or /usr/lib or /usr/lib64 or /home/<user>/.local/lib/ so the LD_PRELOAD environment variable will not be set. This may drop the performance
+2021-07-13 15:25:00,761 - __main__ - INFO - OMP_SCHEDULE=STATIC
+2021-07-13 15:25:00,761 - __main__ - INFO - OMP_PROC_BIND=CLOSE
 2021-07-13 15:25:00,761 - __main__ - INFO - OMP_NUM_THREADS=44
 2021-07-13 15:25:00,761 - __main__ - WARNING - Numa Aware: cores:['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43'] on different NUMA nodes
 2021-07-13 15:25:00,761 - __main__ - INFO - numactl -C 0-43 <VIRTUAL_ENV>/bin/python resnet50.py 2>&1 | tee ./logs/run_20210713152500_instance_0_cores_0-43.log
diff --git a/intel_extension_for_pytorch/cpu/launch/launcher_base.py b/intel_extension_for_pytorch/cpu/launch/launcher_base.py
@@ -192,6 +192,7 @@ def set_memory_allocator(self, memory_allocator='auto', benchmark=False, skip_li
                 self.add_env('MALLOC_CONF', 'oversize_threshold:1,background_thread:false,metadata_thp:always,dirty_decay_ms:-1,muzzy_decay_ms:-1')
             else:
                 self.add_env('MALLOC_CONF', 'oversize_threshold:1,background_thread:true,metadata_thp:auto')
+        return ma_local
 
     def set_omp_runtime(self, omp_runtime='auto', set_kmp_affinity=True):
         '''
@@ -203,6 +204,10 @@ def set_omp_runtime(self, omp_runtime='auto', set_kmp_affinity=True):
             if set_kmp_affinity:
                 self.add_env('KMP_AFFINITY', 'granularity=fine,compact,1,0')
             self.add_env('KMP_BLOCKTIME', '1')
+        elif omp_local == 'default':
+            self.add_env('OMP_SCHEDULE', 'STATIC')
+            self.add_env('OMP_PROC_BIND', 'CLOSE')
+        return omp_local
 
     def parse_list_argument(self, txt):
         ret = []
diff --git a/intel_extension_for_pytorch/cpu/launch/launcher_distributed.py b/intel_extension_for_pytorch/cpu/launch/launcher_distributed.py
@@ -188,7 +188,6 @@ def launch(self, args):
         for k,v in self.environ_set.items():
             self.verbose('info', f'env: {k}={v}')
 
-        os.environ['LAUNCH_CMD'] = '#'
         cmd = ['mpiexec.hydra']
         genvs = [f'-genv {k}={v}' for k,v in self.environ_set.items()]
         mpi_config = f"-l -np {args.nnodes * args.nprocs_per_node} -ppn {args.nprocs_per_node} {' '.join(genvs)} "
@@ -228,8 +227,6 @@ def launch(self, args):
                         self.verbose('warning', f'Failed to detect rank id from log file {log_name} at line "{line.strip()}".')
             for fn in log_fns:
                 fn.close()
-        os.environ['LAUNCH_CMD'] += f'{" ".join(cmd)},#'
-        os.environ['LAUNCH_CMD'] = os.environ['LAUNCH_CMD'][:-2]
 
 if __name__ == '__main__':
     pass
diff --git a/intel_extension_for_pytorch/cpu/launch/launcher_multi_instances.py b/intel_extension_for_pytorch/cpu/launch/launcher_multi_instances.py
@@ -100,9 +100,10 @@ def set_multi_task_manager(self, multi_task_manager='auto', skip_list=[]):
         tm_local = self.set_lib_bin_from_list(multi_task_manager, tm_bin_name, 'multi-task manager', self.tm_supported, self.is_command_available, skip_list)
         return tm_local
 
-    def execution_command_builder(self, args, task_mgr, cpu_pools, index):
-        cmd = []
+    def execution_command_builder(self, args, omp_runtime, task_mgr, environ, cpu_pools, index):
         assert index > -1 and index <= len(cpu_pools), 'Designated instance index for constructing execution commands is out of range.'
+        cmd = []
+        environ_local = environ
         pool = cpu_pools[index]
         pool_txt = pool.get_pool_txt()
         cores_list_local = pool_txt['cores']
@@ -116,6 +117,19 @@ def execution_command_builder(self, args, task_mgr, cpu_pools, index):
                 params = f'-c {cores_list_local}'
             cmd.append(task_mgr)
             cmd.extend(params.split())
+        else:
+            k = ''
+            v = ''
+            if omp_runtime == 'default':
+                k = 'GOMP_CPU_AFFINITY'
+                v = cores_list_local
+            elif omp_runtime == 'intel':
+                k = 'KMP_AFFINITY'
+                v = f'granularity=fine,proclist=[{cores_list_local}],explicit'
+            if k != '':
+                self.verbose('info', '==========')
+                self.verbose('info', f'env: {k}={v}')
+                environ_local[k] = v
 
         if not args.no_python:
             cmd.append(sys.executable)
@@ -126,14 +140,13 @@ def execution_command_builder(self, args, task_mgr, cpu_pools, index):
         log_name = f'{args.log_file_prefix}_instance_{index}_cores_{cores_list_local.replace(",", "_")}.log'
         log_name = os.path.join(args.log_dir, log_name)
         cmd.extend(args.program_args)
-        os.environ['LAUNCH_CMD'] += '{" ".join(cmd)},#'
         cmd_s = ' '.join(cmd)
         if args.log_dir:
             cmd_s = f'{cmd_s} 2>&1 | tee {log_name}'
         self.verbose('info', f'cmd: {cmd_s}')
         if len(set([c.node for c in pool])) > 1:
             self.verbose('warning', f'Cross NUMA nodes execution detected: cores [{cores_list_local}] are on different NUMA nodes [{nodes_list_local}]')
-        process = subprocess.Popen(cmd_s, env=os.environ, shell=True)
+        process = subprocess.Popen(cmd_s, env=environ_local, shell=True)
         return {'process': process, 'cmd': cmd_s}
 
     def launch(self, args):
@@ -177,7 +190,7 @@ def launch(self, args):
             set_kmp_affinity = False
 
         self.set_memory_allocator(args.memory_allocator, args.benchmark)
-        self.set_omp_runtime(args.omp_runtime, set_kmp_affinity)
+        omp_runtime = self.set_omp_runtime(args.omp_runtime, set_kmp_affinity)
         self.add_env('OMP_NUM_THREADS', str(args.ncores_per_instance))
 
         skip_list = []
@@ -187,8 +200,12 @@ def launch(self, args):
 
         # Set environment variables for multi-instance execution
         for k,v in self.environ_set.items():
+            if task_mgr == self.tm_supported[1]:
+                if omp_runtime == 'default' and k == 'GOMP_CPU_AFFINITY':
+                    continue
+                if omp_runtime == 'intel' and k == 'KMP_AFFINITY':
+                    continue
             self.verbose('info', f'env: {k}={v}')
-            os.environ[k] = v
 
         if args.auto_ipex:
             args.program = auto_ipex.apply_monkey_patch(args.program, args.dtype, args.auto_ipex_verbose, args.disable_ipex_graph_mode)
@@ -203,15 +220,15 @@ def launch(self, args):
         instance_idx = list(set(instance_idx))
         assert set(instance_idx).issubset(set(instances_available)), f'Designated nodes list contains invalid nodes.'
         processes = []
-        os.environ["LAUNCH_CMD"] = "#"
         for i in instance_idx:
             process = self.execution_command_builder(
                     args = args,
+                    omp_runtime = omp_runtime,
                     task_mgr = task_mgr,
+                    environ = self.environ_set,
                     cpu_pools = self.cpuinfo.pools_ondemand,
                     index = i)
             processes.append(process)
-        os.environ["LAUNCH_CMD"] = os.environ["LAUNCH_CMD"][:-2]
         try:
             for process in processes:
                 p = process['process']