Skip to content

Commit 76dd768

Browse files
authored
adapt launch script for e-core and p-core arch (#1559)
1 parent 5159d50 commit 76dd768

File tree

7 files changed

+287
-137
lines changed

7 files changed

+287
-137
lines changed

docs/tutorials/performance_tuning/launch_script.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ Launcher Common Arguments:
3434
| :-- | :--: | :--: | :-- |
3535
| `--ncores-per-instance` | int | 0 | Number of cores per instance |
3636
| `--nodes-list` | str | '' | Specify nodes list for multiple instances to run on, in format of list of single node ids "node_id,node_id,..." or list of node ranges "node_id-node_id,...". By default all nodes will be used. |
37+
| `--use-e-cores` | - | False | Also run the workloads on Efficient-Cores (E-cores). By default, only Performance-Cores (P-cores) are used. |
3738
| `--memory-allocator` | str | 'auto' | Choose which memory allocator to run the workloads with. Supported choices are ['auto', 'default', 'tcmalloc', 'jemalloc']. |
3839
| `--omp-runtime` | str | 'auto' | Choose which OpenMP runtime to run the workloads with. Supported choices are ['auto', 'default', 'intel']. |
3940

intel_extension_for_pytorch/cpu/launch/cpu_info.py

Lines changed: 117 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -8,61 +8,86 @@
88
# # The following is the parsable format, which can be fed to other
99
# # programs. Each different item in every column has an unique ID
1010
# # starting from zero.
11-
# # CPU,Core,Socket,Node
12-
# 0,0,0,0
13-
# 1,0,0,0
14-
# 2,1,0,0
15-
# 3,1,0,0
16-
# 4,2,1,1
17-
# 5,2,1,1
18-
# 6,3,1,1
19-
# 7,3,1,1
11+
# CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ
12+
# 0 0 0 0 0:0:0:0 yes 5000.0000 800.0000 2400.000
13+
# 1 0 0 0 0:0:0:0 yes 5000.0000 800.0000 2400.000
14+
# 2 0 0 1 0:0:0:0 yes 5000.0000 800.0000 2400.000
15+
# 3 0 0 1 0:0:0:0 yes 5000.0000 800.0000 2400.000
16+
# 4 1 1 2 0:0:0:0 yes 5000.0000 800.0000 2400.000
17+
# 5 1 1 2 0:0:0:0 yes 5000.0000 800.0000 2400.000
18+
# 6 1 1 3 0:0:0:0 yes 5000.0000 800.0000 2400.000
19+
# 7 1 1 3 0:0:0:0 yes 5000.0000 800.0000 2400.000
2020

21-
# 0,0,0,
22-
# 1,0,0,
23-
# 2,1,0,
24-
# 3,1,0,
25-
# 4,2,0,
26-
# 5,2,0,
27-
# 6,3,0,
28-
# 7,3,0,
21+
# CPU SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ
22+
# 0 0 0 0:0:0:0 yes 5000.0000 800.0000 2400.000
23+
# 1 0 0 0:0:0:0 yes 5000.0000 800.0000 2400.000
24+
# 2 0 1 0:0:0:0 yes 5000.0000 800.0000 2400.000
25+
# 3 0 1 0:0:0:0 yes 5000.0000 800.0000 2400.000
26+
# 4 0 2 0:0:0:0 yes 5000.0000 800.0000 2400.000
27+
# 5 0 2 0:0:0:0 yes 5000.0000 800.0000 2400.000
28+
# 6 0 3 0:0:0:0 yes 5000.0000 800.0000 2400.000
29+
# 7 0 3 0:0:0:0 yes 5000.0000 800.0000 2400.000
2930

30-
# 0,0,0,0
31-
# 1,1,0,0
32-
# 2,2,1,1
33-
# 3,3,1,1
34-
# 4,0,0,0
35-
# 5,1,0,0
36-
# 6,2,1,1
37-
# 7,3,1,1
31+
# CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ
32+
# 0 0 0 0 0:0:0:0 yes 5000.0000 800.0000 2400.000
33+
# 1 0 0 1 0:0:0:0 yes 5000.0000 800.0000 2400.000
34+
# 2 1 1 2 0:0:0:0 yes 5000.0000 800.0000 2400.000
35+
# 3 1 1 3 0:0:0:0 yes 5000.0000 800.0000 2400.000
36+
# 4 0 0 0 0:0:0:0 yes 5000.0000 800.0000 2400.000
37+
# 5 0 0 1 0:0:0:0 yes 5000.0000 800.0000 2400.000
38+
# 6 1 1 2 0:0:0:0 yes 5000.0000 800.0000 2400.000
39+
# 7 1 1 3 0:0:0:0 yes 5000.0000 800.0000 2400.000
40+
41+
# CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ
42+
# 0 0 0 0 0:0:0:0 yes 5000.0000 800.0000 2400.000
43+
# 1 0 0 0 0:0:0:0 yes 5000.0000 800.0000 2400.000
44+
# 2 0 0 1 0:0:0:0 yes 5000.0000 800.0000 2400.000
45+
# 3 0 0 1 0:0:0:0 yes 5000.0000 800.0000 2400.000
46+
# 4 0 0 2 0:0:0:0 yes 5000.0000 800.0000 2400.000
47+
# 5 0 0 2 0:0:0:0 yes 5000.0000 800.0000 2400.000
48+
# 6 0 0 3 0:0:0:0 yes 5000.0000 800.0000 2400.000
49+
# 7 0 0 3 0:0:0:0 yes 5000.0000 800.0000 2400.000
50+
# 8 0 0 4 0:0:0:0 yes 3800.0000 800.0000 2400.000
51+
# 9 0 0 5 0:0:0:0 yes 3800.0000 800.0000 2400.000
52+
# 10 0 0 6 0:0:0:0 yes 3800.0000 800.0000 2400.000
53+
# 11 0 0 7 0:0:0:0 yes 3800.0000 800.0000 2400.000
3854

3955
class CoreInfo():
    '''
    Class to store core-specific information, including:
    - [int] CPU index
    - [int] Core index
    - [int] Numa node index
    - [int] Socket index
    - [bool] is a physical core or not
    - [float] maxmhz
    - [bool] is a performance core
    '''
    def __init__(self, lscpu_txt='', headers=None):
        '''
        Create a core record, optionally filled from one row of lscpu output.

        - lscpu_txt [list|str]: One data row, either already split into
          columns or as a raw whitespace-separated line. '' leaves every
          field at its default value.
        - headers [dict]: Maps lower-case column names ('cpu', 'core',
          'socket', optionally 'node' and 'maxmhz') to their column index
          in the row. None or an empty dict skips parsing.
        '''
        self.cpu = -1
        self.core = -1
        self.socket = -1
        self.node = -1
        # The two flags below are not derived from the parsed row; they stay
        # True unless code outside this class changes them.
        self.is_physical_core = True
        self.maxmhz = 0
        self.is_p_core = True
        # `headers` defaults to None rather than {} so that no dict object is
        # shared between calls.
        if lscpu_txt != '' and headers:
            self.parse_raw(lscpu_txt, headers)

    def parse_raw(self, cols, headers):
        '''
        Fill the fields from one row of lscpu output.

        - cols [list|str]: Column values of the row. A raw text line is
          split on whitespace first, so both forms are accepted.
        - headers [dict]: Column name -> column index. 'cpu', 'core' and
          'socket' are required; 'node' and 'maxmhz' are optional.
        '''
        if isinstance(cols, str):
            # Indexing a string would pick single characters, not columns.
            cols = cols.split()
        self.cpu = int(cols[headers['cpu']])
        self.core = int(cols[headers['core']])
        self.socket = int(cols[headers['socket']])
        if 'node' in headers:
            self.node = int(cols[headers['node']])
        else:
            # No node column in the output: use the socket id as the node id.
            self.node = self.socket
        if 'maxmhz' in headers:
            self.maxmhz = float(cols[headers['maxmhz']])

    def __str__(self):
        return f'{self.cpu}\t{self.core}\t{self.socket}\t{self.node}\t{self.is_physical_core}\t{self.maxmhz}\t{self.is_p_core}'
6691

6792
class CPUPool(list):
6893
'''
@@ -109,8 +134,8 @@ def __init__(self, logger=None, lscpu_txt=''):
109134
'''
110135
Retrieve CPU information from lscpu.
111136
'''
112-
if lscpu_txt == '':
113-
args = ['lscpu', '--parse=CPU,Core,Socket,Node']
137+
if lscpu_txt.strip() == '':
138+
args = ['lscpu', '--all', '--extended']
114139
env_lang = os.getenv('LANG', 'UNSET')
115140
os.environ['LANG'] = 'C'
116141
lscpu_info = subprocess.check_output(args, env=os.environ, universal_newlines=True)
@@ -125,27 +150,49 @@ def __init__(self, logger=None, lscpu_txt=''):
125150
Filter out lines that are really useful.
126151
'''
127152
lscpu_info = lscpu_info.strip().split('\n')
153+
headers = {}
154+
num_cols = 0
128155
for line in lscpu_info:
129-
line = line.strip()
130-
if re.match('^([\d]+,[\d]+,[\d]+,[\d]?)', line):
131-
self.pool_all.append(CoreInfo(line))
156+
line = re.sub(' +', ' ', line.lower().strip())
157+
if 'cpu' in line and 'socket' in line and 'core' in line:
158+
t = line.split(' ')
159+
num_cols = len(t)
160+
for i in range(num_cols):
161+
if t[i] in ['cpu', 'core', 'socket', 'node', 'maxmhz']:
162+
headers[t[i]] = i
163+
else:
164+
t = line.split(' ')
165+
if len(t) == num_cols and t[headers['cpu']].isdigit() and t[headers['core']].isdigit() and t[headers['socket']].isdigit():
166+
self.pool_all.append(CoreInfo(t, headers))
132167
assert len(self.pool_all) > 0, 'cpuinfo is empty'
133168

134-
'''
135-
Loop through all cores and determine is_physical_core for each of them.
136-
'''
137-
phy_cores = [c.core for c in self.pool_all]
138-
phy_cores_unique = set(phy_cores)
139-
if len(phy_cores) // len(phy_cores_unique) > 1:
140-
core_cur = -1
141-
self.pool_all.sort(key=lambda x: (x.core, x.cpu))
142-
for c in self.pool_all:
143-
if core_cur != c.core:
144-
core_cur = c.core
145-
else:
146-
c.is_physical_core = False
169+
# Determine logical cores
170+
core_cur = -1
171+
self.pool_all.sort(key=lambda x: (x.core, x.cpu))
172+
for c in self.pool_all:
173+
if core_cur != c.core:
174+
core_cur = c.core
175+
else:
176+
c.is_physical_core = False
147177
self.pool_all.sort(key=lambda x: x.cpu)
148178

179+
# Determine e cores
180+
maxmhzs = list(set([c.maxmhz for c in self.pool_all]))
181+
maxmhzs.sort()
182+
mmaxmhzs = max(maxmhzs)
183+
if mmaxmhzs > 0:
184+
maxmhzs_norm = [f/mmaxmhzs for f in maxmhzs]
185+
separator_idx = -1
186+
for i in range(1, len(maxmhzs_norm)):
187+
if maxmhzs_norm[i] - maxmhzs_norm[i-1] >= 0.15:
188+
separator_idx = i
189+
break
190+
if separator_idx > -1:
191+
e_core_mhzs = maxmhzs[:separator_idx]
192+
for c in self.pool_all:
193+
if c.maxmhz in e_core_mhzs:
194+
c.is_p_core = False
195+
149196
def verbose(self, level, msg):
150197
if self.logger:
151198
logging_fn = {
@@ -162,6 +209,7 @@ def verbose(self, level, msg):
162209
- ninstances [int]: Number of instances. Should be a non negative integer, 0 by default. When it is 0, it will be set according to usage scenarios automatically in the function.
163210
- ncores_per_instance [int]: Number of cores per instance. Should be a non negative integer, 0 by default. When it is 0, it will be set according to usage scenarios automatically in the function.
164211
- use_logical_cores [bool]: Use logical cores on the workloads or not, False by default. When set to False, only physical cores are used.
212+
- use_e_cores [bool]: Use Efficient-Cores, False by default. When set to False, only Performance-Cores are used.
165213
- skip_cross_node_cores [bool]: Allow instances to be executed on cores across NUMA nodes, False by default.
166214
- nodes_list [list]: A list containing all node ids that the execution is expected to be running on.
167215
- cores_list [list]: A list containing all cpu ids that the execution is expected to be running on.
@@ -172,16 +220,19 @@ def gen_pools_ondemand(
172220
ninstances=0,
173221
ncores_per_instance=0,
174222
use_logical_cores=False,
223+
use_e_cores=False,
175224
skip_cross_node_cores=False,
176225
nodes_list=[],
177226
cores_list=[],
178227
return_mode='auto'):
179228
# Generate an aggregated CPU pool
180229
if len(cores_list) > 0:
181230
cores_available = [c.cpu for c in self.pool_all]
182-
assert set(cores_list).issubset(set(cores_available)), f'Designated cores list contains invalid cores.'
231+
assert set(cores_list).issubset(set(cores_available)), f'Designated cores list {cores_list} contains invalid cores.'
183232
if use_logical_cores:
184233
self.verbose('warning', 'Argument --use-logical-cores won\'t take effect when --cores-list is set.')
234+
if use_e_cores:
235+
self.verbose('warning', 'Argument --use-e-cores won\'t take effect when --cores-list is set.')
185236
pool = [c for c in self.pool_all if c.cpu in cores_list]
186237
nodes = list(set([c.node for c in pool]))
187238
ncores_per_node = -1
@@ -197,12 +248,17 @@ def gen_pools_ondemand(
197248
else:
198249
if len(nodes_list) > 0:
199250
nodes_available = set([c.node for c in self.pool_all])
200-
assert set(nodes_list).issubset(nodes_available), f'Designated nodes list contains invalid nodes.'
251+
assert set(nodes_list).issubset(nodes_available), f'Designated nodes list {nodes_list} contains invalid nodes out from {nodes_available}.'
201252
pool = [c for c in self.pool_all if c.node in nodes_list]
202253
else:
203254
pool = self.pool_all
204255
if not use_logical_cores:
205256
pool = [c for c in pool if c.is_physical_core]
257+
if not use_e_cores:
258+
pool = [c for c in pool if c.is_p_core]
259+
e_cores = [c.cpu for c in pool if not c.is_p_core]
260+
if len(e_cores) > 0:
261+
self.verbose('warning', f'Efficient-Cores are detected ({e_cores}). Disabled for performance consideration. You can enable them with argument --use-e-cores.')
206262

207263
# Determine ninstances and ncores_per_instance for grouping
208264
assert ncores_per_instance >= 0, 'Argument --ncores-per-instance cannot be a negative value.'
@@ -256,8 +312,10 @@ def gen_pools_ondemand(
256312
self.pools_ondemand.append(pool_local)
257313

258314
if __name__ == "__main__":
259-
pools = CPUPoolList()
260-
pools.gen_pools_ondemand(use_logical_cores=False, nodes_list=[0,1], return_mode='auto', ninstances=3, ncores_per_instance=0, skip_cross_node_cores=False)
315+
lscpu_txt = '''
316+
'''
317+
pools = CPUPoolList(lscpu_txt = lscpu_txt)
318+
pools.gen_pools_ondemand(use_logical_cores=False, return_mode='auto', ninstances=3, ncores_per_instance=0, use_e_cores=True, skip_cross_node_cores=False)
261319
print(f'capacity pool_auto: {pools.pool_all.get_pool_txt(return_mode="auto")}')
262320
print(f'capacity pool_list: {pools.pool_all.get_pool_txt(return_mode="list")}')
263321
print(f'capacity pool_range: {pools.pool_all.get_pool_txt(return_mode="range")}')

intel_extension_for_pytorch/cpu/launch/launcher_base.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,13 @@ def add_common_params(self, parser):
3939
type=str,
4040
help='Specify nodes list for multiple instances to run on, in format of list of single node ids "node_id,node_id,..." or list of node ranges "node_id-node_id,...". By default all nodes will be used.',
4141
)
42+
group.add_argument(
43+
'--use-e-cores',
44+
'--use_e_cores',
45+
action='store_true',
46+
default=False,
47+
help='Use Efficient-Cores on the workloads or not. By default, only Performance-Cores are used.',
48+
)
4249
group.add_argument(
4350
'--memory-allocator',
4451
'--memory_allocator',

intel_extension_for_pytorch/cpu/launch/launcher_distributed.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,7 @@ def launch(self, args):
162162
ninstances=args.nprocs_per_node,
163163
ncores_per_instance=ncores_per_instance,
164164
use_logical_cores=True,
165+
use_e_cores=args.use_e_cores,
165166
nodes_list=nodes_list
166167
)
167168

intel_extension_for_pytorch/cpu/launch/launcher_multi_instances.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,7 @@ def launch(self, args):
159159
ninstances = args.ninstances,
160160
ncores_per_instance = args.ncores_per_instance,
161161
use_logical_cores = args.use_logical_cores,
162+
use_e_cores = args.use_e_cores,
162163
skip_cross_node_cores = args.skip_cross_node_cores,
163164
nodes_list = nodes_list,
164165
cores_list = cores_list

0 commit comments

Comments
 (0)