-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprinceCPU.lua
More file actions
247 lines (205 loc) · 7.04 KB
/
princeCPU.lua
File metadata and controls
247 lines (205 loc) · 7.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
#!/bin/env lua
-- Module table; the public functions are attached to it at the end of the file.
local princeCPU = { }
local princeUtils = require "princeUtils"
local princeUsers = require "princeUsers"
local princeStakeholders = require "princeStakeholders"
-- Local shorthands for the two logging helpers provided by princeUtils.
local slurm_log = princeUtils.slurm_log
local user_log = princeUtils.user_log
-- Resource request of the job currently being examined. These are file-level
-- state shared by all functions below; setup_parameters() overwrites every one
-- of them, so the values here are only what is seen before its first call.
local cpus = 0
local memory = 0
local nodes = 0
local time_limit = princeUtils.unlimited_time
-- memory / cpus once setup_parameters() has run; nil until then.
local ave_memory = nil
-- Reverse lookup: partition name -> key of the group in partition_groups that
-- lists it. Starts empty and is filled by setup_partition_to_partition_group().
local partition_to_partition_group = { }
-- Resource limits for each group of partitions, keyed by group name.
-- Fields (all bounds are inclusive, see fit_into_partition_group):
--   partitions                     comma-separated partition names in the group
--   min_cpus / max_cpus            bounds on the job's cpus
--   max_nodes                      upper bound on the job's nodes
--   min_memory / max_memory        bounds on the job's memory (setup_parameters
--                                  annotates that value as GB)
--   min_ave_memory / max_ave_memory
--                                  bounds on memory / cpus
--   users       (optional)         when set, the submitting user
--                                  (princeUsers.nyu_netid()) must be found in
--                                  this list by princeUtils.in_table
--   time_limit  (optional)         when set, jobs whose time_limit exceeds it
--                                  do not fit the group
local partition_groups = {
group_20_62_16 = { partitions = "c26,c27,c28,c29,c30,c31",
min_cpus = 1, max_cpus = 20, max_nodes = 16,
min_memory = 0, max_memory = 62,
min_ave_memory = 0, max_ave_memory = 6
},
group_28_125 = { partitions = "c01_17",
min_cpus = 1, max_cpus = 28, max_nodes = 68,
min_memory = 0, max_memory = 125,
min_ave_memory = 0, max_ave_memory = 20
},
group_28_250 = { partitions = "c18_25",
min_cpus = 1, max_cpus = 28, max_nodes = 32,
min_memory = 0, max_memory = 250,
min_ave_memory = 0, max_ave_memory = 100
},
group_28 = { partitions = "c01_25",
min_cpus = 1, max_cpus = 28, max_nodes = 100,
min_memory = 0, max_memory = 125,
min_ave_memory = 0, max_ave_memory = 20
},
group_bigmem = { partitions = "bigmem",
min_cpus = 1, max_cpus = 48, max_nodes = 1,
min_memory = 50, max_memory = 1500,
min_ave_memory = 10, max_ave_memory = 1500
},
group_knl = { partitions = "knl",
min_cpus = 1, max_cpus = 64, max_nodes = 1,
min_memory = 0, max_memory = 186,
min_ave_memory = 0, max_ave_memory = 186
},
group_c32_38 = { partitions = "c32_38",
min_cpus = 1, max_cpus = 20, max_nodes = 112,
min_memory = 0, max_memory = 62,
min_ave_memory = 0, max_ave_memory = 6
},
group_c39_41 = { partitions = "c39_41",
min_cpus = 1, max_cpus = 20, max_nodes = 40,
min_memory = 0, max_memory = 188,
min_ave_memory = 0, max_ave_memory = 60,
},
group_c32_41 = { partitions = "c32_41",
min_cpus = 1, max_cpus = 20, max_nodes = 150,
min_memory = 0, max_memory = 62,
min_ave_memory = 0, max_ave_memory = 6
},
group_c42 = { partitions = "c42",
min_cpus = 1, max_cpus = 40, max_nodes = 1,
min_memory = 0, max_memory = 187.5,
min_ave_memory = 0, max_ave_memory = 20
},
group_c43_45 = { partitions = "c43_45",
min_cpus = 1, max_cpus = 40, max_nodes = 12,
min_memory = 0, max_memory = 187.5,
min_ave_memory = 0, max_ave_memory = 20
},
-- The next three groups carry a `users` list taken from princeStakeholders,
-- so only those users can fit into them.
xwang = { partitions = "xwang_cpu",
min_cpus = 1, max_cpus = 28, max_nodes = 4,
min_memory = 0, max_memory = 250,
min_ave_memory = 0, max_ave_memory = 250,
users = princeStakeholders.cns_wang_users
},
xwang_cascade = { partitions = "xwang_cpu_cascade",
min_cpus = 1, max_cpus = 40, max_nodes = 12,
min_memory = 0, max_memory = 187.5,
min_ave_memory = 0, max_ave_memory = 187.50,
users = princeStakeholders.cns_wang_users
},
kussell = { partitions = "kussell_cpu",
min_cpus = 1, max_cpus = 40, max_nodes = 4,
min_memory = 0, max_memory = 187.5,
min_ave_memory = 0, max_ave_memory = 187.5,
users = princeStakeholders.kussell_users
},
jupyterhub = { partitions = "jupyterhub_cpu",
min_cpus = 1, max_cpus = 4, max_nodes = 1,
min_memory = 0, max_memory = 4,
min_ave_memory = 0, max_ave_memory = 4
},
-- The only group with a time_limit: CPU-only jobs fit here only when their
-- time limit does not exceed princeUtils.hours_to_mins(5).
gpu_partitions = { partitions = "k80_4,p40_4,p100_4,v100_pci_2,k80_8,v100_sxm2_4,v100_sxm2_4_2,dgx1",
min_cpus = 1, max_cpus = 4, max_nodes = 1,
min_memory = 0, max_memory = 70,
min_ave_memory = 0, max_ave_memory = 70,
time_limit = princeUtils.hours_to_mins(5)
}
}
-- Names of the groups that assign_partitions() considers when it builds the
-- partition list for a job; the partitions of every fitting group are appended
-- as the list is visited.
-- "jupyterhub" is defined in partition_groups but is not listed here, so
-- assign_partitions() never offers it; it can still be validated by
-- partitions_are_valid(). NOTE(review): presumably intentional — confirm.
local partition_group_names = { "xwang_cascade",
"xwang",
"kussell",
"group_c32_38",
"group_c32_41",
"group_c39_41",
"group_20_62_16",
"group_28_125",
"group_28",
"group_28_250",
"group_c42",
"group_c43_45",
"group_bigmem",
"gpu_partitions",
}
--- Fill the partition -> group-name lookup table on first use.
-- Does nothing when the lookup table already has entries; otherwise logs once
-- and records, for every partition named in a group's comma-separated
-- `partitions` string, the key of that group.
local function setup_partition_to_partition_group()
  if princeUtils.is_empty(partition_to_partition_group) then
    slurm_log("Setup partition to partition group")
    for group_name, group in pairs(partition_groups) do
      local names = princeUtils.split(group.partitions, ",")
      for idx = 1, #names do
        local partition = names[idx]
        partition_to_partition_group[partition] = group_name
      end
    end
  end
end
--- Tell whether the current job fits the limits of one partition group.
-- @tparam string group_name key into partition_groups
-- @treturn boolean false for an unknown group, for a group whose `users` list
--   does not contain the submitting user, for a job whose time limit exceeds
--   the group's `time_limit`, or when nodes / cpus / memory / memory-per-cpu
--   fall outside the group's inclusive bounds; true otherwise
local function fit_into_partition_group(group_name)
  local limits = partition_groups[group_name]
  if limits == nil then return false end

  -- Restricted groups: the submitting user has to be on the list.
  local allowed_users = limits.users
  if allowed_users ~= nil
     and not princeUtils.in_table(allowed_users, princeUsers.nyu_netid()) then
    return false
  end

  -- Groups with a time cap reject longer jobs.
  local longest = limits.time_limit
  if longest ~= nil and time_limit > longest then return false end

  -- Comparisons are kept in their original order so short-circuiting is unchanged.
  return nodes <= limits.max_nodes
    and limits.min_cpus <= cpus and cpus <= limits.max_cpus
    and limits.min_memory <= memory and memory <= limits.max_memory
    and limits.min_ave_memory <= ave_memory and ave_memory <= limits.max_ave_memory
end
--- Tell whether the current job may run in a single named partition.
-- Makes sure the partition -> group lookup is populated, then checks the job
-- against the group the partition belongs to.
-- @tparam string part_name partition name
-- @treturn boolean false when the partition is unknown or the job does not fit
local function partition_is_valid(part_name)
  setup_partition_to_partition_group()
  local owner = partition_to_partition_group[part_name]
  if owner == nil then
    return false
  end
  return fit_into_partition_group(owner)
end
--- Build the comma-separated partition list for the current job.
-- A job with exactly 28 CPUs and at most 125 (GB) of memory is sent straight
-- to "c01_17". Otherwise every group named in partition_group_names that the
-- job fits into contributes its `partitions` string, in list order.
-- @treturn string|nil the partition list, or nil when no group fits
local function assign_partitions()
  if cpus == 28 and memory <= 125 then return "c01_17" end

  local matched = { }
  -- ipairs rather than pairs: the result must follow the order of
  -- partition_group_names, and pairs() gives no ordering guarantee, not even
  -- for integer keys.
  for _, group_name in ipairs(partition_group_names) do
    if fit_into_partition_group(group_name) then
      matched[#matched + 1] = partition_groups[group_name].partitions
    end
  end

  -- Callers rely on nil (not an empty string) to detect "nothing fits".
  if #matched == 0 then return nil end
  return table.concat(matched, ",")
end
--- Additional rule applied on top of the per-group limits.
-- A job asking for more than 250 and at most 500 (GB) of memory may not ask
-- for more than 20 CPUs; the user is told so when the rule is violated.
-- @treturn boolean true when the job passes the rule
local function extra_checks_are_valid()
  local too_many_cores = memory > 250 and memory <= 500 and cpus > 20
  if not too_many_cores then
    return true
  end
  user_log("For job with memory between 250GB and 500GB per node, please request no more than 20 CPU cores per node")
  return false
end
--- Validate a user-supplied partition list against the current job.
-- Every partition in the list must accept the job, the extra memory/CPU rule
-- must hold, and, except for the users "wang" and "hpcadmin", a job with 28
-- CPUs and at most 125 (GB) of memory must name exactly "c01_17".
-- The reason for a rejection is reported through user_log.
-- @tparam string|nil partitions comma-separated partition names
-- @treturn boolean true when the list is acceptable
local function partitions_are_valid(partitions)
  if partitions == nil then
    user_log("No CPU partitions set")
    return false
  end

  for _, candidate in pairs(princeUtils.split(partitions, ",")) do
    if not partition_is_valid(candidate) then
      user_log("partition '%s' is not valid for this job", candidate)
      return false
    end
  end

  if not extra_checks_are_valid() then return false end

  -- The second lookup only happens when the first one is not "wang".
  local exempt = princeUsers.nyu_netid() == "wang"
    or princeUsers.nyu_netid() == "hpcadmin"
  if exempt then return true end

  if cpus == 28 and memory <= 125 and partitions ~= "c01_17" then
    user_log("For jobs with 28 CPU cores and <= 125GB memory, please use c01_17 partition only")
    return false
  end
  return true
end
--- Record the resource request of the job about to be examined.
-- Stores the values in the file-level job state used by the other functions.
-- @tparam[opt] table args request description; every field is optional:
--   cpus (default 1), memory (divided by 1024 to obtain GB — presumably given
--   in MB, confirm against the caller; default 2 GB), nodes (default 1),
--   time_limit (default 60)
local function setup_parameters(args)
  args = args or { }
  cpus = args.cpus or 1
  -- Apply the default before converting units: dividing a nil memory raises
  -- an error, and a number is always truthy, so `args.memory/1024 or 2` could
  -- never fall back to 2.
  if args.memory ~= nil then
    memory = args.memory / 1024 -- in GB only
  else
    memory = 2
  end
  nodes = args.nodes or 1
  time_limit = args.time_limit or 60 -- 1 hour
  ave_memory = memory / cpus
end
-- Public interface: the three entry points other modules may call.
-- All remaining functions and tables in this file stay local to it.
princeCPU.setup_parameters = setup_parameters
princeCPU.assign_partitions = assign_partitions
princeCPU.partitions_are_valid = partitions_are_valid
-- Runs whenever this file's top-level code is executed, i.e. at load time.
slurm_log("To load princeCPU.lua")
return princeCPU