# MallTimes.py
import sys
import glob
import numpy as np
import pandas as pd
5
6
7
8
9
10
11
12
13
14
from enum import Enum

class G_enum(Enum):
    """Positions of the fields stored for each execution row.

    The value of each of the first 27 members is the index of the
    matching column name in ``columnsG``.
    """
    TOTAL_RESIZES = 0
    TOTAL_GROUPS = 1
    TOTAL_STAGES = 2
    GRANULARITY = 3
    SDR = 4
    ADR = 5
    DR = 6
    RED_METHOD = 7
    RED_STRATEGY = 8
    SPAWN_METHOD = 9
    SPAWN_STRATEGY = 10
    GROUPS = 11
    FACTOR_S = 12
    DIST = 13
    STAGE_TYPES = 14
    STAGE_TIMES = 15
    STAGE_BYTES = 16
    ITERS = 17
    ASYNCH_ITERS = 18
    T_ITER = 19
    T_STAGES = 20
    T_SPAWN = 21
    T_SPAWN_REAL = 22
    T_SR = 23
    T_AR = 24
    T_MALLEABILITY = 25
    T_TOTAL = 26
    # The names below reuse values already assigned above, so Enum makes
    # them aliases of the earlier members rather than new members
    # (e.g. G_enum.NP is G_enum.TOTAL_RESIZES).
    # Malleability specific
    NP = 0
    NC = 1
    # Iteration specific
    IS_DYNAMIC = 11
    N_PARENTS = 17
41

42
43
44
45
46
47

# Column names of the global dataframe, in column order (27 entries).
columnsG = [
    "Total_Resizes",
    "Total_Groups",
    "Total_Stages",
    "Granularity",
    "SDR",
    "ADR",
    "DR",
    "Redistribution_Method",
    "Redistribution_Strategy",
    "Spawn_Method",
    "Spawn_Strategy",
    "Groups",
    "FactorS",
    "Dist",
    "Stage_Types",
    "Stage_Times",
    "Stage_Bytes",
    "Iters",
    "Asynch_Iters",
    "T_iter",
    "T_stages",
    "T_spawn",
    "T_spawn_real",
    "T_SR",
    "T_AR",
    "T_Malleability",
    "T_total",
]

#-----------------------------------------------
# Obtains the value at a given index of a split line
# and returns it as a number (int or float) if possible, string otherwise
def get_value(line, index, separator=True):
  """Return the token at ``line[index]`` as a number when possible.

  line      -- sequence of string tokens (a split line).
  index     -- position of the token to read.
  separator -- when True the token is treated as "key=value," and only
               the text between '=' and the next ',' is kept.

  Returns an int if the text is an integer-valued number, a float if it
  is any other number, and the text itself if it is not numeric.
  """
  token = line[index]
  if separator:
    token = token.split('=')[1].split(',')[0]

  try:
    number = float(token)
  except ValueError:
    # Not numeric: hand back the raw text
    return token
  return int(number) if number.is_integer() else number
63

64
#-----------------------------------------------
# Obtains the general parameters of an execution and
# stores them for creating a global dataframe
def record_config_line(lineS, dataG_it):
  """Fill one execution row from a split "Config" line.

  lineS    -- tokens of the line. From position 2 onwards they are parsed
              as "key=value," pairs and stored, in order, in the
              TOTAL_RESIZES, TOTAL_STAGES, GRANULARITY, SDR and ADR columns.
  dataG_it -- row (list indexed by G_enum values) that is updated in place.

  It also derives TOTAL_GROUPS and DR and pre-allocates the per-group,
  per-resize and per-stage lists that later lines fill in.
  """
  ordered_indexes = [G_enum.TOTAL_RESIZES.value, G_enum.TOTAL_STAGES.value,
                     G_enum.GRANULARITY.value, G_enum.SDR.value, G_enum.ADR.value]
  offset_line = 2
  for position, index in enumerate(ordered_indexes, start=offset_line):
    dataG_it[index] = get_value(lineS, position)

  total_resizes = dataG_it[G_enum.TOTAL_RESIZES.value]
  total_groups = total_resizes + 1  # One more group than resizes
  dataG_it[G_enum.TOTAL_GROUPS.value] = total_groups

  # FIXME: change this when ADR is no longer a percentage
  dataG_it[G_enum.DR.value] = dataG_it[G_enum.SDR.value] + dataG_it[G_enum.ADR.value]

  # Init lists for each column
  array_groups = [G_enum.GROUPS.value, G_enum.FACTOR_S.value, G_enum.DIST.value, G_enum.ITERS.value,
                  G_enum.ASYNCH_ITERS.value, G_enum.T_ITER.value, G_enum.T_STAGES.value, G_enum.RED_METHOD.value,
                  G_enum.RED_STRATEGY.value, G_enum.SPAWN_METHOD.value, G_enum.SPAWN_STRATEGY.value]
  array_resizes = [G_enum.T_SPAWN.value, G_enum.T_SPAWN_REAL.value, G_enum.T_SR.value,
                   G_enum.T_AR.value, G_enum.T_MALLEABILITY.value]
  array_stages = [G_enum.STAGE_TYPES.value, G_enum.STAGE_TIMES.value, G_enum.STAGE_BYTES.value]

  for index in array_groups:
    dataG_it[index] = [None] * total_groups
  # Iteration times are appended one by one, so every group starts with
  # its own empty list instead of None
  dataG_it[G_enum.T_ITER.value] = [[] for _ in range(total_groups)]

  for index in array_resizes:
    dataG_it[index] = [None] * total_resizes

  for index in array_stages:
    dataG_it[index] = [None] * dataG_it[G_enum.TOTAL_STAGES.value]

99
#-----------------------------------------------
# Obtains the parameters of a stage line
# and stores them in the dataframe.
# The index of the stage being processed
# must be indicated
def record_stage_line(lineS, dataG_it, stage):
  """Store the values of a split "Stage" line in slot ``stage``.

  lineS    -- tokens of the line. Positions 2, 3 and 4 are parsed as
              "key=value," pairs and stored in the STAGE_TYPES,
              STAGE_TIMES and STAGE_BYTES columns respectively.
  dataG_it -- row being filled; its stage lists must already exist.
  stage    -- index of the stage inside those lists.
  """
  array_stages = [G_enum.STAGE_TYPES.value,
                  G_enum.STAGE_TIMES.value, G_enum.STAGE_BYTES.value]
  offset_lines = 2
  for position, index in enumerate(array_stages, start=offset_lines):
    dataG_it[index][stage] = get_value(lineS, position)

113
#-----------------------------------------------
# Obtains the parameters of a resize line
# and stores them in the dataframe.
# The group to which the resize line refers
# must be indicated.
# Example line:
# Group 0: Iters=3, Procs=80, Factors=0.037500, Dist=2, RM=0, SM=0, RS=0, SS=0
def record_group_line(lineS, dataG_it, group):
  """Store the values of a split "Group" line in slot ``group``.

  lineS    -- tokens of the line. From position 2 onwards they are parsed
              as "key=value," pairs and stored, in order, in the ITERS,
              GROUPS, FACTOR_S, DIST, RED_METHOD, SPAWN_METHOD,
              RED_STRATEGY and SPAWN_STRATEGY columns.
  dataG_it -- row being filled; its per-group lists must already exist.
  group    -- index of the group inside those lists.
  """
  array_groups = [G_enum.ITERS.value, G_enum.GROUPS.value, G_enum.FACTOR_S.value, G_enum.DIST.value,
                  G_enum.RED_METHOD.value, G_enum.SPAWN_METHOD.value,
                  G_enum.RED_STRATEGY.value, G_enum.SPAWN_STRATEGY.value]
  offset_lines = 2
  for position, index in enumerate(array_groups, start=offset_lines):
    dataG_it[index][group] = get_value(lineS, position)

128
#-----------------------------------------------
129
def record_time_line(lineS, dataG_it):
  """Store the values of a split time line, ignoring any other line.

  lineS    -- tokens of the line; lineS[0] is the label and the values
              follow from position 1, without any "key=" prefix.
  dataG_it -- row being filled, updated in place.

  If the target column already holds a list, every slot of that list is
  filled from consecutive tokens. If it is still None, the single value
  at position 1 is stored directly.
  """
  T_names = ["T_spawn:", "T_spawn_real:", "T_SR:", "T_AR:", "T_Malleability:", "T_total:"]
  T_values = [G_enum.T_SPAWN.value, G_enum.T_SPAWN_REAL.value, G_enum.T_SR.value,
              G_enum.T_AR.value, G_enum.T_MALLEABILITY.value, G_enum.T_TOTAL.value]
  if lineS[0] not in T_names:  # Execute only if line represents a Time
    return

  index = T_values[T_names.index(lineS[0])]
  offset_lines = 1

  if dataG_it[index] is not None:
    for i in range(len(dataG_it[index])):
      dataG_it[index][i] = get_value(lineS, i + offset_lines, False)
  else:
    dataG_it[index] = get_value(lineS, offset_lines, False)
146
147

#-----------------------------------------------
148
149
150
151
152
def record_multiple_times_line(lineS, dataG_it, group):
  """Store the per-iteration times of a "T_iter:" or "T_stage" line.

  lineS    -- tokens of the line. A "T_iter:" line carries its values
              from position 1. A "T_stage" line carries "<stage>:" at
              position 1 and its values from position 2.
  dataG_it -- row being filled, updated in place.
  group    -- group the times belong to.

  Lines with any other label are ignored.
  """
  T_names = ["T_iter:", "T_stage"]
  T_values = [G_enum.T_ITER.value, G_enum.T_STAGES.value]
  if lineS[0] not in T_names:  # Execute only if line represents a Time
    return

  index = T_values[T_names.index(lineS[0])]

  offset_lines = 1
  if index == G_enum.T_STAGES.value:
    offset_lines += 1  # Skip the "<stage>:" token
    total_iters = len(lineS) - offset_lines
    stage = int(lineS[1].split(":")[0])
    if stage == 0:
      # The line of stage 0 allocates one slot per stage for every iteration
      total_stages = dataG_it[G_enum.TOTAL_STAGES.value]
      dataG_it[index][group] = [[None] * total_stages for _ in range(total_iters)]
    for i in range(total_iters):
      dataG_it[index][group][i][stage] = get_value(lineS, i + offset_lines, False)
  else:
    for position in range(offset_lines, len(lineS)):
      dataG_it[index][group].append(get_value(lineS, position, False))
  
#-----------------------------------------------
def read_local_file(f, dataG, it, runs_in_file):
  """Read a local (per-group) output file into the rows of ``dataG``.

  f            -- iterable of text lines (an open file).
  dataG        -- list of execution rows, updated in place.
  it           -- index in dataG of the last run of the matching global file.
  runs_in_file -- number of runs recorded from that global file.

  Every "Group" line selects the group and moves on to the next run, so
  the n-th "Group" line maps to row ``it - (runs_in_file - n)``.
  """
  offset = 0
  real_it = 0
  group = 0

  for line in f:
    lineS = line.split()
    if not lineS:
      continue

    if lineS[0] == "Group":  # GROUP number
      offset += 1
      real_it = it - (runs_in_file - offset)
      group = int(lineS[1].split(":")[0])
    elif lineS[0] == "Async_Iters:":
      offset_line = 1
      dataG[real_it][G_enum.ASYNCH_ITERS.value][group] = get_value(lineS, offset_line, False)
    else:
      record_multiple_times_line(lineS, dataG[real_it], group)
192

193
#-----------------------------------------------
194
195
def read_global_file(f, dataG, it):
  """Read a global output file, appending one row to ``dataG`` per run.

  f     -- iterable of text lines (an open file).
  dataG -- list of execution rows; a new row is appended for every
           "Config" line and filled by the lines that follow it.
  it    -- index of the last row already present in dataG (-1 if none).

  Returns the tuple (it, runs_in_file): the index of the last row
  written and the number of "Config" lines found in this file.
  """
  runs_in_file = 0

  for line in f:
    lineS = line.split()
    if not lineS:
      continue

    if lineS[0] == "Config":  # CONFIG LINE
      it += 1
      runs_in_file += 1
      # Stage and group counters restart with every run.
      # NOTE(review): they only exist once a Config line has been read, so
      # the file is presumably expected to start with one -- confirm.
      group = 0
      stage = 0

      dataG.append([None] * len(columnsG))
      record_config_line(lineS, dataG[it])

    elif lineS[0] == "Stage":
      record_stage_line(lineS, dataG[it], stage)
      stage += 1

    elif lineS[0] == "Group":
      record_group_line(lineS, dataG[it], group)
      group += 1

    else:
      record_time_line(lineS, dataG[it])

  return it, runs_in_file

#-----------------------------------------------

222
223

#-----------------------------------------------
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
def _nested_to_tuple(item):
  """Return ``item`` with every list, at any depth, replaced by a tuple."""
  if isinstance(item, list):
    return tuple(_nested_to_tuple(sub_item) for sub_item in item)
  return item


def convert_to_tuples(dfG):
  """Replace the list cells of ``dfG`` by tuples, in place.

  dfG -- dataframe whose columns are named as in ``columnsG``.

  The flat list columns are converted with ``tuple``. T_iter and T_stages
  hold nested lists and are converted at every level; groups that are
  empty or were never filled (None) are kept as they are instead of
  raising.
  """
  array_list_items = [G_enum.GROUPS.value, G_enum.FACTOR_S.value, G_enum.DIST.value, G_enum.ITERS.value,
                      G_enum.ASYNCH_ITERS.value, G_enum.RED_METHOD.value, G_enum.RED_STRATEGY.value,
                      G_enum.SPAWN_METHOD.value, G_enum.SPAWN_STRATEGY.value, G_enum.T_SPAWN.value,
                      G_enum.T_SPAWN_REAL.value, G_enum.T_SR.value, G_enum.T_AR.value,
                      G_enum.STAGE_TYPES.value, G_enum.STAGE_TIMES.value, G_enum.STAGE_BYTES.value]
  # TODO: is T_Malleability missing here?
  array_multiple_list_items = [G_enum.T_ITER.value, G_enum.T_STAGES.value]

  for item in array_list_items:
    name = columnsG[item]
    dfG[name] = dfG[name].map(tuple)

  for item in array_multiple_list_items:
    name = columnsG[item]
    dfG[name] = dfG[name].map(_nested_to_tuple)

#-----------------------------------------------

252
# Command line: python3 MallTimes.py commonName [directory] [OutName]
if len(sys.argv) < 2:
  print("The files name is missing\nUsage: python3 MallTimes.py commonName directory OutName")
  sys.exit(1)

common_name = sys.argv[1]

if len(sys.argv) >= 3:
  BaseDir = sys.argv[2]
  print("Searching in directory: "+ BaseDir)
else:
  BaseDir = "./"

if len(sys.argv) >= 4:
  name = sys.argv[3]
else:
  name = "data"
print("File name will be: " + name + "G.pkl")

# BaseDir is concatenated as is, so it has to end with a path separator
insideDir = "Run"
lista = glob.glob(BaseDir + insideDir + "*/" + common_name + "*_Global.out")
lista += glob.glob(BaseDir + common_name + "*_Global.out")  # Used when there is only one directory level
print("Number of files found: "+ str(len(lista)))

it = -1
dataG = []

for elem in lista:
  # Text between the common name and "_Global.out" identifies the run
  id_run = elem.split("_Global.out")[0].split(common_name)[-1]
  # NOTE(review): local files are searched directly in BaseDir, also for
  # global files found inside a "Run*/" directory -- confirm this is intended
  lista_local = glob.glob(BaseDir + common_name + id_run + "_G*NP*.out")

  with open(elem, "r") as f:
    it, runs_in_file = read_global_file(f, dataG, it)

  for elem_local in lista_local:
    with open(elem_local, "r") as f_local:
      read_local_file(f_local, dataG, it, runs_in_file)

dfG = pd.DataFrame(dataG, columns=columnsG)
convert_to_tuples(dfG)
print(dfG)
dfG.to_pickle(name + 'G.pkl')
294

295
#dfM = pd.DataFrame(dataM, columns=columnsM)
296
297

# Put the real value in TC and the one required by the app in TH
298
299
300
#cond = dfM.TH != 0
#dfM.loc[cond, ['TC', 'TH']] = dfM.loc[cond, ['TH', 'TC']].values
#dfM.to_csv(name + 'M.csv')