#!/bin/bash

# Root directory of the benchmark installation. It is prepended to the
# launcher path and also passed to the launcher as its first argument.
dir="/home/martini/malleability_benchmark/"
# Divisor used to turn a number of processes into a number of Slurm nodes.
# NOTE(review): presumably the number of cores per node -- confirm.
cores=20

# Checks whether all the runs found in the current working directory, which
# were executed under the Slurm workload manager, finished correctly.
# Repetitions that can be corrected are cleaned and launched again.
# Parameter 1 - Maximum index of the runs
# Parameter 2 - Number of repetitions per index/run
# Parameter 3 - Total stages in all runs. #FIXME The amount of stages must be equal across all the runs, must be modified in the future.
# Parameter 4 - Total groups of processes in all runs #FIXME The amount of groups must be equal across all the runs, must be modified in the future.
# Parameter 5 - Maximum valid iteration time across all runs. If an iteration
#               time is higher, that particular repetition inside the run is
#               cleaned and launched again.
#====== Do not modify the following values =======

# Sub-directories, relative to the benchmark root directory.
codeDir="Codes/"
execDir="Exec/"
ResultsDir="Results/"

# Positional parameters.
maxIndex=$1     # Maximum index of the runs
totalEjGrupo=$2 # Total number of executions (repetitions) per run
total_stages=$3
total_groups=$4
maxTime=$5      # Maximum iteration time considered valid

# Line layout of a single repetition inside the output files.
# The variables are referenced by bare name inside $(( )) so that a missing
# argument evaluates to 0 instead of raising an arithmetic syntax error
# before the argument check gets a chance to print the usage message.
exec_lines_basic=6
iter_lines_basic=3
# Lines that one repetition occupies in a Global file / in a local file.
exec_total_lines=$(( exec_lines_basic + total_stages + total_groups ))
iter_total_lines=$(( iter_lines_basic + total_stages * 2 + 1 ))
# Offsets subtracted from the line number of a T_total / T_iter match to
# obtain the first line of the repetition that contains it.
exec_remove=$(( exec_lines_basic + total_stages + total_groups - 1 ))
iter_remove=$(( iter_lines_basic + total_stages - 1 ))

# Abort when fewer than the five mandatory arguments were supplied.
# The usage line lists the arguments in the order in which they are read
# (stages is the third argument, groups the fourth).
if [ "$#" -lt 5 ]; then
  echo "Not enough arguments"
  echo "Usage -> bash CheckRun.sh maxIndex total_repetitions total_stages total_groups max_iteration_time"
  exit 255 # status that bash reports for the former "exit -1"
fi

#Check if there are fatal errors during executions
# Every line of the slurm* files that contains "fatal", "error", "abort"
# (case-insensitive) or "==" is stored in errores2.txt. -H keeps the file
# name in front of each match even when a single log file exists.
grep -H -i -e fatal -e error -e abort -e == slurm* > errores2.txt
qty=$(wc -l < errores2.txt)
if [ "$qty" -gt 0 ]; then
  echo "Found Fatal errors during execution. Aborting"
  echo "Read file errores2.txt to see the errors and in which files"
  exit 254 # status that bash reports for the former "exit -2"
fi

#Check if the number of output files is correct.
#If the number is not correct it is a fatal error: the user is informed in
# which runs the amounts do not match and then the script exits.
#The user must figure out what to do with those runs.
# compgen -G lists the files matching a pattern (nothing when there is no
# match), which avoids parsing the output of ls.
# NOTE(review): the factor 2 looks like a hard-coded number of groups per
# run; it probably should be $total_groups -- confirm before changing.
qtyG=$(( $(compgen -G 'R*_Global.out' | wc -l) * 2 ))
qtyL=$(( $(compgen -G 'R*_G*N*.out' | wc -l) ))
if (( qtyG == qtyL )); then
  echo "Number of G($qtyG) and L($qtyL) files match"
else
  echo "Lacking Local($qtyL) or global($qtyG) files. Aborting"
  echo "Lacking Local($qtyL) or global($qtyG) files. Aborting" > errores2.txt
  for (( i = 0; i < maxIndex; i++ )); do
    # T_total lines in the Global file against T_iter lines in the local files
    qtyEx=$(grep T_total "R${i}_Global.out" | wc -l)
    qtyIt=$(grep T_iter "R${i}_G"*N*.out | wc -l)
    qtyEx=$(( qtyEx * 2 ))
    if (( qtyEx != qtyIt )); then
      # NOTE(review): diff is computed from the doubled count, as the
      # original code did -- confirm that this is the intended value.
      diff=$(( totalEjGrupo - qtyEx ))
      echo "Files do not match at Run $i -- diff=$diff" | tee -a errores2.txt
    fi
  done
  exit 255 # status that bash reports for the former "exit -1"
fi
rm -f errores2.txt

# Check if there is any negative execution time
# Only invalid IDs are stored
#
# record_negative_exec_times
#   Recreates errores.txt and writes one "run:first_line:last_line" entry
#   for every T_total line of the R*_Global.out files whose text contains
#   a '-'. The line range is the repetition that contains the match.
#   Matches are processed from the last one to the first one.
#   Globals read: exec_remove, exec_total_lines.
#   NOTE(review): a positive value printed with a negative exponent
#   (e.g. 1e-05) would also be flagged -- confirm the output format.
record_negative_exec_times() {
  local hit file rest line_no content run_id first_line last_line i
  local -a hits=()

  rm -f errores.txt
  touch errores.txt
  # Nothing to inspect when no Global file exists.
  compgen -G 'R*_Global.out' > /dev/null || return 0

  # -H forces the "file:" prefix even when a single file matches the glob,
  # so every hit always has the shape file:line:text.
  while IFS= read -r hit; do
    hits+=("$hit")
  done < <(grep -Hn "T_total" R*_Global.out)

  for (( i = ${#hits[@]} - 1; i >= 0; i-- )); do
    hit=${hits[i]}
    file=${hit%%:*}
    rest=${hit#*:}
    line_no=${rest%%:*}
    content=${rest#*:}
    # Only the matched text is inspected, never the file name.
    [[ $content == *-* ]] || continue

    # Run identifier: what lies between the leading 'R' and the first '_'.
    run_id=${file%%_*}
    run_id=${run_id#R}
    first_line=$(( line_no - exec_remove ))
    last_line=$(( first_line + exec_total_lines - 1 ))
    echo "${run_id}:${first_line}:${last_line}" >> errores.txt
  done
}
record_negative_exec_times

# Check if there is any iter time higher than expected
# Only invalid IDs are stored
#
# record_slow_iterations
#   Appends to errores.txt one "run:first_line:last_line" entry (line range
#   inside the Global file of the run) for every T_iter line of the
#   R*_G*N*.out files whose second space-separated field is greater than
#   maxTime. An entry that is already in errores.txt is not added again, so
#   the same repetition is not scheduled for cleaning twice.
#   Globals read: maxTime, iter_remove, iter_total_lines, exec_total_lines.
record_slow_iterations() {
  local hit file rest line_no content iter_time run_id
  local first_line last_line entry i
  local -a hits=()

  compgen -G 'R*_G*N*.out' > /dev/null || return 0
  # Without a limit every positive time would compare greater than 0.
  [[ -n $maxTime ]] || return 0

  # -H forces the "file:" prefix even when a single file matches the glob.
  while IFS= read -r hit; do
    hits+=("$hit")
  done < <(grep -Hn "T_iter" R*_G*N*.out)

  for (( i = ${#hits[@]} - 1; i >= 0; i-- )); do
    hit=${hits[i]}
    file=${hit%%:*}
    rest=${hit#*:}
    line_no=${rest%%:*}
    content=${rest#*:}

    # Second field of the matched line, using single spaces as separators.
    iter_time=${content#* }
    iter_time=${iter_time%% *}
    # Floating point comparison; awk exits with 0 only when time > limit.
    awk -v t="$iter_time" -v limit="$maxTime" \
      'BEGIN { exit !((t + 0) > (limit + 0)) }' || continue

    # Run identifier: what lies between the leading 'R' and the first '_'.
    run_id=${file%%_*}
    run_id=${run_id#R}
    # Index of the repetition inside the local file ...
    first_line=$(( (line_no - iter_remove) / iter_total_lines ))
    # ... translated to the line range of that repetition in the Global file
    first_line=$(( first_line * exec_total_lines + 1 ))
    last_line=$(( first_line + exec_total_lines - 1 ))

    entry="${run_id}:${first_line}:${last_line}"
    grep -qxF -- "$entry" errores.txt 2> /dev/null || echo "$entry" >> errores.txt
  done
}
record_slow_iterations

#Clean data from collected erroneous executions
#
# clean_erroneous_executions
#   Reads the "run:first_line:last_line" entries of errores.txt and, for
#   each one, deletes that line range from R<run>_Global.out and the
#   equivalent range from the files matching R<run>_G<group>*.
#   Entries are de-duplicated and processed per run from the highest first
#   line to the lowest: deleting a range shifts every later line, so a
#   repeated entry or an ascending order would remove valid repetitions.
#   Globals read: exec_total_lines, iter_total_lines, total_groups.
clean_erroneous_executions() {
  local lineRun run first_line last_line j

  # Nothing to clean when the file is missing or empty.
  [[ -s errores.txt ]] || return 0
  echo "Se han encontrado errores de ejecución leves. Volviendo a ejecutar"

  while IFS= read -r lineRun || [ -n "$lineRun" ]; do
    #Obtain data of erroneous execution
    IFS=: read -r run first_line last_line <<< "$lineRun"
    echo "Run $run had an erroneous execution, cleaning bad data."

    #1 - Delete erroneous lines in Global file
    sed -i "${first_line},${last_line}d" "R${run}_Global.out"

    #2 - Translate line numbers to Local files type
    first_line=$(( first_line / exec_total_lines ))
    first_line=$(( first_line * iter_total_lines + 1 ))
    last_line=$(( first_line + iter_total_lines - 1 ))

    #3 - Delete erroneous lines in Local files
    for (( j = 0; j < total_groups; j++ )); do
      sed -i "${first_line},${last_line}d" "R${run}_G${j}"*
    done
  done < <(sort -t: -k1,1n -k2,2nr -u errores.txt)
}
clean_erroneous_executions

#Check if all repetitions for each Run have been executed
#If any run lacks repetitions, the job is automatically launched again
#If a run has not produced its Global file at all, it is not launched, as it
# could still be in the waiting queue

# max_procs_in_config CONFIG_FILE GROUPS
#   Prints the highest "Procs" value found among the [resize0] ..
#   [resize<GROUPS-1>] sections of CONFIG_FILE, or -1 when none is found.
#   A section spans from the first to the second line that contains the
#   text "[resizeN]"; anything after a ';' in a line is ignored.
max_procs_in_config() {
  local config_file=$1 groups=$2
  local j first_line last_line numP max_numP=-1
  local -a marks

  for (( j = 0; j < groups; j++ )); do
    # Line numbers of the lines that mention this section.
    marks=( $(grep -nF "[resize$j]" "$config_file" | cut -d ':' -f1) )
    first_line=${marks[0]}
    last_line=${marks[1]}
    [[ -n $first_line && -n $last_line ]] || continue

    numP=$(sed -n "$(( first_line + 1 )),${last_line}p" "$config_file" \
      | cut -d ';' -f1 | grep Procs | cut -d '=' -f2)
    numP=${numP//[[:space:]]/}
    # Skip sections without a usable numeric value.
    [[ $numP =~ ^[0-9]+$ ]] || continue
    if (( numP > max_numP )); then
      max_numP=$numP
    fi
  done
  echo "$max_numP"
}

# relaunch_incomplete_runs
#   For every run index below maxIndex whose Global file exists but does
#   not hold totalEjGrupo T_total lines, submits generalRun.sh through
#   sbatch asking for the lacking repetitions. Sets the global qty_missing
#   to the total amount of lacking repetitions.
#   Globals read: maxIndex, totalEjGrupo, total_groups, cores, dir, execDir.
relaunch_incomplete_runs() {
  local run qtyEx diff config_file max_numP node_qty use_extrae

  qty_missing=0
  for (( run = 0; run < maxIndex; run++ )); do
    if [[ ! -f "R${run}_Global.out" ]]; then
      echo "File R${run}_Global.out does not exist -- Could it be it must still be executed?"
      continue
    fi

    qtyEx=$(grep -c T_total "R${run}_Global.out")
    (( qtyEx != totalEjGrupo )) || continue
    diff=$(( totalEjGrupo - qtyEx ))
    qty_missing=$(( qty_missing + diff ))
    config_file="config$run.ini"

    #1 - Obtain maximum number of processes for the run
    max_numP=$(max_procs_in_config "$config_file" "$total_groups")

    #2 - Obtain needed nodes for the number of processes
    # Rounded up, so that e.g. 30 processes with 20 cores ask for 2 nodes.
    node_qty=$(( (max_numP + cores - 1) / cores ))
    if (( node_qty < 1 )); then
      node_qty=1
    fi

    #3 - Launch execution
    echo "Run$run lacks $diff repetitions"
    use_extrae=0
    sbatch -N "$node_qty" "$dir$execDir./generalRun.sh" "$dir" "$config_file" "$use_extrae" "$run" "$diff"
  done
}
relaunch_incomplete_runs


# report_final_status
#   Prints "SUCCESS" when qty_missing is 0, otherwise how many executions
#   are being repeated.
#   Globals read: qty_missing.
report_final_status() {
  if (( qty_missing == 0 )); then
    echo "SUCCESS"
  else
    echo "REPEATING - A total of $qty_missing executions are being repeated"
  fi
}
report_final_status