#!/bin/bash

# ---- Site-specific settings ------------------------------------------------
# Root directory of the benchmark. The launcher script path and the first
# argument handed to that launcher are both built from this value.
dir='/home/usuario/Documentos/malleability_benchmark'
# Divisor used to turn a process count into the node count requested
# from sbatch.
cores=20

# Checks whether all the runs in the current working directory, executed under
# the Slurm workload manager, finished correctly. Runs that can be repaired
# are cleaned and launched again.
# Parameter 1 - Number of runs to check; run indices go from 0 to this value minus 1.
# Parameter 2 - Number of repetitions expected per run.
# Parameter 3 - Total stages in every run. #FIXME The number of stages must currently be the same in all runs; this restriction should be lifted in the future.
# Parameter 4 - Total groups of processes in every run. #FIXME The number of groups must currently be the same in all runs; this restriction should be lifted in the future.
# Parameter 5 - Maximum valid iteration time across all runs. If an iteration
#               takes longer, that repetition of the run is cleaned and
#               launched again.
#====== Do not modify the following values =======

# Directory names, each with its trailing slash. execDir is appended to $dir
# to locate the launcher script; codeDir and ResultsDir are not referenced in
# the visible part of this script.
codeDir='Codes/'
execDir='Exec/'
ResultsDir='Results/'

# ---- Command-line arguments -------------------------------------------------
# The argument count is validated before anything reads $1..$5, so a short
# command line produces the usage text instead of an arithmetic error in the
# size computations below.
if [ "$#" -lt 5 ]; then
  echo "Not enough arguments"
  echo "Usage -> bash CheckRun maxIndex total_repetitions total_stages total_groups max_iteration_time"
  exit 255 # same status the shell reports for 'exit -1'
fi

maxIndex=$1     # runs are numbered 0 .. maxIndex-1
totalEjGrupo=$2 # repetitions expected in every run
total_stages=$3
total_groups=$4
maxTime=$5      # largest iteration time accepted as valid

# ---- Record sizes -----------------------------------------------------------
# exec_total_lines: lines one repetition occupies in R<run>_Global.out.
# iter_total_lines: lines one repetition occupies in each R<run>_G<group>* file.
# exec_remove:      distance from a repetition's T_total line back to the
#                   first line of that repetition (exec_total_lines - 1).
# iter_remove:      offset subtracted from a T_iter line number before it is
#                   converted into a repetition index.
exec_lines_basic=6
iter_lines_basic=3
exec_total_lines=$(( exec_lines_basic + total_stages + total_groups ))
iter_total_lines=$(( iter_lines_basic + total_stages * 2 + 1 ))
exec_remove=$(( exec_total_lines - 1 ))
iter_remove=$(( iter_lines_basic + total_stages - 1 ))

#Check if there are fatal errors during executions
#
# scan_slurm_logs
#   Copies to errores2.txt every line of the slurm* files in the current
#   directory that contains "fatal", "error" or "abort" (any letter case) or
#   the text "==". -H keeps the file name on each line even when only one
#   slurm file exists, so the report always says where a line came from.
# Returns: 0 when no such line exists, 1 otherwise.
scan_slurm_logs() {
  grep -H -i -e fatal -e error -e abort -e == slurm* > errores2.txt
  # grep emits whole lines, so a non-empty file means at least one match.
  [ ! -s errores2.txt ]
}

if ! scan_slurm_logs; then
  echo "Found Fatal errors during execution. Aborting"
  echo "Read file errores2.txt to see the errors and in which files"
  exit 254 # same status the shell reports for 'exit -2'
fi

# Verify that the number of output files is consistent.
# A mismatch is fatal: the runs whose counts disagree are reported on screen
# and in errores2.txt, and the script stops. Deciding what to do with those
# runs is left to the user.
# NOTE(review): the factor 2 is hard-coded; presumably it stands for two
# local files per run -- confirm whether it should follow total_groups.
qtyG=$(ls R*_Global.out | wc -l)
qtyG=$((qtyG * 2))
qtyL=$(ls R*_G*N*.out | wc -l)
if [ "$qtyG" != "$qtyL" ]; then
  echo "Lacking Local($qtyL) or global($qtyG) files. Aborting" | tee errores2.txt
  i=0
  while [ "$i" -lt "$maxIndex" ]; do
    qtyEx=$(grep T_total R"$i"_Global.out | wc -l)
    qtyIt=$(grep T_iter R"$i"_G*N*.out | wc -l)
    qtyEx=$((qtyEx * 2))
    if [ "$qtyEx" -ne "$qtyIt" ]; then
      # NOTE(review): diff is computed from the already doubled count --
      # confirm that this is the intended figure.
      diff=$((totalEjGrupo - qtyEx))
      echo "Files do not match at Run $i -- diff=$diff" | tee -a errores2.txt
    fi
    i=$((i + 1))
  done
  exit -1
fi
echo "Number of G($qtyG) and L($qtyL) files match"
# The scan file is no longer needed once both checks have passed.
rm errores2.txt

# Check if there is any negative execution time
# Only invalid IDs are stored
#
# flag_negative_totals
#   Starts a fresh errores.txt and appends one "run:first_line:last_line"
#   entry for every T_total line in R*_Global.out that contains a '-'.
#   first_line/last_line delimit the whole repetition inside the Global file.
#   Reads the globals exec_remove and exec_total_lines.
#   NOTE(review): any '-' on the line counts as "negative", so a value such
#   as 1e-05 would be flagged too -- confirm the output format.
flag_negative_totals() {
  local -a hits=()
  local idx file line run_id first_line last_line

  rm -f errores.txt
  touch errores.txt

  # -H forces the "file:" prefix even when a single Global file matches, so
  # the file name and line number are always the first two ':' fields.
  mapfile -t hits < <(grep -Hn "T_total" R*_Global.out | grep -e -)

  # Walk the matches backwards, so entries are stored from the last match
  # to the first.
  for ((idx = ${#hits[@]} - 1; idx >= 0; idx--)); do
    IFS=: read -r file line _ <<< "${hits[idx]}"
    run_id=${file%%_*} # "R12_Global.out" -> "R12"
    run_id=${run_id#R} # "R12" -> "12"
    first_line=$(( line - exec_remove ))
    last_line=$(( first_line + exec_total_lines - 1 ))
    echo "$run_id:$first_line:$last_line" >> errores.txt
  done
}

flag_negative_totals

# Check if there is any iter time higher than expected
# Only invalid IDs are stored
#
# flag_slow_iterations
#   Appends to errores.txt one "run:first_line:last_line" entry for every
#   T_iter line in R*_G*N*.out whose first value (the second blank-separated
#   field of the line) is greater than maxTime. The line range refers to the
#   matching repetition inside the run's Global file.
#   Reads the globals maxTime, iter_remove, iter_total_lines and
#   exec_total_lines.
flag_slow_iterations() {
  local -a hits=()
  local idx file line run_id first_line last_line

  # One pass: -H forces the "file:" prefix even for a single local file, and
  # awk keeps only the "file:line:T_iter:" part of the lines over the limit.
  mapfile -t hits < <(
    grep -Hn "T_iter" R*_G*N*.out |
      awk -v max="$maxTime" '$2 + 0 > max + 0 { print $1 }'
  )

  # Walk the matches backwards, so entries are stored from the last match
  # to the first.
  for ((idx = ${#hits[@]} - 1; idx >= 0; idx--)); do
    IFS=: read -r file line _ <<< "${hits[idx]}"
    run_id=${file%%_*} # "R3_G0N2.out" -> "R3"
    run_id=${run_id#R} # "R3" -> "3"
    first_line=$(( line - iter_remove ))
    # Translate line number to Global file
    first_line=$(( first_line / iter_total_lines ))
    first_line=$(( first_line * exec_total_lines + 1 ))
    last_line=$(( first_line + exec_total_lines - 1 ))
    echo "$run_id:$first_line:$last_line" >> errores.txt
  done
}

flag_slow_iterations

#Clean data from collected erroneous executions
#
# clean_flagged_repetitions
#   Reads the "run:first_line:last_line" entries of errores.txt and deletes
#   each flagged repetition from R<run>_Global.out and from the run's
#   R<run>_G<group>* files. Does nothing when errores.txt is missing or empty.
#   Reads the globals exec_total_lines, iter_total_lines and total_groups.
#
#   The entries are de-duplicated and handled per run from the highest line
#   number down. Every entry refers to line numbers of the untouched files,
#   so deleting an earlier repetition first, or the same one twice, would
#   shift the remaining ranges onto valid data.
#   NOTE(review): the pattern R<run>_G<group>* also matches group numbers
#   that merely start with the same digits (G1 and G10) -- confirm that the
#   number of groups stays below 10.
clean_flagged_repetitions() {
  local run first_line last_line group

  [ -s errores.txt ] || return 0
  echo "Se han encontrado errores de ejecución leves. Volviendo a ejecutar"

  while IFS=: read -r run first_line last_line; do
    [ -n "$run" ] || continue
    echo "Run $run had an erroneous execution, cleaning bad data."

    #1 - Delete erroneous lines in Global file
    sed -i "${first_line},${last_line}d" "R${run}_Global.out"

    #2 - Translate line numbers to Local files type
    first_line=$(( first_line / exec_total_lines ))
    first_line=$(( first_line * iter_total_lines + 1 ))
    last_line=$(( first_line + iter_total_lines - 1 ))

    #3 - Delete erroneous lines in Local files
    for ((group = 0; group < total_groups; group++)); do
      sed -i "${first_line},${last_line}d" R"${run}"_G"${group}"*
    done
  done < <(sort -t: -u -k1,1n -k2,2nr errores.txt)
}

clean_flagged_repetitions

#Check if all repetitions for each Run have been executed
#If any run lacks repetitions, the job is automatically launched again
#A run without a Global file is only reported, not launched, as it could
#still be in the waiting queue
#
# relaunch_incomplete_runs
#   For every run 0 .. maxIndex-1 that has a Global file with fewer T_total
#   lines than totalEjGrupo, submits the launcher again through sbatch for
#   the missing repetitions. Sets the global qty_missing to the total number
#   of repetitions submitted.
#   Reads the globals maxIndex, totalEjGrupo, total_groups, cores, dir and
#   execDir.
relaunch_incomplete_runs() {
  local run group config_file done_reps missing
  local first_line last_line numP max_numP node_qty
  local use_extrae=0
  local -a marks

  qty_missing=0
  for ((run = 0; run < maxIndex; run++)); do
    if [ ! -f "R${run}_Global.out" ]; then
      echo "File R${run}_Global.out does not exist -- Could it be it must still be executed?"
      continue
    fi

    done_reps=$(grep -c T_total "R${run}_Global.out")
    if (( done_reps > totalEjGrupo )); then
      # A surplus would give a negative repetition count; report it instead.
      echo "Run$run has $done_reps repetitions, more than the $totalEjGrupo expected -- not relaunched"
      continue
    fi
    if (( done_reps == totalEjGrupo )); then
      continue
    fi
    missing=$(( totalEjGrupo - done_reps ))
    qty_missing=$(( qty_missing + missing ))
    config_file="config$run.ini"

    #1 - Obtain maximum number of processes for the run
    max_numP=-1
    for ((group = 0; group < total_groups; group++)); do
      # The text "[resize<group>]" is expected on two lines of the config
      # file; the Procs value is searched in the lines after the first one,
      # up to and including the second one.
      mapfile -t marks < <(grep -n -F "[resize${group}]" "$config_file" | cut -d ':' -f1)
      if (( ${#marks[@]} < 2 )); then
        echo "Section [resize${group}] is not delimited in $config_file" >&2
        continue
      fi
      first_line=${marks[0]}
      last_line=${marks[1]}
      # Text after ';' on a line is discarded before the key is read.
      numP=$(sed -n "$(( first_line + 1 )),${last_line}p" "$config_file" |
        cut -d ';' -f1 | grep -m1 Procs | cut -d '=' -f2 | tr -d '[:space:]')
      if [[ $numP =~ ^[0-9]+$ ]] && (( 10#$numP > max_numP )); then
        max_numP=$(( 10#$numP ))
      fi
    done

    #2 - Obtain needed nodes for the number of processes (rounded up, so
    #    that a partially filled node is still requested; at least one node)
    node_qty=$(( (max_numP + cores - 1) / cores ))
    if (( node_qty < 1 )); then
      node_qty=1
    fi

    #3 - Launch execution
    echo "Run$run lacks $missing repetitions"
    # ${dir%/}/ yields exactly one '/' between the root and execDir whether
    # or not dir ends with a slash.
    # NOTE(review): $dir itself is handed to the launcher unchanged --
    # confirm whether generalRun.sh expects a trailing slash.
    sbatch -N "$node_qty" "${dir%/}/${execDir}generalRun.sh" \
      "$dir" "$config_file" "$use_extrae" "$run" "$missing"
  done
}

relaunch_incomplete_runs

# Final report: SUCCESS when no repetition had to be submitted again,
# otherwise the number of repetitions that were relaunched.
summary="REPEATING - A total of $qty_missing executions are being repeated"
if [ "$qty_missing" -eq "0" ]; then
  summary="SUCCESS"
fi
echo "$summary"