CheckRun.sh 7.03 KB
Newer Older
1
2
#!/bin/bash

Iker Martín Álvarez's avatar
Iker Martín Álvarez committed
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
partition="P1"

# Checks whether all the runs in the current working directory, performed under
# a Slurm manager, have been performed correctly. Runs that can be corrected
# are launched again.
# Parameter 1 - Common name of the configuration files
# Parameter 2 - Maximum index of the runs
# Parameter 3 - Amount of repetitions per index/run
# Parameter 4 - Total stages in all runs. #FIXME The amount of stages must be equal across all the runs, must be modified in the future.
# Parameter 5 - Total groups of processes in all runs. #FIXME The amount of groups must be equal across all the runs, must be modified in the future.
# Parameter 6 - Maximum valid iteration time across all runs. If an iteration time
#               is higher, that particular repetition inside the run is cleaned and
#               launched again.
# Parameter 7 (Optional) - Maximum amount of time in seconds needed by a single execution.
#               Default value is 0, which indicates infinite time.
#               Must be a positive integer.
#====== Do not modify the following values =======

# Directory of this script; the build configuration is located relative to it.
scriptDir="$(dirname "$0")"
# NOTE(review): config.txt presumably defines PROTEO_HOME and execDir, which are
# used below but never assigned in this script -- confirm.
# Every expansion is quoted so that an installation path with spaces still works.
source "$scriptDir/../Codes/build/config.txt"
cores=$(bash "$PROTEO_HOME$execDir/BashScripts/getCores.sh" "$partition")

# The six mandatory parameters are required; otherwise print the usage and stop.
# "exit -1" is kept as in the original; bash reports it to the caller as 255.
if [ "$#" -lt 6 ]; then
  echo "Not enough arguments"
  echo "Usage -> bash CheckRun.sh Common_Name maxIndex total_repetitions total_stages total_groups max_iteration_time [limit_time]"
  exit -1
fi
30

Iker Martín Álvarez's avatar
Iker Martín Álvarez committed
31
32
33
34
35
36
37
38
# Bind the positional parameters to descriptive names.
common_name=$1
maxIndex=$2
totalEjGrupo=$3 # Total executions per group
total_stages=$4
total_groups=$5
maxTime=$6 # Maximum time considered valid
# Optional 7th parameter: max time per execution in seconds.
# A missing or empty value falls back to 0.
limit_time_exec=${7:-0}

Iker Martín Álvarez's avatar
Iker Martín Álvarez committed
43
44
45
46
47
48
49
50
# Time limit later handed to sbatch; stays 0 unless it is recomputed from
# limit_time_exec when a run is launched again.
limit_time=0

# Fixed part of the line counts; the stage/group dependent part is added below.
exec_lines_basic=6
iter_lines_basic=3

# Number of lines spanned by one repetition: exec_* is used with the
# R*_Global.out files, iter_* with the R*_G*N*.out files.
exec_total_lines=$(( exec_lines_basic + total_stages + total_groups ))
iter_total_lines=$(( iter_lines_basic + 2 * total_stages + 1 ))

# Offsets subtracted from the line number of a T_total / T_iter match to
# obtain the first line of the repetition it belongs to.
exec_remove=$(( exec_total_lines - 1 ))
iter_remove=$(( iter_lines_basic + total_stages ))

51

Iker Martín Álvarez's avatar
Iker Martín Álvarez committed
52
53
54
55
#Check if there are fatal errors during executions

# Prints every line of the slurm* files in the current directory that contains
# "fatal", "error" or "abort" (case-insensitive) or the string "==".
# -H keeps the "file:" prefix even when only one slurm file exists, so the
# report always tells in which file each problem was found.
collect_slurm_errors() {
  grep -H -i -e fatal -e error -e abort -e '==' slurm*
}

collect_slurm_errors > errores2.txt
# Reading through stdin makes wc print only the number, so no cut is needed.
qty=$(wc -l < errores2.txt)
if [ "$qty" -gt 0 ]; then
  echo "Found Fatal errors during execution. Aborting"
  echo "Read file errores2.txt to see the errors and in which files"
  echo "FAILURE"
  exit -2
fi

Iker Martín Álvarez's avatar
Iker Martín Álvarez committed
63
64
65
66
67
68
69
70
71
#Check if the number of output files is correct.
#If the number is not correct it is a fatal error: the user is informed in
# which runs the amount does not match, and then the script exits.
#The user must figure out what to do with those runs.

# Prints how many existing paths match the glob pattern given as $1.
# The body runs in a subshell so that nullglob does not leak into the rest of
# the script (an unmatched pattern then counts as 0 instead of 1).
count_matching_files() (
  shopt -s nullglob
  # shellcheck disable=SC2086 -- the pattern has to be expanded by the shell
  set -- $1
  echo "$#"
)

# Every Global file must come with one Local file per group of processes.
qtyG=$(count_matching_files 'R*_Global.out')
qtyG=$(( qtyG * total_groups ))
qtyL=$(count_matching_files 'R*_G*N*.out')
if [ "$qtyG" == "$qtyL" ]; then
  echo "Number of G($qtyG) and L($qtyL) files match"
else
  echo "Lacking Local($qtyL) or global($qtyG) files. Aborting" | tee errores2.txt
  for ((i = 0; i < maxIndex; i++)); do
    qtyEx=$(grep T_total R"$i"_Global.out | wc -l)
    qtyIt=$(grep T_iter R"$i"_G*N*.out | wc -l)
    # The T_total matches are scaled by the number of groups before being
    # compared with the T_iter matches. This used to be a literal 2; it now
    # follows total_groups, like the file-count check above.
    qtyEx=$(( qtyEx * total_groups ))
    if [ "$qtyEx" -ne "$qtyIt" ]; then
      # NOTE(review): diff is computed from the already scaled count, exactly
      # as before -- confirm whether the unscaled count was intended.
      diff=$(( totalEjGrupo - qtyEx ))
      echo "Files do not match at Run $i -- diff=$diff" | tee -a errores2.txt
    fi
  done
  echo "FAILURE"
  exit -1
fi
# The error report is only kept when one of the checks failed.
rm -f errores2.txt
Iker Martín's avatar
Iker Martín committed
93

Iker Martín Álvarez's avatar
Iker Martín Álvarez committed
94
95
96
97
98
99
100
101
102
103
104
105
# Check if there is any negative execution time
# Only invalid IDs are stored

# Prints one "run:first_line:last_line" record for every T_total line of the
# R*_Global.out files that contains a '-'. first_line/last_line delimit, inside
# that Global file, the repetition the T_total line belongs to.
# Records are printed in reverse order of the grep output (last match first).
# Reads the globals exec_remove and exec_total_lines.
list_negative_exec_records() {
  local -a hits=()
  local hit file rest line run_id first_line last_line idx

  # -H forces the "file:" prefix even when a single Global file exists;
  # without it the fields shift and both run id and line number are wrong.
  while IFS= read -r hit; do
    hits+=("$hit")
  done < <(grep -H -n "T_total" R*_Global.out | grep -e '-')

  for ((idx = ${#hits[@]} - 1; idx >= 0; idx--)); do
    hit=${hits[idx]}          # R<run>_Global.out:<line>:<text>
    file=${hit%%:*}
    rest=${hit#*:}
    line=${rest%%:*}
    run_id=${file%%_*}        # "R<run>"
    run_id=${run_id#R}
    first_line=$(( line - exec_remove ))
    last_line=$(( first_line + exec_total_lines - 1 ))
    echo "${run_id}:${first_line}:${last_line}"
  done
}

# tmp.txt always starts empty, even when nothing is found.
rm -f tmp.txt
list_negative_exec_records > tmp.txt
Iker Martín's avatar
Iker Martín committed
106

Iker Martín Álvarez's avatar
Iker Martín Álvarez committed
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# Check if there is any iter time higher than expected
# Only invalid IDs are stored

# Prints one "run:first_line:last_line" record for every T_iter line of the
# R*_G*N*.out files whose time is greater than maxTime. The line number found
# in the Local file is translated so that first_line/last_line delimit the
# same repetition inside the Global file of that run.
# Records are printed in reverse order of the grep output (last match first).
# Reads the globals maxTime, iter_remove, iter_total_lines and exec_total_lines.
# Assumes the time is the second whitespace-separated token of the matched
# line ("T_iter: <time>"), as the original cut-based extraction did.
list_slow_iter_records() {
  local -a records=()
  local record idx

  # -H forces the "file:" prefix even when a single Local file exists;
  # without it the run id and the line number cannot be extracted.
  # One awk pass replaces three greps and one bc process per line.
  while IFS= read -r record; do
    records+=("$record")
  done < <(grep -H -n "T_iter" R*_G*N*.out |
    awk -v max_time="$maxTime" -v iter_remove="$iter_remove" \
        -v iter_total="$iter_total_lines" -v exec_total="$exec_total_lines" '
      ($2 + 0) > (max_time + 0) {
        split($1, parts, ":")      # parts[1] = file name, parts[2] = line number
        run = parts[1]
        sub(/_.*/, "", run)        # keep "R<run>"
        sub(/^R/, "", run)
        # Index of the repetition in the Local file, then its first line in
        # the Global file.
        first = int((parts[2] - iter_remove) / iter_total) * exec_total + 1
        print run ":" first ":" (first + exec_total - 1)
      }')

  for ((idx = ${#records[@]} - 1; idx >= 0; idx--)); do
    echo "${records[idx]}"
  done
}

list_slow_iter_records >> tmp.txt

#Clean data from collected erroneous executions

# Removes every repetition listed in the file given as $1 (default tmp.txt)
# from the result files of its run, saving the removed lines in errores.txt.
# Each line of the list is "run:first_line:last_line", with the line numbers
# referring to R<run>_Global.out.
# Reads the globals exec_total_lines, iter_total_lines and total_groups.
clean_erroneous_records() {
  local list_file=${1:-tmp.txt}
  local run first_line last_line group

  # The same repetition can be listed more than once (for instance, once per
  # group file), and records are not globally ordered. Deleting a record moves
  # every line below it, so the list is de-duplicated and each run is handled
  # from its last record to its first one.
  while IFS=: read -r run first_line last_line; do
    [[ -n "$run" ]] || continue

    echo "Run $run had an erroneous execution, cleaning bad data."
    echo "Run$run----------------------------------------------" >> errores.txt

    #1 - Delete erroneous lines in Global file
    sed -n "${first_line},${last_line}p" "R${run}_Global.out" >> errores.txt
    sed -i "${first_line},${last_line}d" "R${run}_Global.out"

    #2 - Translate line numbers to Local files type
    first_line=$(( first_line / exec_total_lines ))
    first_line=$(( first_line * iter_total_lines + 1 ))
    last_line=$(( first_line + iter_total_lines - 1 ))

    #3 - Delete erroneous lines in Local files
    for ((group = 0; group < total_groups; group++)); do
      sed -n "${first_line},${last_line}p" R"${run}"_G"${group}"* >> errores.txt
      sed -i "${first_line},${last_line}d" R"${run}"_G"${group}"*
    done
    echo "--------------------------------------------------" >> errores.txt
  done < <(sort -t: -u -k1,1n -k2,2nr "$list_file")
}

# Something has to be cleaned only when at least one record was collected.
if [[ -s tmp.txt ]]; then
  echo "Found minor execution errors. Executing again. Review file errores.txt."
  echo "CHECKRUN -- Found errors" >> errores.txt
  clean_erroneous_records tmp.txt
fi

Iker Martín Álvarez's avatar
Iker Martín Álvarez committed
162
163
164
#Check if all repetitions for each Run have been executed
#If any run lacks repetitions, the job is automatically launched again
#A run without a results file is launched with all its repetitions
# NOTE(review): the previous comment suggested that runs without any executed
# repetition should not be launched because they could still be in the waiting
# queue, but the code does launch them -- confirm the intended behaviour.

# Prints how many repetitions of run $1 are still missing: totalEjGrupo minus
# the number of T_total lines found in R<run>_Global.out (all of them when the
# file does not exist). The value is negative when the file holds more
# repetitions than expected.
count_missing_repetitions() {
  local run=$1
  local done_reps=0
  if [[ -f "R${run}_Global.out" ]]; then
    done_reps=$(grep -c T_total "R${run}_Global.out")
  fi
  echo $(( totalEjGrupo - done_reps ))
}

qty_missing=0
use_extrae=0
for ((run = 0; run < maxIndex; run++)); do
  diff=$(count_missing_repetitions "$run")

  if [[ ! -f "R${run}_Global.out" ]]; then
    echo "Run$run results not found -- Trying to execute"
  elif (( diff > 0 )); then
    echo "Run$run lacks $diff repetitions"
  elif (( diff < 0 )); then
    # A surplus used to be submitted to sbatch as a negative amount of
    # repetitions and subtracted from the total; it is now only reported.
    echo "Run$run has $(( -diff )) more repetitions than expected -- not launching"
    diff=0
  fi

  if (( diff > 0 )); then #Execute if needed
    qty_missing=$(( qty_missing + diff ))

    #1 - Time limit in minutes, only when a max time per execution (seconds) was given
    if [[ "$limit_time_exec" -ne 0 ]]; then
      limit_time=$(( limit_time_exec * diff / 60 + 1 ))
    fi

    #2 - Obtain number of nodes needed
    config_file="$common_name$run.ini"
    node_qty=$(bash "$PROTEO_HOME$execDir/BashScripts/getMaxNodesNeeded.sh" "$config_file" "$cores")

    #3 - Launch execution
    sbatch -p "$partition" -N "$node_qty" -t "${limit_time:-0}" \
      "$PROTEO_HOME$execDir/generalRun.sh" "$cores" "$config_file" "$use_extrae" "$run" "$diff"
  fi
done

Iker Martín Álvarez's avatar
Iker Martín Álvarez committed
202
# Final verdict: SUCCESS when no repetition had to be launched again,
# otherwise report how many executions are being repeated.
if [ "$qty_missing" -eq 0 ]; then
  echo "SUCCESS"
else
  echo "REPEATING - A total of $qty_missing executions are being repeated"
fi