#!/bin/bash

# Slurm partition used by this script: it is handed to getCores.sh and to
# `sbatch -p` when repetitions are launched again.
partition="P1"

# Checks whether all the runs in the current working directory, performed under
# the Slurm workload manager, completed correctly. Runs that can be corrected
# are launched again.
# Parameter 1 - Common name of the configuration files
# Parameter 2 - Maximum index of the runs (exclusive: runs 0..maxIndex-1 are checked)
# Parameter 3 - Number of repetitions per index/run
# Parameter 4 - Total stages in all runs. #FIXME The number of stages must be equal across all the runs; this must be changed in the future.
# Parameter 5 - Total groups of processes in all runs. #FIXME The number of groups must be equal across all the runs; this must be changed in the future.
# Parameter 6 - Maximum valid iteration time across all runs. If an iteration time
#               is higher, that particular repetition inside the run is cleaned and
#               launched again.
# Parameter 7 (Optional) - Maximum time in seconds needed by a single execution. The default value is 0, which means no time limit.
#               Must be a non-negative integer.
#====== Do not modify the following values =======

# Load the build configuration that lives next to this script's directory.
# NOTE(review): $dir is used below but never assigned in this script;
# presumably config.txt defines it -- confirm.
scriptDir="$(dirname "$0")"
source "$scriptDir/../Codes/build/config.txt"

# Sub-directories of the project, relative to $dir.
codeDir="/Codes/"
execDir="/Exec/"
ResultsDir="/Results/"

# Value reported by getCores.sh for the selected partition.
cores=$(bash "$dir$execDir/BashScripts/getCores.sh" "$partition")

# Abort unless the six mandatory arguments were supplied.
# The usage line lists the arguments in the order they are actually read
# (stages is $4, groups is $5).
if [ "$#" -lt 6 ]
then
  echo "Not enough arguments"
  echo "Usage -> bash CheckRun Common_Name maxIndex total_repetitions total_stages total_groups max_iteration_time [limit_time]"
  exit -1
fi

# Positional arguments (see the header comment for their meaning).
common_name=$1
maxIndex=$2
totalEjGrupo=$3 # Total number of executions (repetitions) per run
total_stages=$4
total_groups=$5
maxTime=$6 # Maximum iteration time that is considered valid
# Optional: maximum time per execution in seconds; 0 when not given.
limit_time_exec=${7:-0}

# Time limit handed to `sbatch -t`; stays 0 unless a per-execution limit is given.
limit_time=0

# Layout of the output files, used to translate between matched line numbers
# and the block of lines that belongs to one execution.
exec_lines_basic=7
iter_lines_basic=3
# Number of lines of one execution in a Global file and in a local file.
exec_total_lines=$((exec_lines_basic + total_stages + total_groups))
iter_total_lines=$((iter_lines_basic + total_stages * 2 + 1))
# Offsets subtracted from the line number of a matched "T_total" / "T_iter"
# line when computing the first line of the execution it belongs to.
exec_remove=$((exec_lines_basic + total_stages + total_groups - 1))
iter_remove=$((iter_lines_basic + total_stages))

# Check if there are fatal errors during executions.
# Every line of the slurm* files that mentions fatal/error/abort (any case)
# or contains "==" is collected in errores2.txt; any hit aborts the script.
grep -i -e fatal -e error -e abort -e == slurm* > errores2.txt
qty=$(wc -l < errores2.txt)
if [ "$qty" -gt 0 ]
then
  echo "Found Fatal errors during execution. Aborting"
  echo "Read file errores2.txt to see the errors and in which files"
  echo "FAILURE"
  exit -2
fi

# Check if the number of output files is correct.
# A mismatch is a fatal error: the user is told in which runs the amounts do
# not match and the script exits. The user must figure out what to do with
# those runs.
# The files are counted through the positional parameters of a subshell with
# nullglob enabled, so an unmatched pattern counts as 0 and `ls` is not parsed.
qtyG=$(shopt -s nullglob; set -- R*_Global.out; echo "$#")
qtyG=$((qtyG * total_groups))
qtyL=$(shopt -s nullglob; set -- R*_G*N*.out; echo "$#")
if [ "$qtyG" == "$qtyL" ]
then
  echo "Number of G($qtyG) and L($qtyL) files match"
else
  echo "Lacking Local($qtyL) or global($qtyG) files. Aborting"
  echo "Lacking Local($qtyL) or global($qtyG) files. Aborting" > errores2.txt
  for ((i = 0; i < maxIndex; i++))
  do
    qtyEx=$(grep T_total R"$i"_Global.out | wc -l)
    qtyIt=$(grep T_iter R"$i"_G*N*.out | wc -l)
    # NOTE(review): the factor 2 is hard-coded; it looks like it should be
    # $total_groups, and diff below is computed from the doubled value -- confirm.
    qtyEx=$((qtyEx * 2))
    if [ "$qtyEx" -ne "$qtyIt" ]
    then
      diff=$((totalEjGrupo - qtyEx))
      echo "Files do not match at Run $i -- diff=$diff"
      echo "Files do not match at Run $i -- diff=$diff" >> errores2.txt
    fi
  done
  echo "FAILURE"
  exit -1
fi
# The report is only kept when the script aborts.
rm -f errores2.txt

# Check if there is any negative execution time.
# Only invalid IDs are stored: tmp.txt receives one "run:first_line:last_line"
# entry per execution in a Global file whose T_total line contains a '-'.
# grep -H forces the "file:line:" prefix even when only one Global file
# exists; without it the cut fields below would shift.
# NOTE(review): `grep -` matches any '-', so a value printed in exponent
# notation (e.g. 1e-05) would also be flagged -- confirm the output format.
: > tmp.txt
exec_ids=($(grep -Hn "T_total" R*_Global.out | grep - | cut -d '_' -f1 | cut -d 'R' -f2))
exec_line=($(grep -Hn "T_total" R*_Global.out | grep - | cut -d ':' -f2))
# Walk the matches backwards so the entries are written from last to first.
for ((i = ${#exec_ids[@]} - 1; i >= 0; i--))
do
  first_line=$((${exec_line[$i]} - exec_remove))
  last_line=$((first_line + exec_total_lines - 1))
  echo "${exec_ids[$i]}:$first_line:$last_line" >> tmp.txt
done

# Check if there is any iter time higher than expected.
# Only invalid IDs are stored: for every T_iter value above $maxTime a
# "run:first_line:last_line" entry (in Global-file line numbers) is appended
# to tmp.txt.
# grep -H forces the "file:" prefix even when only one local file exists, so
# the '_' and ':' based cut fields always refer to the same columns.
iter_times=($(grep -H "T_iter" R*_G*N*.out | cut -d ' ' -f2))
iter_ids=($(grep -H "T_iter" R*_G*N*.out | cut -d '_' -f1 | cut -d 'R' -f2))
iter_line=($(grep -Hn "T_iter" R*_G*N*.out | cut -d ':' -f2))
for ((i = ${#iter_times[@]} - 1; i >= 0; i--))
do
  # bc prints 1 when the comparison holds, 0 otherwise.
  is_invalid=$(echo "${iter_times[$i]}>$maxTime" | bc -l)
  if [ "$is_invalid" -eq 1 ]
  then
    first_line=$((${iter_line[$i]} - iter_remove))
    # Translate line number to Global file
    first_line=$((first_line / iter_total_lines))
    first_line=$((first_line * exec_total_lines + 1))
    last_line=$((first_line + exec_total_lines - 1))
    echo "${iter_ids[$i]}:$first_line:$last_line" >> tmp.txt
  fi
done

# Clean data from collected erroneous executions.
# Each tmp.txt entry is "run:first_line:last_line" in Global-file line numbers.
qty=0
if [ -f tmp.txt ]
then
  qty=$(wc -l < tmp.txt)
fi
if [ "$qty" -gt 0 ]
then
  echo "Found minor execution errors. Executing again. Review file errores.txt."
  echo "CHECKRUN -- Found errors" >> errores.txt

  # The same execution can be reported more than once, and the entries are
  # not globally ordered. Deleting a range shifts every later line of that
  # file, so entries are de-duplicated and processed from the bottom of each
  # run upwards; otherwise a repeated or shifted range would delete valid data.
  while IFS=':' read -r run first_line last_line
  do
    # Obtain data of erroneous execution
    echo "Run $run had an erroneous execution, cleaning bad data."
    echo "Run$run----------------------------------------------" >> errores.txt

    #1 - Delete erroneous lines in Global file (a copy is kept in errores.txt)
    sed -n "${first_line},${last_line}p" "R${run}_Global.out" >> errores.txt
    sed -i "${first_line},${last_line}d" "R${run}_Global.out"

    #2 - Translate line numbers to Local files type
    local_first=$((first_line / exec_total_lines))
    local_first=$((local_first * iter_total_lines + 1))
    local_last=$((local_first + iter_total_lines - 1))

    #3 - Delete erroneous lines in Local files
    for ((j = 0; j < total_groups; j++))
    do
      # -s numbers the lines of every file separately, as `sed -i` does, so
      # the lines logged are the ones that are deleted.
      sed -s -n "${local_first},${local_last}p" "R${run}_G${j}"* >> errores.txt
      sed -i "${local_first},${local_last}d" "R${run}_G${j}"*
    done
    echo "--------------------------------------------------" >> errores.txt
  done < <(sort -t ':' -u -k1,1n -k2,2nr tmp.txt)
fi

# Check if all repetitions for each Run have been executed.
# If any run lacks repetitions, the job is automatically launched again.
# If a run has not executed even one repetition (its Global file does not
# exist), it is not launched, as it could still be in the waiting queue.
qty_missing=0
for ((run = 0; run < maxIndex; run++))
do
  if [ ! -f "R${run}_Global.out" ]
  then
    echo "File R${run}_Global.out does not exist -- Could it be it must still be executed?"
    continue
  fi

  qtyEx=$(grep -c T_total "R${run}_Global.out")
  diff=$((totalEjGrupo - qtyEx))
  if [ "$diff" -eq 0 ]
  then
    continue
  fi
  if [ "$diff" -lt 0 ]
  then
    # More repetitions than requested: relaunching with a negative count makes
    # no sense and would hide missing repetitions of other runs in the total.
    echo "Run$run has $((-diff)) more repetitions than expected -- not launching"
    continue
  fi

  #1 - Obtain config file name and repetitions to perform
  qty_missing=$((qty_missing + diff))
  config_file="$common_name$run.ini"
  if [ "$limit_time_exec" -ne 0 ] # Max time per execution in seconds
  then
    # Seconds for all missing repetitions, converted to minutes and rounded up.
    limit_time=$((limit_time_exec * diff / 60 + 1))
  fi

  #2 - Obtain number of nodes needed
  node_qty=$(bash "$dir$execDir/BashScripts/getMaxNodesNeeded.sh" "$config_file" "$dir" "$cores")

  #3 - Launch execution
  echo "Run$run lacks $diff repetitions"
  use_extrae=0
  sbatch -p "$partition" -N "$node_qty" -t "$limit_time" "$dir$execDir./generalRun.sh" "$dir" "$cores" "$config_file" "$use_extrae" "$run" "$diff"
done

# Final verdict: SUCCESS only when no repetition had to be launched again.
if [ "$qty_missing" -eq 0 ]
then
  echo "SUCCESS"
else
  echo "REPEATING - A total of $qty_missing executions are being repeated"
fi