Commit a2bef31c authored by iker_martin
Browse files

Major refactor in Checkrun.sh: added a description; allows a time limit for...

Major refactor in Checkrun.sh: added a description; allows a time limit for launched jobs; errors detected by the script are now saved in errores.txt so users can review them; now calls auxiliary scripts for common steps; other minor changes.
parent fa54937a
#!/bin/bash #!/bin/bash
dir="/home/martini/malleability_benchmark/" dir="/home/martini/malleability_benchmark/"
cores=20 partition="P1"
# Checks if all the runs in the current working directory performed under a # Checks if all the runs in the current working directory performed under a
# Slurm manager have been performed correctly and if some runs can be corrected # Slurm manager have been performed correctly and if some runs can be corrected
# they are launched again # they are launched again
# Parameter 1 - Maximum index of the runs # Parameter 1 - Common name of the configuration files
# Parameter 2 - Amount of repetitions per index/run # Parameter 2 - Maximum index of the runs
# Parameter 3 - Total stages in all runs. #FIXME The amount of stages must be equal across all the runs, must be modified in the future. # Parameter 3 - Amount of repetitions per index/run
# Parameter 4 - Total groups of processes in all runs #FIXME The amount of groups must be equal across all the runs, must be modified in the future. # Parameter 4 - Total stages in all runs. #FIXME The amount of stages must be equal across all the runs, must be modified in the future.
# Parameter 5 - Maximum valid iteration time across all runs. If an iteration time # Parameter 5 - Total groups of processes in all runs #FIXME The amount of groups must be equal across all the runs, must be modified in the future.
# Parameter 6 - Maximum valid iteration time across all runs. If an iteration time
# is higher, that particular repetition inside the run is cleaned and # is higher, that particular repetition inside the run is cleaned and
# launched again. # launched again.
# Parameter 7(Optional) - Maximum amount of time in seconds needed by a single execution. Default value is 0, which indicates infinite time.
# Must be a positive integer.
#====== Do not modify the following values ======= #====== Do not modify the following values =======
codeDir="Codes/" codeDir="Codes/"
execDir="Exec/" execDir="Exec/"
ResultsDir="Results/" ResultsDir="Results/"
cores=$(bash $dir$execDir/BashScripts/getCores.sh $partition)
maxIndex=$1 if [ "$#" -lt "6" ]
totalEjGrupo=$2 #Total de ejecuciones por grupo then
total_stages=$3 echo "Not enough arguments"
total_groups=$4 echo "Usage -> bash CheckRun Common_Name maxIndex total_repetitions total_groups total_stages max_iteration_time [limit_time]"
maxTime=$5 #Maximo tiempo que se considera válido exit -1
fi
common_name=$1
maxIndex=$2
totalEjGrupo=$3 #Total de ejecuciones por grupo
total_stages=$4
total_groups=$5
maxTime=$6 #Maximo tiempo que se considera válido
limit_time=$((0))
if [ $# -ge 7 ] #Max time per execution in seconds
then
limit_time=$(($7*$qty/60+1))
fi
exec_lines_basic=6 exec_lines_basic=6
iter_lines_basic=3 iter_lines_basic=3
exec_total_lines=$(($exec_lines_basic+$total_stages+$total_groups)) exec_total_lines=$(($exec_lines_basic+$total_stages+$total_groups))
iter_total_lines=$(($iter_lines_basic+$total_stages*2+1)) iter_total_lines=$(($iter_lines_basic+$total_stages*2+1))
exec_remove=$(($exec_lines_basic+$total_stages+$total_groups-1)) exec_remove=$(($exec_lines_basic+$total_stages+$total_groups-1))
iter_remove=$(($iter_lines_basic+$total_stages-1)) iter_remove=$(($iter_lines_basic+$total_stages))
if [ "$#" -lt "5" ]
then
echo "Not enough arguments"
echo "Usage -> bash CheckRun maxIndes total_repetitions total_groups total_stages max_iteration_time"
exit -1
fi
#Check if there are fatal errors during executions #Check if there are fatal errors during executions
grep -i -e fatal -e error -e abort -e == slurm* > errores2.txt grep -i -e fatal -e error -e abort -e == slurm* > errores2.txt
...@@ -46,13 +58,14 @@ if [ "$qty" -gt "0" ] ...@@ -46,13 +58,14 @@ if [ "$qty" -gt "0" ]
then then
echo "Found Fatal errors during execution. Aborting" echo "Found Fatal errors during execution. Aborting"
echo "Read file errors2 to see the errors and in which files" echo "Read file errors2 to see the errors and in which files"
echo "FAILURE"
exit -2 exit -2
fi fi
#Check if the number of output files is correct. #Check if the number of output files is correct.
#If the number is not correct is a fatal error and the user #If the number is not correct is a fatal error and the user
# is informed in which runs the amount does not match, and # is informed in which runs the amount does not match, and
# then the scripts exit. # then the scripts exits.
#The user must figure out what to do with those runs. #The user must figure out what to do with those runs.
qtyG=$(ls R*_Global.out | wc -l) qtyG=$(ls R*_Global.out | wc -l)
qtyG=$(($qtyG * 2)) qtyG=$(($qtyG * 2))
...@@ -75,21 +88,22 @@ else ...@@ -75,21 +88,22 @@ else
echo "Files do not match at Run $i -- diff=$diff" >> errores2.txt echo "Files do not match at Run $i -- diff=$diff" >> errores2.txt
fi fi
done done
echo "FAILURE"
exit -1 exit -1
fi fi
rm errores2.txt rm errores2.txt
# Check if there is any negative execution time # Check if there is any negative execution time
# Only invalid IDs are stored # Only invalid IDs are stored
rm -f errores.txt rm -f tmp.txt
touch errores.txt touch tmp.txt
exec_ids=($(grep -n "T_total" R*_Global.out | grep - | cut -d '_' -f1 | cut -d 'R' -f2)) exec_ids=($(grep -n "T_total" R*_Global.out | grep - | cut -d '_' -f1 | cut -d 'R' -f2))
exec_line=($(grep -n "T_total" R*_Global.out | grep - | cut -d ':' -f2)) exec_line=($(grep -n "T_total" R*_Global.out | grep - | cut -d ':' -f2))
for ((i=${#exec_ids[@]}-1; i>=0; i--)) for ((i=${#exec_ids[@]}-1; i>=0; i--))
do do
first_line=$((${exec_line[$i]}-$exec_remove)) first_line=$((${exec_line[$i]}-$exec_remove))
last_line=$(($first_line+$exec_total_lines-1)) last_line=$(($first_line+$exec_total_lines-1))
echo "${exec_ids[$i]}:$first_line:$last_line" >> errores.txt echo "${exec_ids[$i]}:$first_line:$last_line" >> tmp.txt
done done
# Check if there is any iter time higher than expected # Check if there is any iter time higher than expected
...@@ -107,25 +121,29 @@ do ...@@ -107,25 +121,29 @@ do
first_line=$(($first_line/$iter_total_lines)) first_line=$(($first_line/$iter_total_lines))
first_line=$(($first_line*$exec_total_lines+1)) first_line=$(($first_line*$exec_total_lines+1))
last_line=$(($first_line+$exec_total_lines-1)) last_line=$(($first_line+$exec_total_lines-1))
echo "${iter_ids[$i]}:$first_line:$last_line" >> errores.txt echo "${iter_ids[$i]}:$first_line:$last_line" >> tmp.txt
fi fi
done done
#Clean data from collected erroneous executions #Clean data from collected erroneous executions
qty=$(wc -l errores.txt | cut -d ' ' -f1) qty=$(wc -l tmp.txt | cut -d ' ' -f1)
if [ "$qty" -gt 0 ]; if [ "$qty" -gt 0 ];
then then
echo "Se han encontrado errores de ejecución leves. Volviendo a ejecutar" echo "Found minor execution errors. Executing again. Review file errores.txt."
echo "CHECKRUN -- Found errors" >> errores.txt
while IFS="" read -r lineRun || [ -n "$lineRun" ] while IFS="" read -r lineRun || [ -n "$lineRun" ]
do do
#Obtain data of erroneous execution #Obtain data of erroneous execution
run=$(echo $lineRun | cut -d ':' -f1) run=$(echo $lineRun | cut -d ':' -f1)
echo "Run $run had an erroneous execution, cleaning bad data." echo "Run $run had an erroneous execution, cleaning bad data."
echo "Run$run----------------------------------------------" >> errores.txt
#1 - Delete erroneous lines in Global file #1 - Delete erroneous lines in Global file
first_line=$(echo $lineRun | cut -d ':' -f2) first_line=$(echo $lineRun | cut -d ':' -f2)
last_line=$(echo $lineRun | cut -d ':' -f3) last_line=$(echo $lineRun | cut -d ':' -f3)
sed -n ''$first_line','$last_line'p' R${run}_Global.out >> errores.txt
sed -i ''$first_line','$last_line'd' R${run}_Global.out sed -i ''$first_line','$last_line'd' R${run}_Global.out
#2 - Translate line numbers to Local files type #2 - Translate line numbers to Local files type
...@@ -135,10 +153,12 @@ then ...@@ -135,10 +153,12 @@ then
#3 - Delete erroneous lines in Local files #3 - Delete erroneous lines in Local files
for ((j=0; j<total_groups; j++)); for ((j=0; j<total_groups; j++));
do do
sed -n ''$first_line','$last_line'p' R${run}_G${j}* >> errores.txt
sed -i ''$first_line','$last_line'd' R${run}_G${j}* sed -i ''$first_line','$last_line'd' R${run}_G${j}*
done done
echo "--------------------------------------------------" >> errores.txt
done < errores.txt done < tmp.txt
fi fi
#Check if all repetitions for each Run have been executed #Check if all repetitions for each Run have been executed
...@@ -152,43 +172,24 @@ do ...@@ -152,43 +172,24 @@ do
qtyEx=$(grep T_total R"$run"_Global.out | wc -l) qtyEx=$(grep T_total R"$run"_Global.out | wc -l)
if [ "$qtyEx" -ne "$totalEjGrupo" ]; if [ "$qtyEx" -ne "$totalEjGrupo" ];
then then
#1 - Obtain config file name and repetitions to perform
diff=$(($totalEjGrupo-$qtyEx)) diff=$(($totalEjGrupo-$qtyEx))
qty_missing=$(($qty_missing+$diff)) qty_missing=$(($qty_missing+$diff))
config_file="config$run.ini" config_file="$common_name$run.ini"
#1 - Obtain maximum number of processes for the run #2 - Obtain number of nodes needed
max_numP=-1 node_qty=$(bash $dir$execDir/BashScripts/getMaxNodesNeeded.sh $config_file $cores)
for ((j=0; j<total_groups; j++));
do
resize_info=$(grep "\[resize$j\]" -n $config_file | cut -d ":" -f1)
first_line=$(echo $resize_info | cut -d " " -f1)
last_line=$(echo $resize_info | cut -d " " -f2)
range_lines=$(( last_line - first_line ))
numP=$(head -$last_line $config_file | tail -$range_lines | cut -d ';' -f1 | grep Procs | cut -d '=' -f2)
if [ "$numP" -gt "$max_numP" ];
then
max_numP=$numP
fi
done
#3 - Obtain needed nodes for the number of processes
node_qty=$(($max_numP / $cores))
if [ "$node_qty" -eq "0" ];
then
node_qty=1
fi
#3 - Launch execution #3 - Launch execution
echo "Run$run lacks $diff repetitions" echo "Run$run lacks $diff repetitions"
use_extrae=0 use_extrae=0
sbatch -N $node_qty $dir$execDir./generalRun.sh $dir $config_file $use_extrae $run $diff sbatch -p $partition -N $node_qty -t $limit_time $dir$execDir./generalRun.sh $dir $cores $config_file $use_extrae $run $diff
fi fi
else else
echo "File R${run}_Global.out does not exist -- Could it be it must still be executed?" echo "File R${run}_Global.out does not exist -- Could it be it must still be executed?"
fi fi
done done
if [ "$qty_missing" -eq "0" ]; if [ "$qty_missing" -eq "0" ];
then then
echo "SUCCESS" echo "SUCCESS"
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment