Commit 32a4ca93 authored by Iker Martín Álvarez's avatar Iker Martín Álvarez
Browse files

Merge branch 'malleability-refactor' into 'dev'

Malleability focus refactor of Proteo

See merge request martini/malleability_benchmark!5
parents f1511cb4 06573694
#!/bin/bash
# Test-point launcher header for Extrae-instrumented runs.
# NOTE(review): hard-coded dir below looks superseded by the sourced
# config.txt, which presumably also defines $dir — confirm before removing.
dir="/home/martini/malleability_benchmark"
scriptDir="$(dirname "$0")"
# Load installation paths (expected to define at least $dir).
source $scriptDir/../../Codes/build/config.txt
codeDir="/Codes/build"
# Extrae reads its instrumentation setup from this XML file in the CWD.
export EXTRAE_CONFIG_FILE=extrae.xml
......
......@@ -8,6 +8,8 @@ GENERAL_SECTION = "[general]"
RESIZE_SECTION = "[resize"
STAGE_SECTION = "[stage"
END_SECTION_DELIMITER = ";end"
DIFFERENT_VALUE_DELIMITER=':'
LIST_VALUE_DELIMITER=','
class Config_section(Enum):
INVALID=0
......@@ -21,6 +23,7 @@ class Config_section(Enum):
P_SDR="SDR"
P_ADR="ADR"
P_RIGID="Rigid"
P_CAPTURE_METHOD="Capture_Method"
P_STAGE_TYPE="Stage_Type"
P_STAGE_BYTES="Stage_Bytes"
......@@ -60,29 +63,39 @@ def is_a_stage_section(line):
return True
return False
def convert_to_number(number):
    """Best-effort conversion of *number* to int or float.

    Tries float first; floats with no fractional part are narrowed to int.
    Non-numeric strings are returned unchanged; any other unconvertible
    value yields None. Errors are reported but never raised (conversion
    failures are non-fatal by design).
    """
    res = None
    try:
        res = float(number)
    except (ValueError, TypeError):
        # TypeError added: float(None) etc. used to crash the parser.
        if isinstance(number, str):
            res = number  # keep raw string values (names, paths, ...)
        else:
            print("Unable to convert to number - Not a fatal error")
    # Narrow to int only when no information is lost. The original
    # int(number) re-parse truncated genuine floats passed directly
    # (e.g. 3.5 -> 3) and printed noise for every decimal string.
    if isinstance(res, float) and res.is_integer():
        res = int(res)
    return res
def process_line(line, data):
    """Parse a 'key=value' configuration line into *data*.

    A value may hold several DIFFERENT_VALUE_DELIMITER-separated
    alternatives (swapped between generated files); each alternative may
    itself be a LIST_VALUE_DELIMITER-separated list, stored as a tuple.
    Single alternatives / single elements are unwrapped to scalars.
    Returns True on success, False for unknown keys.
    """
    key, value = line.split('=')
    if not Config_section.has_key(key):
        print("Unknown parameter " + key)
        return False
    # Some keys have values that will be swapped between files.
    alternatives = value.split(DIFFERENT_VALUE_DELIMITER)
    for i in range(len(alternatives)):
        # Final config files could have multiple values for the same key.
        elements = [convert_to_number(e)
                    for e in alternatives[i].split(LIST_VALUE_DELIMITER)]
        # str.split always yields at least one element, so elements[0]
        # is safe here (the original relied on the stale loop index j).
        alternatives[i] = tuple(elements) if len(elements) > 1 else elements[0]
    data[key] = alternatives[0] if len(alternatives) == 1 else alternatives
    return True
......@@ -119,58 +132,64 @@ def process_file(file_name):
f.close()
return general_data,stages_data,resizes_data
def key_line_write(f, keys, values):
    """Write one 'key=value' line per key; tuple values are comma-joined."""
    for key, value in zip(keys, values):
        if type(value) == tuple:
            rendered = ",".join(str(item) for item in value)
        else:
            rendered = str(value)
        f.write(key + "=" + rendered + "\n")
def general_section_write(f, general_data):
    """Write the [general] section with all its key=value pairs to *f*."""
    f.write(GENERAL_SECTION + "\n")
    # key_line_write renders tuple values as comma-joined lists.
    # (The merged diff left a stale raw-write loop here that emitted
    # every pair twice.)
    key_line_write(f, list(general_data.keys()), list(general_data.values()))
    f.write(END_SECTION_DELIMITER + " " + GENERAL_SECTION + "\n")
def stage_section_write(f, stage_data, section_index):
    """Write one [stageN] section with all its key=value pairs to *f*."""
    f.write(STAGE_SECTION + str(section_index) + "]\n")
    # key_line_write renders tuple values as comma-joined lists.
    # (The merged diff left a stale raw-write loop here that emitted
    # every pair twice.)
    key_line_write(f, list(stage_data.keys()), list(stage_data.values()))
    f.write(END_SECTION_DELIMITER + " " + STAGE_SECTION + str(section_index) + "]\n")
def resize_section_write(f, resize_data, section_index):
    """Write one [resizeN] section with all its key=value pairs to *f*."""
    f.write(RESIZE_SECTION + str(section_index) + "]\n")
    # key_line_write renders tuple values as comma-joined lists.
    # (The merged diff left a stale raw-write loop here that emitted
    # every pair twice.)
    key_line_write(f, list(resize_data.keys()), list(resize_data.values()))
    f.write(END_SECTION_DELIMITER + " " + RESIZE_SECTION + str(section_index) + "]\n")
def write_output_file(datasets, common_output_name, output_index):
    """Dump *datasets* to '<common_output_name><output_index>.ini'.

    datasets[0] is the general-section dict, followed by total_stages
    stage dicts and total_groups resize/group dicts, in that order.
    """
    file_name = common_output_name + str(output_index) + ".ini"
    total_stages = int(datasets[0][Config_section.P_TOTAL_STAGES.value])
    # A configuration with R resizes describes R+1 process groups.
    # (The merged diff left both the old and new loop headers stacked,
    # which nested the loops and wrote resize sections repeatedly.)
    total_groups = int(datasets[0][Config_section.P_TOTAL_RESIZES.value]) + 1
    with open(file_name, "w") as f:  # 'with' closes the file even on error
        general_section_write(f, datasets[0])
        for i in range(total_stages):
            stage_section_write(f, datasets[i + 1], i)
        for i in range(total_groups):
            resize_section_write(f, datasets[i + 1 + total_stages], i)
def check_sections_assumptions(datasets):
total_resizes=int(datasets[0][Config_section.P_TOTAL_RESIZES.value])+1
total_groups=int(datasets[0][Config_section.P_TOTAL_RESIZES.value])+1
total_stages=int(datasets[0][Config_section.P_TOTAL_STAGES.value])
adr = datasets[0][Config_section.P_ADR.value]
for i in range(total_resizes):
#Not valid if trying to use thread strategy and adr(Async data) is 0
if adr==0 and (datasets[total_stages+1+i][Config_section.P_RESIZE_SPAWN_STRATEGY.value] == 2 or datasets[total_stages+1+i][Config_section.P_RESIZE_REDISTRIBUTION_STRATEGY.value] == 2):
return False
#Not valid if the strategies are different
if datasets[total_stages+1+i][Config_section.P_RESIZE_SPAWN_STRATEGY.value] != datasets[total_stages+1+i][Config_section.P_RESIZE_REDISTRIBUTION_STRATEGY.value]:
return False
for i in range(total_groups):
#Not valid if resize is to the same amount of processes
if i>0:
if datasets[total_stages+1+i][Config_section.P_RESIZE_PROCS.value] == datasets[total_stages+i][Config_section.P_RESIZE_PROCS.value]:
......@@ -224,14 +243,9 @@ def create_output_files(common_output_name, general_data, resize_data, stage_dat
datasets.append(dataset)
write_datasets.append(dataset.copy())
directory = "/Desglosed-" + str(date.today())
path = os.getcwd() + directory
os.mkdir(path, mode=0o775)
os.chdir(path)
lists=[] # Stores lists of those variables with multiple values
keys=[] # Stores keys of those variables with multiple values
indexes=[] # Stores actual index for each variable with multiple values
indexes=[] # Stores actual index for each variable with multiple values. Always starts at 0.
mindexes=[] # Stores len of lists of each variable with multiple values
ds_indexes=[] # Stores the index of the dataset where the variable is stored
#For each variable with a list of elements
......@@ -246,6 +260,10 @@ def create_output_files(common_output_name, general_data, resize_data, stage_dat
indexes.append(0)
mindexes.append(len(values_aux[j]))
directory = "/Desglosed-" + str(date.today())
path = os.getcwd() + directory
os.mkdir(path, mode=0o775)
os.chdir(path)
#Get the first set of values
for i in range(len(lists)):
......@@ -259,7 +277,8 @@ def create_output_files(common_output_name, general_data, resize_data, stage_dat
output_index=0
adr_corrected=False
while True:
finished = False
while not finished:
if(check_sections_assumptions(write_datasets)):
write_output_file(write_datasets, common_output_name, output_index)
# for i in range(len(write_datasets)):
......@@ -267,9 +286,7 @@ def create_output_files(common_output_name, general_data, resize_data, stage_dat
# print("\n\n\n------------------------------------------" + str(output_index) + " ADR=" + str(adr_corrected))
output_index+=1
finished = read_parameter(0)
if finished:
break
#=====================================================
if(len(sys.argv) < 3):
print("Not enough arguments given.\nExpected usage: python3 read_multiple.py file.ini output_name")
......
#!/bin/bash
# Test-point launcher that runs the benchmark binary under Valgrind.
scriptDir="$(dirname "$0")"
# Load installation paths; presumably defines $dir used below — verify.
source $scriptDir/../../Codes/build/config.txt
codeDir="/Codes/build"
# Full leak check; one log per process (%p expands to the PID).
valgrind --leak-check=full --show-leak-kinds=all --track-origins=yes --log-file=vg.tp.%p $dir$codeDir/./a.out
......@@ -8,7 +8,7 @@
# Parameter 1 - Base directory of the malleability benchmark
# Parameter 2 - Number of cores in a single machine
# Parameter 3 - Configuration file name for the emulation.
# Parameter 4 - Use Extrae(1) or not(0).
# Parameter 4 - Use Valgrind(1), Extrae(2) or nothing(0).
# Parameter 5 - Index to use for the output files. Must be a positive integer.
# Parameter 6 - Amount of executions per file. Must be a positive number.
#====== Do not modify these values =======
......@@ -22,7 +22,7 @@ echo "START TEST"
#$1 == baseDir
#$2 == cores
#$3 == configFile
#$4 == use_extrae
#$4 == use_external
#$5 == outFileIndex
#$6 == qty
......@@ -37,7 +37,7 @@ fi
dir=$1
cores=$2
configFile=$3
use_extrae=$4
use_external=$4
outFileIndex=$5
qty=1
if [ $# -ge 5 ]
......@@ -46,36 +46,42 @@ then
fi
nodelist=$SLURM_JOB_NODELIST
nodes=$SLURM_JOB_NUM_NODES
if [ -z "$nodelist" ];
then
echo "Internal ERROR in generalRun.sh - Nodelist not provided"
exit -1
fi
if [ -z "$nodes" ];
then
nodes=1
fi
numP=$(bash $dir$execDir/BashScripts/getNumPNeeded.sh $configFile 0)
initial_nodelist=$(bash $dir$execDir/BashScripts/createInitialNodelist.sh $numP $cores $nodelist)
#EXECUTE RUN
# use_external selects the wrapper: 0 = plain run, 1 = Valgrind, other = Extrae.
echo "Nodes=$nodelist"
if [ $use_external -eq 0 ] #NORMAL
then
  for ((i=0; i<qty; i++))
  do
    echo "Run $i starts"
    mpirun -hosts $initial_nodelist -np $numP $dir$codeDir/a.out $configFile $outFileIndex
    echo "Run $i ends"
  done
elif [ $use_external -eq 1 ] #VALGRIND
then
  cp $dir$execDir/Valgrind/worker_valgrind.sh .
  for ((i=0; i<qty; i++))
  do
    echo "Run $i starts"
    # FIX: last argument was "$outIndex", which is never defined in this
    # script (the variable is outFileIndex) — the app received an empty arg.
    mpirun -hosts $initial_nodelist -np $numP valgrind --leak-check=full --show-leak-kinds=all --track-origins=yes --trace-children=yes --log-file=vg.sp.%p.$SLURM_JOB_ID.$i $dir$codeDir/a.out $configFile $outFileIndex
    echo "Run $i ends"
  done
else #EXTRAE
  # Extrae needs its config and tracing helper scripts in the CWD.
  cp $dir$execDir/Extrae/extrae.xml .
  cp $dir$execDir/Extrae/trace.sh .
  cp $dir$execDir/Extrae/trace_worker.sh .
  cp $dir$execDir/Extrae/worker_extrae.sh .
  for ((i=0; i<qty; i++))
  do
    #FIXME Extrae not tested keeping in mind the initial nodelist - Could have some errors
    srun -n$numP --mpi=pmi2 ./trace.sh $dir$codeDir/a.out $configFile $outFileIndex
  done
fi
......
......@@ -19,7 +19,7 @@ echo "START TEST"
#$1 == baseDir
#$2 == cores
#$3 == configFile
#$4 == use_extrae
#$4 == use_external
#$5 == outFileIndex
#$6 == qty
......@@ -34,13 +34,13 @@ fi
dir=$1
cores=$2
configFile=$3
use_extrae=0
use_external=0
outFileIndex=0
qty=1
if [ $# -ge 4 ]
then
use_extrae=$4
use_external=$4
fi
if [ $# -ge 5 ]
......@@ -55,7 +55,6 @@ fi
numP=$(bash $dir$execDir/BashScripts/getNumPNeeded.sh $configFile 0)
nodelist=$SLURM_JOB_NODELIST
nodes=$SLURM_JOB_NUM_NODES
if [ -z "$nodelist" ];
then
nodelist="localhost"
......@@ -63,26 +62,33 @@ then
else
initial_nodelist=$(bash $dir$execDir/BashScripts/createInitialNodelist.sh $numP $cores $nodelist)
fi
if [ -z "$nodes" ];
then
nodes=1
fi
#EXECUTE RUN
# use_external selects the wrapper: 0 = plain run, 1 = Valgrind, other = Extrae.
echo "Nodes=$nodelist"
if [ $use_external -eq 0 ]
then
  for ((i=0; i<qty; i++))
  do
    echo "Run $i starts"
    mpirun -hosts $initial_nodelist -np $numP $dir$codeDir/a.out $configFile $outFileIndex
    echo "Run $i ends"
  done
elif [ $use_external -eq 1 ] #VALGRIND
then
  cp $dir$execDir/Valgrind/worker_valgrind.sh .
  for ((i=0; i<qty; i++))
  do
    echo "Run $i starts"
    # FIX: last argument was "$outIndex", which is never defined in this
    # script (the variable is outFileIndex) — the app received an empty arg.
    mpirun -hosts $initial_nodelist -np $numP valgrind --leak-check=full --show-leak-kinds=all --track-origins=yes --trace-children=yes --log-file=vg.sp.%p.$SLURM_JOB_ID.$i $dir$codeDir/a.out $configFile $outFileIndex
    echo "Run $i ends"
  done
else
  # Extrae needs its config and tracing helper scripts in the CWD.
  cp $dir$execDir/Extrae/extrae.xml .
  cp $dir$execDir/Extrae/trace.sh .
  cp $dir$execDir/Extrae/trace_worker.sh .
  cp $dir$execDir/Extrae/worker_extrae.sh .
  for ((i=0; i<qty; i++))
  do
    mpirun -hosts $initial_nodelist -np $numP ./trace.sh $dir$codeDir/a.out $configFile $outFileIndex
  done
fi
......
#!/bin/bash
dir="/home/martini/malleability_benchmark/"
# Creates a directory with all possible and valid combinations of configuration files
# that can be created from a given complex configuration file.
# Parameter 1: Complex configuration file name.
# Parameter 2: Common output name of the output configuration files. It will be appended an index to each of them.
#====== Do not modify these values =======
codeDir="Codes/"
execDir="Exec/"
ResultsDir="Results/"
scriptDir="$(dirname "$0")"
source $scriptDir/../Codes/build/config.txt
codeDir="/Codes"
execDir="/Exec"
ResultsDir="/Results"
complex_file=$1
output_name=$2
......
#!/bin/bash
dir="/home/martini/malleability_benchmark"
partition="P1"
exclude="c00,c01,c02"
......@@ -9,6 +8,8 @@ exclude="c00,c01,c02"
# Parameter 2(Optional) - Maximum amount of time in seconds needed by a single execution. Default value is 0, which indicates infinite time. Must be a positive integer.
#====== Do not modify these values =======
scriptDir="$(dirname "$0")"
source $scriptDir/../Codes/build/config.txt
codeDir="/Codes/build"
execDir="/Exec"
ResultsDir="/Results"
......
#!/bin/bash
dir="/home/martini/malleability_benchmark"
partition="P1"
exclude="c00,c01,c02"
......@@ -9,11 +8,13 @@ exclude="c00,c01,c02"
# Parameter 1: Configuration file name for the emulation.
# Parameter 2(Optional): Index to use for the output files. Must be a positive integer.
# Parameter 3(Optional): Number of repetitions to perform. Must be a positive integer.
# Parameter 4(Optional): Use Extrae(1) or not(0).
# Parameter 4(Optional): Use Valgrind(1), Extrae(2) or nothing(0).
# Parameter 5(Optional): Maximum amount of time in seconds needed by a single execution. Default value is 0, which indicates infinite time. Must be a positive integer.
# Parameter 6(Optional): Path where the output files should be saved.
#====== Do not modify these values =======
scriptDir="$(dirname "$0")"
source $scriptDir/../Codes/build/config.txt
codeDir="/Codes/build"
execDir="/Exec"
ResultsDir="/Results"
......@@ -29,14 +30,14 @@ fi
#$1 == configFile
#$2 == outFileIndex
#$3 == Qty of repetitions
#$4 == Use extrae NO(0) YES(1)
#$4 == Use external NO(0) Valgrind(1), Extrae(2)
#$5 == Max time per execution(s)
#$6 == Output path
config_file=$1
outFileIndex=0
qty=1
use_extrae=0
use_external=0
if [ $# -ge 2 ]
then
......@@ -48,7 +49,7 @@ then
fi
if [ $# -ge 4 ]
then
use_extrae=$4
use_external=$4
fi
limit_time=$((0))
if [ $# -ge 5 ] #Max time per execution in seconds
......@@ -63,7 +64,7 @@ fi
#Obtain the amount of nodes needed
node_qty=$(bash $dir$execDir/BashScripts/getMaxNodesNeeded.sh $config_file $dir $cores)
#Run with the expected amount of nodes
sbatch -p $partition --exclude=$exclude -N $node_qty -t $limit_time $dir$execDir/generalRun.sh $dir $cores $config_file $use_extrae $outFileIndex $qty
sbatch -p $partition --exclude=$exclude -N $node_qty -t $limit_time $dir$execDir/generalRun.sh $dir $cores $config_file $use_external $outFileIndex $qty
if ! [ -z "$output" ]
then
......@@ -71,10 +72,13 @@ then
echo "Moving data to $output\nMoved files:"
ls R${outFileIndex}_G*
mv R${outFileIndex}_G* $output
if [ "$use_extrae" -eq 1 ]
if [ "$use_external" -eq 2 ] # Extrae additional output
then
mv a.out.* $output
mv TRACE* $output
mv set-0/ $output
elif [ "$use_external" -eq 1 ] # Valgrind additional output
then
mv vg.* $output
fi
fi
#!/bin/bash
dir="/home/martini/malleability_benchmark"
cores=20
# Executes a given configuration file. This script can be called with Slurm commands to
......@@ -8,10 +7,12 @@ cores=20
# Parameter 1: Configuration file name for the emulation.
# Parameter 2(Optional): Index to use for the output files. Must be a positive integer.
# Parameter 3(Optional): Number of repetitions to perform. Must be a positive integer.
# Parameter 4(Optional): Use Extrae(1) or not(0).
# Parameter 4(Optional): Use Valgrind(1), Extrae(2) or nothing(0).
# Parameter 5(Optional): Path where the output files should be saved.
#====== Do not modify these values =======
scriptDir="$(dirname "$0")"
source $scriptDir/../Codes/build/config.txt
codeDir="/Codes/build"
execDir="/Exec"
ResultsDir="/Results"
......@@ -26,13 +27,13 @@ fi
#$1 == configFile
#$2 == outFileIndex
#$3 == Qty of repetitions
#$4 == Use extrae NO(0) YES(1)
#$4 == Use external NO(0) Valgrind(1), Extrae(2)
#$5 == Output path
config_file=$1
outFileIndex=0
qty=1
use_extrae=0
use_external=0
if [ $# -ge 2 ]
then
......@@ -44,14 +45,14 @@ then
fi
if [ $# -ge 4 ]
then
use_extrae=$4
use_external=$4
fi
if [ $# -ge 5 ]
then
output=$5
fi
bash $dir$execDir/generalRunCostum.sh $dir $cores $config_file $use_extrae $outFileIndex $qty
bash $dir$execDir/generalRunCostum.sh $dir $cores $config_file $use_external $outFileIndex $qty
if ! [ -z "$output" ]
then
......@@ -59,10 +60,13 @@ then
echo "Moving data to $output\nMoved files:"
ls R${outFileIndex}_G*
mv R${outFileIndex}_G* $output
if [ "$use_extrae" -eq 1 ]
if [ "$use_external" -eq 2 ] # Extrae additional output
then
mv a.out.* $output
mv TRACE* $output
mv set-0/ $output
elif [ "$use_external" -eq 1 ] # Valgrind additional output
then
mv vg.* $output
fi
fi
# malleability_benchmark
# Proteo - Dev branch
Benchmark for a MPI malleable application
\ No newline at end of file
## Overview
This branch contains the codebase used for Proteo developing branch.
## Branch Structure
This branch is divided into the following 4 directories:
- **Analysis**: Contains the scripts and notebook to perform analysis of Proteo executions.
- **Codes**: Contains all the codes used to compile Proteo.
- **Exec**: Contains the scripts to execute Proteo in different ways and check if the runs have completed successfully.
- **Results**: Contains the configuration files used to emulate the malleable execution of the CG.
## Installation
### Prerequisites
Before installing, ensure you have the following prerequisites:
- MPI (MPICH) installed on your system. This code has been tested with MPICH versions 3.4.1 and 4.0.3 with the OFI netmod.
- Slurm is installed on your system. This code has been tested with slurm-wlm 19.05.5.
The following requisites are optional and only needed to process and analyse the data:
- Python 3(Optional). Only if you want to perform the post-mortem processing or analyse the data.
- Numpy 1.24.3(Optional). Only if you want to perform the post-mortem processing or analyse the data.
- Pandas 1.5.3(Optional). Only if you want to perform the post-mortem processing or analyse the data.
- Seaborn 0.12.2(Optional). Only if you want to analyse the data.
- Matplotlib 3.7.1(Optional). Only if you want to analyse the data.
- Scipy 1.10.1(Optional). Only if you want to analyse the data.
- scikit-posthocs 0.7.0(Optional). Only if you want to analyse the data.
### Steps
1. Clone the repository to your local machine:
```bash
$ git clone http://lorca.act.uji.es/gitlab/martini/malleability_benchmark.git
$ cd malleability_benchmark
$ git checkout JournalSupercomputing23/24
```
2. Compile the code using the `make` command:
```bash
$ cd Codes/
$ make install_slurm
```
This command compiles the code using the MPI (MPICH) library.
3. Test the installation:
```bash
$ cd ../Results
$ bash ../Exec/singleRun.sh test.ini
```
This test launches a Slurm job with a basic configuration file that performs a reconfiguration from 10 to 2 processes.
As soon as it ends, 4 files will appear, one is the slurm output, and the other 3 are Proteo's output.
Example of a successful run with expected output:
```bash
$ ls
R0_G0NP10ID0.out R0_G1NP2ID0.out R0_Global.out slurm-X.out
$ bash ../Exec/CheckRun.sh test 1 1 4 2 2 100
Number of G(2) and L(2) files match
SUCCESS
```
The slurm-X.out file is the output produced by the job, while the files beginning with an "R" are the output of Proteo, and their description can be found in the manual from this branch.
Lastly, the script CheckRun.sh indicates whether the execution has been performed correctly or not. The value should be SUCCESS or REPEATING; in either case Proteo has been compiled correctly. If the value is FAILURE, a major error appeared and it is recommended to contact the code maintainer.
### Clean Up
To clean the installation and remove compiled binaries, use:
```bash
$ make clean
```
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment