Commit 5cd121d6 authored by iker_martin

Hotfix -- Iterations were not being collected correctly in Merge Shrink

parent f4a8b977
@@ -71,6 +71,7 @@ int main(int argc, char *argv[]) {
   }

   init_group_struct(argv, argc, myId, numP);
+  //FIXME Does not work in OpenMPI
   im_child = init_malleability(myId, numP, ROOT, comm, argv[0], nodelist, num_cpus, num_nodes);

   if(!im_child) { //TODO REFACTOR Simplify initialization
@@ -249,7 +250,6 @@ void iterate(double *matrix, int n, int async_comm, int iter) {
   operations = time / Top; //FIXME Compute only once
   for(i=0; i < operations; i++) {
     aux += computePiSerial(n);
   }
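The FIXME in this hunk asks for operations = time / Top to be computed once rather than on every iterate() call. A minimal sketch of that hoisting, assuming time is the target duration of one iteration and Top the measured cost of a single computePiSerial() call; the cached variable and helper are illustrative names, not repository code:

/* Hypothetical hoisting for the FIXME above: derive the operation count
 * once and reuse it, instead of recomputing time / Top on every call.
 * cached_operations and operations_per_iter are assumed names. */
static int cached_operations = 0;

static int operations_per_iter(double time, double Top) {
    if (cached_operations == 0)
        cached_operations = (int)(time / Top); /* ops that fill one iteration */
    return cached_operations;
}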
@@ -157,7 +157,7 @@ int check_slurm_comm(int myId, int root, int numP, MPI_Comm *child, MPI_Comm com
   int state=-10;

   //printf("[%d][3] Test min\n", myId); fflush(stdout);
-  //pthread_mutex_lock(&spawn_mutex);
+  //pthread_mutex_lock(&spawn_mutex); // TODO Uncomment
   MPI_Allreduce(&commState, &state, 1, MPI_INT, MPI_MIN, comm);
   //pthread_mutex_unlock(&spawn_mutex);
@@ -173,7 +173,7 @@ int check_slurm_comm(int myId, int root, int numP, MPI_Comm *child, MPI_Comm com
   } else if (slurm_data->spawn_is_single) {
-    //pthread_mutex_lock(&spawn_mutex);
+    //pthread_mutex_lock(&spawn_mutex); // TODO Uncomment
     MPI_Bcast(&commState, 1, MPI_INT, root, comm);
     //pthread_mutex_unlock(&spawn_mutex);
     int threads_not_spawned = pthread_equal(pthread_self(), spawn_thread);
@@ -170,15 +170,20 @@ int malleability_checkpoint() {
   } else if(state == MAL_SPAWN_PENDING || state == MAL_SPAWN_SINGLE_PENDING) { // Check whether the spawn has finished and start the redistribution
     double end_real_time;

-    state = check_slurm_comm(mall->myId, mall->root, mall->numP, &(mall->intercomm), mall->comm, mall->thread_comm, &end_real_time);
-    if (state == MAL_SPAWN_COMPLETED) {
-      mall_conf->results->spawn_time[mall_conf->grp] = MPI_Wtime() - mall_conf->results->spawn_start;
-      if(mall_conf->spawn_type == COMM_SPAWN_PTHREAD || mall_conf->spawn_type == COMM_SPAWN_MERGE_PTHREAD) {
-        mall_conf->results->spawn_real_time[mall_conf->grp] = end_real_time - mall_conf->results->spawn_start;
-      }
-      //TODO If it is MERGE SHRINK, use a different data redistribution method
-      state = start_redistribution();
+    if(mall_conf->spawn_type == COMM_SPAWN_MERGE_PTHREAD && mall->numP > mall->numC) {
+      state = shrink_redistribution(); //TODO REFACTOR
+    } else {
+      state = check_slurm_comm(mall->myId, mall->root, mall->numP, &(mall->intercomm), mall->comm, mall->thread_comm, &end_real_time);
+      if (state == MAL_SPAWN_COMPLETED) {
+        mall_conf->results->spawn_time[mall_conf->grp] = MPI_Wtime() - mall_conf->results->spawn_start;
+        if(mall_conf->spawn_type == COMM_SPAWN_PTHREAD || mall_conf->spawn_type == COMM_SPAWN_MERGE_PTHREAD) {
+          mall_conf->results->spawn_real_time[mall_conf->grp] = end_real_time - mall_conf->results->spawn_start;
+        }
+        //TODO If it is MERGE SHRINK, use a different data redistribution method
+        state = start_redistribution();
+      }
     }
   } else if(state == MAL_DIST_PENDING) {
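For reference, a compilable sketch of the dispatch this hunk introduces, with stubbed states and helpers (only the control flow mirrors the commit; the stub bodies and values are assumptions): when the spawn type is COMM_SPAWN_MERGE_PTHREAD and the group is shrinking (numP > numC), the checkpoint enters the shrink path directly instead of polling check_slurm_comm() for newly spawned processes.

#include <stdio.h>

enum { MAL_SPAWN_PENDING, MAL_SPAWN_COMPLETED, MAL_DIST_COMPLETED };
enum { COMM_SPAWN_MERGE_PTHREAD = 3 };

/* Stubs standing in for the library internals. */
static int shrink_redistribution(void) { return MAL_DIST_COMPLETED; }
static int start_redistribution(void)  { return MAL_DIST_COMPLETED; }
static int check_slurm_comm(void)      { return MAL_SPAWN_COMPLETED; }

static int checkpoint_step(int spawn_type, int numP, int numC) {
    if (spawn_type == COMM_SPAWN_MERGE_PTHREAD && numP > numC)
        return shrink_redistribution();  /* shrinking: no new processes to wait for */
    if (check_slurm_comm() == MAL_SPAWN_COMPLETED)
        return start_redistribution();   /* growing: spawn finished, redistribute */
    return MAL_SPAWN_PENDING;            /* spawn still in flight */
}

int main(void) {
    printf("state=%d\n", checkpoint_step(COMM_SPAWN_MERGE_PTHREAD, 8, 4));
    return 0;
}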
@@ -641,16 +646,71 @@ int end_redistribution() {
   return result;
 }

+///=============================================
+///=============================================
+///=============================================
+double time_adapt;
+int state_shrink=0; //TODO Refactor
+pthread_t thread_shrink;
+MPI_Comm comm_shrink;
+
+int thread_shrink_creation();
+void *thread_shrink_work();
+
+/*
+ * Creates a thread to run a communication in the background.
+ */
+int thread_shrink_creation() {
+  if(pthread_create(&thread_shrink, NULL, thread_shrink_work, NULL)) {
+    printf("Error creating the thread\n");
+    MPI_Abort(MPI_COMM_WORLD, -1);
+    return -1;
+  }
+  return MAL_SPAWN_PENDING;
+}
+
+void* thread_shrink_work() {
+  proc_adapt_shrink(mall->numC, &comm_shrink, mall->myId);
+  state_shrink=2;
+  pthread_exit(NULL);
+}
+///=============================================
+///=============================================
+///=============================================
+
 int shrink_redistribution() {
-  double time_adapt = MPI_Wtime();
+  int global_state;
   MPI_Comm aux_comm;
-  MPI_Comm_dup(mall->comm, &aux_comm);
-  proc_adapt_shrink( mall->numC, &(mall->comm), mall->myId);
+
+  if(mall_conf->spawn_type == COMM_SPAWN_MERGE_PTHREAD) {
+    if(state_shrink == 0) {
+      time_adapt = MPI_Wtime();
+      state_shrink = 1;
+      MPI_Comm_dup(mall->comm, &comm_shrink);
+      thread_shrink_creation();
+      return MAL_SPAWN_PENDING;
+    } else if(state_shrink>0) {
+      MPI_Allreduce(&state_shrink, &global_state, 1, MPI_INT, MPI_MIN, mall->comm);
+      if(global_state < 2) return MAL_SPAWN_PENDING;
+
+      if(pthread_join(thread_shrink, NULL)) {
+        printf("Error waiting for the thread\n");
+        MPI_Abort(MPI_COMM_WORLD, -1);
+        return -10;
+      }
+      MPI_Comm_dup(mall->comm, &aux_comm);
+      mall->comm = comm_shrink;
+    }
+  } else {
+    time_adapt = MPI_Wtime();
+    MPI_Comm_dup(mall->comm, &aux_comm);
+    proc_adapt_shrink( mall->numC, &(mall->comm), mall->myId);
+  }
+
+  //TODO REFACTOR -- Only the collect-iters call should stay outside the threads
   zombies_collect_suspended(aux_comm, mall->myId, mall->numP, mall->numC, mall->root, (void *) mall_conf->results, mall->user_comm);
+  MPI_Comm_free(&aux_comm);

   if(mall->myId < mall->numC) {
-    MPI_Comm_free(&aux_comm);
     MPI_Comm_dup(mall->comm, &aux_comm);
     mall->thread_comm = aux_comm;
     MPI_Comm_dup(mall->comm, &aux_comm);
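The new shrink path is a small completion-detection protocol: each rank runs proc_adapt_shrink() inside a pthread, publishes a local state flag (state_shrink), and the group agrees the phase is over through MPI_Allreduce with MPI_MIN before anyone joins the thread. A self-contained sketch of that pattern; local_state, background_work and poll_shrink are illustrative names, and an MPI library initialized with MPI_THREAD_MULTIPLE is assumed:

#include <mpi.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int local_state = 0;   /* 0 = idle, 1 = running, 2 = done */
static pthread_t worker;

static void *background_work(void *arg) {
    (void)arg;
    /* The real code reconfigures a communicator here (proc_adapt_shrink). */
    atomic_store(&local_state, 2);   /* publish completion */
    pthread_exit(NULL);
}

/* One polling step; returns 1 once every rank's worker has finished. */
static int poll_shrink(MPI_Comm comm) {
    int snapshot, global_state;
    if (atomic_load(&local_state) == 0) {        /* first call: launch worker */
        atomic_store(&local_state, 1);
        if (pthread_create(&worker, NULL, background_work, NULL)) {
            fprintf(stderr, "Error creating the thread\n");
            MPI_Abort(MPI_COMM_WORLD, -1);
        }
        return 0;
    }
    snapshot = atomic_load(&local_state);
    /* MIN over ranks: the result is 2 only when *every* rank reached state 2 */
    MPI_Allreduce(&snapshot, &global_state, 1, MPI_INT, MPI_MIN, comm);
    if (global_state < 2) return 0;
    pthread_join(worker, NULL);
    return 1;
}

int main(int argc, char *argv[]) {
    int provided;
    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
    while (!poll_shrink(MPI_COMM_WORLD)) { /* overlap useful work here */ }
    MPI_Finalize();
    return 0;
}

The MPI_MIN reduction is the key design choice: the reduced value reaches 2 only once the slowest rank's worker has flagged completion, so fast ranks keep polling (and can keep computing) instead of blocking early in pthread_join.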
@@ -19,14 +19,18 @@
 #define COMM_PHY_NODES 1
 #define COMM_PHY_CPU 2

-// TODO Separate PTHREAD
+// SPAWN METHODS
 #define COMM_SPAWN_SERIAL 0
 #define COMM_SPAWN_PTHREAD 1
 #define COMM_SPAWN_MERGE 2
 #define COMM_SPAWN_MERGE_PTHREAD 3
+//#define COMM_SPAWN_BASELINE 0
+//#define COMM_SPAWN_MERGE 1

+//SPAWN STRATEGIES
 #define COMM_SPAWN_MULTIPLE 0
 #define COMM_SPAWN_SINGLE 1
+//#define COMM_SPAWN_SERIAL 0
+//#define COMM_SPAWN_PTHREAD 1

 #define MAL_USE_NORMAL 0
 #define MAL_USE_IBARRIER 1
@@ -15,3 +15,4 @@ elif [ $dist == "cpu" ]; then
 fi

 $dir/Recordnodelist.o $numP $dist
+echo $numP
@@ -1,4 +1,5 @@
 #!/bin/bash
+#SBATCH -p P1
 #SBATCH -N 1
 #SBATCH --exclude=c01,c00,c02

@@ -17,6 +18,7 @@ module load mpich-3.4.1-noucx
 numP=$(bash recordMachinefile.sh $1)

 mpirun -print-all-exitcodes -f hostfile.o$SLURM_JOB_ID $dir$codeDir/a.out $1 $2 $nodelist $nodes
+#mpirun -np $numP $dir$codeDir/a.out $1 $2 $nodelist $nodes
 rm hostfile.o$SLURM_JOB_ID

 echo "END RUN"