Commit b1cc8715 authored by iker_martin

Added collection of the thread-creation cost for spawn operations performed in the background

parent a8edcbc0
@@ -60,14 +60,14 @@ void recv_results(results_data *results, int root, int resizes, MPI_Comm intercomm)
  * Specifically, these are three scalars and a vector of size "resizes"
  */
 void def_results_type(results_data *results, int resizes, MPI_Datatype *results_type) {
-    int i, counts = 4;
-    int blocklengths[4] = {1, 1, 1, 1};
+    int i, counts = 5;
+    int blocklengths[] = {1, 1, 1, 1, 1};
     MPI_Aint displs[counts], dir;
     MPI_Datatype types[counts];

     // Fill in the types vector
-    types[0] = types[1] = types[2] = types[3] = MPI_DOUBLE;
-    blocklengths[3] = resizes;
+    types[0] = types[1] = types[2] = types[3] = types[4] = MPI_DOUBLE;
+    blocklengths[3] = blocklengths[4] = resizes;

     // Fill in the displs vector
     MPI_Get_address(results, &dir);

@@ -75,7 +75,8 @@ void def_results_type(results_data *results, int resizes, MPI_Datatype *results_type) {
     MPI_Get_address(&(results->sync_start), &displs[0]);
     MPI_Get_address(&(results->async_start), &displs[1]);
     MPI_Get_address(&(results->exec_start), &displs[2]);
-    MPI_Get_address(&(results->spawn_time[0]), &displs[3]); //TODO Check whether this can be simplified
+    MPI_Get_address(&(results->spawn_thread_time[0]), &displs[3]);
+    MPI_Get_address(&(results->spawn_time[0]), &displs[4]); //TODO Check whether this can be simplified //FIXME Error if there is more than one spawn?

     for(i=0; i<counts; i++) displs[i] -= dir;
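The tail of def_results_type() falls outside this hunk. As a hedged sketch (an assumption about the unshown remainder, not part of this commit), a struct type built from these five members would typically be finalized and committed like this:

    /* Sketch: finalize the derived type from the arrays filled above. */
    MPI_Type_create_struct(counts, blocklengths, displs, types, results_type);
    MPI_Type_commit(results_type);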
@@ -129,6 +130,11 @@ void print_global_results(results_data *results, int resizes) {
     printf("%lf ", results->spawn_time[i]);
   }

+  printf("\nTthread: ");
+  for(i=0; i< resizes - 1; i++) {
+    printf("%lf ", results->spawn_thread_time[i]);
+  }
+
   printf("\nTsync: ");
   for(i=1; i < resizes; i++) {
     printf("%lf ", results->sync_time[i]);

@@ -158,6 +164,7 @@ void init_results_data(results_data **results, int resizes, int iters_size) {
   *results = malloc(1 * sizeof(results_data));

   (*results)->spawn_time = calloc(resizes, sizeof(double));
+  (*results)->spawn_thread_time = calloc(resizes, sizeof(double));
   (*results)->sync_time = calloc(resizes, sizeof(double));
   (*results)->async_time = calloc(resizes, sizeof(double));

@@ -188,6 +195,7 @@ void realloc_results_iters(results_data *results, int needed) {
  */
 void free_results_data(results_data **results) {
   free((*results)->spawn_time);
+  free((*results)->spawn_thread_time);
   free((*results)->sync_time);
   free((*results)->async_time);
...
@@ -7,8 +7,8 @@ typedef struct {
   double *iters_time;
   int *iters_type, iter_index, iters_size;

-  // Spawn, Sync and Async time
-  double spawn_start, *spawn_time;
+  // Spawn, Thread, Sync, Async and Exec time
+  double spawn_start, *spawn_time, *spawn_thread_time;
   double sync_start, *sync_time;
   double async_start, *async_time;
   double exec_start, exec_time;
...
@@ -17,7 +17,7 @@ void Sons_init();
 int checkpoint(int iter, int state, MPI_Request **comm_req);
 int TC(int numS, int comm_type);
-int start_redistribution(int numS, MPI_Request **comm_req);
+int start_redistribution(int iter, int numS, MPI_Request **comm_req);
 int check_redistribution(int iter, MPI_Request **comm_req);
 int end_redistribution(int iter);

@@ -129,6 +129,7 @@ int work() {
   state = MAL_COMM_UNINITIALIZED;
   res = 0;

+  //if(group->myId == ROOT) printf("Iter_start %d\n", group->iter_start);
   for(iter=group->iter_start; iter < maxiter; iter++) {
     iterate(matrix, config_file->matrix_tam, state);
   }

@@ -172,15 +173,15 @@ int checkpoint(int iter, int state, MPI_Request **comm_req) {
     state = TC(group->numS, comm_type);
     if (state == COMM_FINISHED){
-      state = start_redistribution(group->numS, comm_req);
+      state = start_redistribution(0, group->numS, comm_req);
     }

   } else if(state == COMM_IN_PROGRESS) { // Check whether the spawn has finished and begin the redistribution
-    state = check_slurm_comm(group->myId, ROOT, &(group->children));
+    state = check_slurm_comm(group->myId, ROOT, group->numP, &(group->children));
     if (state == COMM_FINISHED) {
       results->spawn_time[group->grp] = MPI_Wtime() - results->spawn_start;
-      state = start_redistribution(group->numS, comm_req);
+      state = start_redistribution(iter, group->numS, comm_req);
     }

   } else if(state == MAL_ASYNC_PENDING) {

@@ -207,6 +208,10 @@ int TC(int numS, int comm_type){
   comm_state = init_slurm_comm(group->argv, group->myId, numS, ROOT, dist, comm_type, MPI_COMM_WORLD, &(group->children));
   if(comm_type == COMM_SPAWN_SERIAL)
     results->spawn_time[group->grp] = MPI_Wtime() - results->spawn_start;
+  else if(comm_type == COMM_SPAWN_PTHREAD) {
+    results->spawn_thread_time[group->grp] = MPI_Wtime() - results->spawn_start;
+    results->spawn_start = MPI_Wtime();
+  }

   return comm_state;
 }
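The new COMM_SPAWN_PTHREAD branch splits the spawn measurement in two: spawn_thread_time keeps only the cost of creating the helper thread (init_slurm_comm returns as soon as the thread exists), and spawn_start is reset so that the later MPI_Wtime() - results->spawn_start in checkpoint() times only the background spawn itself. A minimal self-contained sketch of the pattern; spawn_worker and timed_pthread_spawn are hypothetical names, not the repo's API:

    #include <mpi.h>
    #include <pthread.h>

    /* Hypothetical stand-in for the routine that runs MPI_Comm_spawn in
     * the background. */
    static void *spawn_worker(void *arg) { return NULL; }

    /* Returns the thread-creation cost and resets *spawn_start so the
     * caller can later time only the background spawn. */
    static double timed_pthread_spawn(pthread_t *th, double *spawn_start) {
        double t0 = MPI_Wtime();
        pthread_create(th, NULL, spawn_worker, NULL); /* returns immediately */
        double cost = MPI_Wtime() - t0;               /* thread-creation cost */
        *spawn_start = MPI_Wtime();                   /* spawn timed from here on */
        return cost;
    }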
@@ -224,7 +229,7 @@ int TC(int numS, int comm_type){
  * Finally, results data is sent to the children and both process
  * groups disconnect.
  */
-int start_redistribution(int numS, MPI_Request **comm_req) {
+int start_redistribution(int iter, int numS, MPI_Request **comm_req) {
   int rootBcast = MPI_PROC_NULL;
   if(group->myId == ROOT) rootBcast = MPI_ROOT;

@@ -242,7 +247,7 @@ int start_redistribution(int numS, MPI_Request **comm_req) {
       return MAL_ASYNC_PENDING;
     }
   }
-  return end_redistribution(0);
+  return end_redistribution(iter);
 }
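Threading iter through start_redistribution() into end_redistribution(iter) lets the children learn the iteration the parents had reached after a background spawn, instead of a hard-coded 0; presumably it feeds the MPI_Bcast of group->iter_start that Sons_init() performs below. A hedged sketch of the parents' side of such an intercommunicator broadcast (send_iter_start is a hypothetical helper, not in the repo):

    /* Only the root parent passes MPI_ROOT; the rest pass MPI_PROC_NULL,
     * matching the rootBcast convention used above. */
    static void send_iter_start(int iter, int myId, MPI_Comm children) {
        int rootBcast = (myId == ROOT) ? MPI_ROOT : MPI_PROC_NULL;
        MPI_Bcast(&iter, 1, MPI_INT, rootBcast, children);
    }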
@@ -396,6 +401,7 @@ void Sons_init() {
     results->async_time[group->grp] = MPI_Wtime();
     MPI_Bcast(&(group->iter_start), 1, MPI_INT, ROOT, group->parents);
   }
+  MPI_Bcast(&(group->iter_start), 1, MPI_INT, ROOT, group->parents); //FIXME Remove -- must account for Pthread and async
   if(config_file->sdr) { // Receive synchronous data
     recv_sync(&(group->sync_array), config_file->sdr, group->myId, group->numP, ROOT, group->parents, numP_parents);
     results->sync_time[group->grp] = MPI_Wtime();

@@ -449,7 +455,8 @@ void iterate(double *matrix, int n, int async_comm) {
   }
   actual_time = MPI_Wtime(); // Save timings

-  if(async_comm == MAL_ASYNC_PENDING) { // An asynchronous data redistribution is under way
+  // TODO Differentiate between the two cases in the IO
+  if(async_comm == MAL_ASYNC_PENDING || async_comm == COMM_IN_PROGRESS) { // An asynchronous data redistribution is under way
     operations=0;
   }
...
@@ -109,11 +109,34 @@ int init_slurm_comm(char **argv, int myId, int numP, int root, int type_dist, int type_creation, MPI_Comm comm, MPI_Comm *child)
  * Checks whether a configuration for creating a new group of processes is ready,
  * and if it is, returns the communicator to these new processes.
  */
-int check_slurm_comm(int myId, int root, MPI_Comm *child) {
+int check_slurm_comm(int myId, int root, int numP, MPI_Comm *child) { // TODO Remove numP if it is not used
   int state=-10;

   if(slurm_data->type_creation == COMM_SPAWN_PTHREAD) {
-    MPI_Allreduce(&commSlurm, &state, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
+    //MPI_Allreduce(&commSlurm, &state, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
+    if(myId == root) {
+      int i, recv_state;
+      state = commSlurm;
+      for(i=0; i<numP; i++) { // Recv states
+        if(i != myId) {
+          MPI_Recv(&recv_state, 1, MPI_INT, i, 120, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+          if(recv_state == COMM_IN_PROGRESS) {
+            state = recv_state;
+          }
+        }
+      }
+      for(i=0; i<numP; i++) { // Send state
+        if(i != myId) {
+          MPI_Send(&state, 1, MPI_INT, i, 120, MPI_COMM_WORLD);
+        }
+      }
+    } else {
+      MPI_Send(&commSlurm, 1, MPI_INT, root, 120, MPI_COMM_WORLD);
+      MPI_Recv(&state, 1, MPI_INT, root, 120, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+    }
+
     if(state != COMM_FINISHED) return state; // Continue only if asynchronous process creation has ended
   } else {
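The commented-out one-liner computed the same global state collectively: assuming COMM_IN_PROGRESS compares lower than COMM_FINISHED (an assumption; the constants are defined elsewhere), MPI_MIN made any still-in-progress rank win. The explicit send/receive exchange makes that rule independent of the constants' numeric ordering, at the price of O(numP) point-to-point messages through the root instead of a tree-based collective:

    /* Equivalent collective form of the exchange above (hedged; relies on
     * COMM_IN_PROGRESS < COMM_FINISHED so that MPI_MIN means "any rank
     * still in progress wins"). */
    int state;
    MPI_Allreduce(&commSlurm, &state, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);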
...
@@ -15,4 +15,4 @@
 #define COMM_SPAWN_PTHREAD 1

 int init_slurm_comm(char **argv, int myId, int numP, int root, int type_dist, int type_creation, MPI_Comm comm, MPI_Comm *child);
-int check_slurm_comm(int myId, int root, MPI_Comm *child);
+int check_slurm_comm(int myId, int root, int numP, MPI_Comm *child);
...
@@ -2,6 +2,9 @@
 #SBATCH -N 1

+dir="/home/martini/malleability_benchmark"
+codeDir="/Codes"
+
 echo "MPICH"
 module load mpich-3.4.1-noucx
 #export HYDRA_DEBUG=1

@@ -9,8 +12,8 @@ module load mpich-3.4.1-noucx
 numP=$(bash recordMachinefile.sh $1)

-mpiexec -f hostfile.o$SLURM_JOB_ID ./a.out $1 $2
+#mpirun -f hostfile.o$SLURM_JOB_ID ./a.out $1 $2
+mpirun -f hostfile.o$SLURM_JOB_ID $dir$codeDir/a.out $1 $2

 rm hostfile.o$SLURM_JOB_ID
 echo "END RUN"