Commit e466e997 authored by iker_martin

Updated malleable CG to use the new MAM interface

parent b7bcaffe
@@ -12,17 +12,12 @@ cores=20
numP=$1
matrix=$2
numC=$3
msm=$4
mss=$5
mrm=$6
is_sync=$8
qty=$9
qty=$4
initial_nodelist=$(bash $dirCG/BashScripts/createInitialNodelist.sh $numP $cores $nodelist)
echo "Test numP=$numP numC=$numC Meths=$msm $mrm $mss -- Is_synch=$is_sync qty=$qty"
echo "Test numP=$numP numC=$numC -- qty=$qty"
for ((i=0; i<qty; i++))
do
mpirun -hosts $initial_nodelist -np $numP $dirCG/build/a.out $matrix $numC $msm $mss $mrm $mss $is_sync $nodelist $nodes
mpirun -hosts $initial_nodelist -np $numP $dirCG/build/a.out $matrix $numC
done
echo "End"
@@ -6,34 +6,22 @@
scriptDir="$(dirname "$0")"
source $scriptDir/config.txt
dirM="${dirCG}"
dirM="${dirCG}../SparseMatrix"
export dirCG
matrix="Queen_4147.rb"
#matrix="Queen_4147.rb"
#matrix="audikw_1.rb"
#matrix="bcsstk01.rsa"
matrix="bcsstk01.rsa"
module load /home/martini/MODULES/modulefiles/mpich-4.0.3-ofi
nodelist=$SLURM_JOB_NODELIST
nodes=$SLURM_JOB_NUM_NODES
cores=20
numP=$1
numC=$2
msm=$3
mrm=$4
mss=$5
send_sync=$6
qty=1
if [ $# -gt 6 ]
if [ $# -gt 2 ]
then
qty=$7
qty=$3
fi
initial_nodelist=$(bash $dirCG/BashScripts/createInitialNodelist.sh $numP $cores $nodelist)
echo "Test numP=$numP numC=$numC Meths=$msm $mrm $mss -- Is_synch=$send_sync qty=$qty"
for ((i=0; i<qty; i++))
do
mpirun -hosts $initial_nodelist -np $numP $dirCG/build/a.out $dirM/$matrix $numC $msm $mss $mrm $mss $send_sync $nodelist $nodes
done
sbatch -p P1 -N 1 $dirCG/Exec/generalRun.sh $numP $dirM/$matrix $numC $qty
echo "End"
@@ -8,7 +8,7 @@
#include "SparseMatrices.h"
#include <mpi.h>
#include <string.h>
#include "../malleability/malleabilityManager.h"
#include "../malleability/MAM.h"
#include <unistd.h>
@@ -47,6 +47,22 @@ struct Dist_data {
MPI_Datatype scalars, arrays;
};
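/* State handed to the MAM user callback (description inferred from its usage
 * further below, not stated in this commit): other_subm holds the submatrix
 * being received, array_vptr/array_vpos/array_vval point to the currently
 * owned CSR arrays used as send buffers, initiated marks whether the two
 * Ialltoallv calls have been posted, and reqs tracks those two requests. */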
typedef struct {
SparseMatrix other_subm;
int *array_vptr, *array_vpos, initiated;
double start_time, *array_vval;
MPI_Comm comm;
MPI_Request reqs[2];
} user_redist_t;
static const user_redist_t empty_user_data = {
.array_vptr = NULL,
.array_vpos = NULL,
.array_vval = NULL,
.initiated = 0,
.comm = MPI_COMM_NULL
};
void dumb(Compute_data *computeData, struct Dist_data *dist_data); //FIXME Delete me
void init_app(Compute_data *computeData, struct Dist_data *dist_data, char* argv[]);
@@ -55,29 +71,28 @@ void get_rows_dist(Compute_data *computeData, int numP, int n);
void mat_alloc(Compute_data *computeData, SparseMatrix mat, struct Dist_data dist_data);
void computeSolution(Compute_data computeData, double **subsol, SparseMatrix mat, int myId, double **full_vec);
void pre_compute(Compute_data *computeData, struct Dist_data dist_data, double *subsol, double *full_vec);
int compute(Compute_data *computeData, struct Dist_data *dist_data, int sm);
void free_computeData(Compute_data *computeData, int terminate);
int compute(Compute_data *computeData, struct Dist_data *dist_data, user_redist_t *user_data);
void free_computeData(Compute_data *computeData);
//===================================MALLEABILITY FUNCTIONS====================================================
int n_check = 30;
int dist_old(struct Dist_data *dist_data, Compute_data *computeData, int num_children, int sm, int ss, int rm, int rs, int send_sync);
void dist_new(struct Dist_data *dist_data, Compute_data *computeData);
void update_dist_data(struct Dist_data *dist_data);
void print_global_results();
void originals_set_data(struct Dist_data *dist_data, Compute_data *computeData, int num_target);
void user_func(void *args);
void targets_distribution(mam_user_reconf_t user_reconf, user_redist_t *user_data);
void targets_update(struct Dist_data *dist_data, Compute_data *computeData, user_redist_t *user_data);
void print_global_results(double start_time);
//----------------------------------------------------------------------------------------------------
void get_dist(int total_r, int id, int numP, struct Dist_data *dist_data);
void set_counts(int id, int numP, struct Dist_data data_dist, int *sendcounts);
void getIds_intercomm(struct Dist_data dist_data, int numP_other, int **idS);
void prepare_redist_counts(int *counts, int *displs, int numP_other, int offset, struct Dist_data dist_data, int *vptr);
void set_counts(int id, int numP, struct Dist_data data_dist, int offset, int *sendcounts);
void getIds_intercomm(struct Dist_data dist_data, int numP_other, int *idS);
//----------------------------------------------------------------------------------------------------
int main (int argc, char *argv[]) {
int terminate;
int req, num_nodes, num_cpus = 20;
int sm, ss, rm, rs, send_sync;
char *nodelist = NULL;
int sm;
int req;
Compute_data computeData;
user_redist_t user_data;
computeData.z = NULL; computeData.d_full = NULL, computeData.d = NULL;
computeData.vec = NULL; computeData.res = NULL;
@@ -86,56 +101,46 @@ int main (int argc, char *argv[]) {
computeData.subm.vptr = NULL;
computeData.vlen = NULL;
send_sync=1;
sm = 1; ss = 1; rm = 0; rs = 1;
int numP, myId, num_children = 0;
int num_targets = 1;
struct Dist_data dist_data;
if (argc >= 10) {
num_children = atoi(argv[2]);
sm = atoi(argv[3]);
ss = atoi(argv[4]);
rm = atoi(argv[5]);
rs = atoi(argv[6]);
send_sync = atoi(argv[7]);
nodelist = argv[8];
num_nodes = atoi(argv[9]);
num_cpus = num_nodes * num_cpus;
if (argc >= 3) {
num_targets = atoi(argv[2]);
}
MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &req);
MPI_Comm_size(MPI_COMM_WORLD, &numP);
MPI_Comm_rank(MPI_COMM_WORLD, &myId);
MPI_Comm_size(MPI_COMM_WORLD, &dist_data.numP);
MPI_Comm_rank(MPI_COMM_WORLD, &dist_data.myId);
dist_data.comm = MPI_COMM_WORLD;
user_data = empty_user_data;
user_data.comm = dist_data.comm;
int new_group = init_malleability(myId, numP, ROOT, dist_data.comm, argv[0], nodelist, num_cpus, num_nodes);
update_dist_data(&dist_data);
int new_group = MAM_Init(ROOT, &dist_data.comm, argv[0], user_func, (void *) &user_data);
if( !new_group ) { //First set of processes
init_app(&computeData, &dist_data, argv);
dist_old(&dist_data, &computeData, num_children, sm, ss, rm, rs, send_sync);
originals_set_data(&dist_data, &computeData, num_targets);
user_data.array_vptr = computeData.subm.vptr;
user_data.array_vpos = computeData.subm.vpos;
user_data.array_vval = computeData.subm.vval;
MPI_Barrier(MPI_COMM_WORLD);
set_global_time(MPI_Wtime());
user_data.start_time = MPI_Wtime();
} else {
dist_new(&dist_data, &computeData);
targets_update(&dist_data, &computeData, &user_data);
}
// if(computeData.iter==0)
terminate = compute(&computeData, &dist_data, sm);
compute(&computeData, &dist_data, &user_data);
if(terminate) {
update_dist_data(&dist_data);
MPI_Barrier(dist_data.comm);
if(dist_data.myId == ROOT) {
print_global_results();
print_global_results(user_data.start_time);
printf ("End(%d) --> (%d,%20.10e)\n", computeData.n, computeData.iter, computeData.tol);
}
}
// End of CG
free_malleability();
free_computeData(&computeData, 1);
if(sm && numP > num_children && dist_data.myId == 0) {
MAM_Finalize();
free_computeData(&computeData);
if(dist_data.numP > num_targets && dist_data.myId == 0) {
MPI_Abort(MPI_COMM_WORLD, -100);
}
MPI_Finalize();
@@ -314,7 +319,7 @@ void computeSolution(Compute_data computeData, double **subsol, SparseMatrix mat
#else
ProdSparseMatrixVector (computeData.subm, *full_vec, *subsol); // sol += A * x
#endif
/*
#ifdef DEBUG
int aux, i;
double *solD = NULL, *sol = NULL;
@@ -351,7 +356,7 @@ void computeSolution(Compute_data computeData, double **subsol, SparseMatrix mat
RemoveDoubles (&sol);
RemoveDoubles (&solD);
#endif // DEBUG
*/
}
/*
@@ -400,13 +405,13 @@ void pre_compute(Compute_data *computeData, struct Dist_data dist_data, double *
/*
* Bucle de computo principal
*/
int compute(Compute_data *computeData, struct Dist_data *dist_data, int sm) {
int compute(Compute_data *computeData, struct Dist_data *dist_data, user_redist_t *user_data) {
int IZERO = 0, IONE = 1;
double DONE = 1.0, DMONE = -1.0, DZERO = 0.0;
int state = MALL_NOT_STARTED;
int state = MAM_UNRESERVED;
int ended_loop = 1;
int cnt = 0;
int reconfigure = 0, rec_iter = 500;
int reconfigure = 0, rec_iter = 1;
computeData->maxiter = 1000;
@@ -433,25 +438,28 @@ int compute(Compute_data *computeData, struct Dist_data *dist_data, int sm) {
rscal (&(dist_data->tamBl), &computeData->alpha, computeData->d, &IONE); // d = alpha * d
raxpy (&(dist_data->tamBl), &DONE, computeData->res, &IONE, computeData->d, &IONE); // d += res
//MPI_Allgatherv(computeData->d, dist_data->tamBl, MPI_DOUBLE, computeData->d_full,
// computeData->dist_rows, computeData->displs_rows, MPI_DOUBLE, dist_data->comm); // d_full = Gather(d)
computeData->tol = sqrt (computeData->beta); // tol = sqrt(beta) = norm (res)
computeData->iter++;
if (computeData->iter == rec_iter) reconfigure = 1;
if (reconfigure) {
state = malleability_checkpoint();
if ((state == MALL_COMPLETED && sm == 0) || state == MALL_ZOMBIE) { ended_loop = 0; break; }
else if(state == MALL_COMPLETED) {
MAM_Checkpoint(&state, MAM_CHECK_COMPLETION, user_func, (void *) user_data);
if(state == MAM_COMPLETED) {
reconfigure = 0;
free_computeData(computeData, 0);
update_dist_data(dist_data);
dist_new(dist_data, computeData);
free_computeData(computeData);
targets_update(dist_data, computeData, user_data);
}
}
}
if(state == MAM_PENDING) {
MAM_Checkpoint(&state, MAM_WAIT_COMPLETION, user_func, (void *) user_data);
free_computeData(computeData);
targets_update(dist_data, computeData, user_data);
}
#ifdef DEBUG
if(dist_data->myId == ROOT) printf ("Ended loop\n");
#endif
@@ -461,14 +469,45 @@ int compute(Compute_data *computeData, struct Dist_data *dist_data, int sm) {
void dumb(Compute_data *computeData, struct Dist_data *dist_data) {
int i;
if(dist_data->myId == 0) printf("TamBL=");
fflush(stdout); MPI_Barrier(dist_data->comm);
for(i=0; i<dist_data->numP; i++) {
if(dist_data->myId == i) {
printf("%d, ", dist_data->tamBl);
}
fflush(stdout);
sleep(1);
MPI_Barrier(dist_data->comm);
}
if(dist_data->myId == 0) printf("\n");
fflush(stdout); MPI_Barrier(dist_data->comm);
if(dist_data->myId == 0) printf("Vlen=");
fflush(stdout); MPI_Barrier(dist_data->comm);
for(i=0; i<dist_data->numP; i++) {
if(dist_data->myId == i) {
for(int j=0; j<dist_data->tamBl; j++) {
printf("%d, ", computeData->vlen[j]);
}
}
fflush(stdout);
sleep(1);
MPI_Barrier(dist_data->comm);
}
if(dist_data->myId == 0) printf("\n");
fflush(stdout); MPI_Barrier(dist_data->comm);
if(dist_data->myId == 0) printf("Vptr=");
fflush(stdout); MPI_Barrier(dist_data->comm);
for(i=0; i<dist_data->numP; i++) {
if(dist_data->myId == i) {
printf("%d, ", computeData->subm.vptr[dist_data->tamBl]);
fflush(stdout);
}
fflush(stdout);
sleep(1);
MPI_Barrier(dist_data->comm);
}
if(dist_data->myId == 0) printf("\n");
@@ -481,8 +520,9 @@ void dumb(Compute_data *computeData, struct Dist_data *dist_data) {
if(dist_data->myId == i) {
printf("%lf, ", computeData->tol);
fflush(stdout);
}
fflush(stdout);
sleep(1);
MPI_Barrier(dist_data->comm);
}
if(dist_data->myId == 0) printf("\n");
@@ -495,8 +535,9 @@ void dumb(Compute_data *computeData, struct Dist_data *dist_data) {
if(dist_data->myId == i) {
printf("%lf, ", computeData->z[dist_data->tamBl-1]);
fflush(stdout);
}
fflush(stdout);
sleep(1);
MPI_Barrier(dist_data->comm);
}
if(dist_data->myId == 0) printf("\n");
@@ -508,8 +549,9 @@ void dumb(Compute_data *computeData, struct Dist_data *dist_data) {
if(dist_data->myId == i) {
printf("%lf, ", computeData->d[dist_data->tamBl-1]);
fflush(stdout);
}
fflush(stdout);
sleep(1);
MPI_Barrier(dist_data->comm);
}
if(dist_data->myId == 0) printf("\n");
@@ -521,8 +563,9 @@ void dumb(Compute_data *computeData, struct Dist_data *dist_data) {
if(dist_data->myId == i) {
printf("%lf, ", computeData->res[dist_data->tamBl-1]);
fflush(stdout);
}
fflush(stdout);
sleep(1);
MPI_Barrier(dist_data->comm);
}
if(dist_data->myId == 0) printf("\n");
@@ -534,15 +577,16 @@ void dumb(Compute_data *computeData, struct Dist_data *dist_data) {
if(dist_data->myId == i) {
printf("%lf, ", computeData->vec[dist_data->tamBl-1]);
fflush(stdout);
}
fflush(stdout);
sleep(1);
MPI_Barrier(dist_data->comm);
}
if(dist_data->myId == 0) printf("\n");
fflush(stdout); MPI_Barrier(dist_data->comm);
}
void free_computeData(Compute_data *computeData, int terminate) {
void free_computeData(Compute_data *computeData) {
if(computeData->res != NULL) {
RemoveDoubles (&computeData->res);
}
@@ -556,8 +600,7 @@ void free_computeData(Compute_data *computeData, int terminate) {
RemoveDoubles (&computeData->vec);
}
if(computeData->d_full != NULL && terminate) {
if(computeData->d_full != NULL) {
RemoveDoubles (&computeData->d_full);
}
if(computeData->subm.vptr != NULL) {
@@ -599,30 +642,26 @@ void free_computeData(Compute_data *computeData, int terminate) {
*/
/*
 * Function to declare the data to be communicated by MAM
*/
int dist_old(struct Dist_data *dist_data, Compute_data *computeData, int num_children, int sm, int ss, int rm, int rs, int send_sync) {
int phy_dist = 2;
void originals_set_data(struct Dist_data *dist_data, Compute_data *computeData, int num_target) {
size_t index;
set_malleability_configuration(sm, ss, phy_dist, rm, rs);
set_children_number(num_children);
MAM_Set_target_number(num_target);
malleability_add_data(&(computeData->n), 1, MAL_INT, MAL_DATA_ALONE, 1, 1);
malleability_add_data(&(computeData->iter), 1, MAL_INT, MAL_DATA_ALONE, 1, 1);
malleability_add_data(&(computeData->tol), 1, MAL_DOUBLE, MAL_DATA_ALONE, 1, 1);
malleability_add_data(&(computeData->beta), 1, MAL_DOUBLE, MAL_DATA_ALONE, 1, 1);
malleability_add_data(&(computeData->umbral), 1, MAL_DOUBLE, MAL_DATA_ALONE, 1, 1);
MAM_Data_add(&(computeData->n), NULL, 1, MPI_INT, MAM_DATA_REPLICATED, MAM_DATA_CONSTANT);
MAM_Data_add(&(computeData->umbral), NULL, 1, MPI_DOUBLE, MAM_DATA_REPLICATED, MAM_DATA_CONSTANT);
MAM_Data_add(&(computeData->iter), NULL, 1, MPI_INT, MAM_DATA_REPLICATED, MAM_DATA_VARIABLE);
MAM_Data_add(&(computeData->tol), NULL, 1, MPI_DOUBLE, MAM_DATA_REPLICATED, MAM_DATA_VARIABLE);
MAM_Data_add(&(computeData->beta), NULL, 1, MPI_DOUBLE, MAM_DATA_REPLICATED, MAM_DATA_VARIABLE);
//malleability_add_data(computeData->d_full, computeData->n, MAL_DOUBLE, MAL_DATA_ALONE, 1, 1);
malleability_add_data(computeData->d, computeData->n, MAL_DOUBLE, MAL_DATA_ALONE, 0, 1);
MAM_Data_add(computeData->d, NULL, computeData->n, MPI_DOUBLE, MAM_DATA_DISTRIBUTED, MAM_DATA_VARIABLE);
malleability_add_data(computeData->vec, computeData->n, MAL_DOUBLE, MAL_DATA_ALONE, 0, 1);
malleability_add_data(computeData->res, computeData->n, MAL_DOUBLE, MAL_DATA_ALONE, 0, 1);
malleability_add_data(computeData->z, computeData->n, MAL_DOUBLE, MAL_DATA_ALONE, 0, 1);
MAM_Data_add(computeData->vec, NULL, computeData->n, MPI_DOUBLE, MAM_DATA_DISTRIBUTED, MAM_DATA_VARIABLE);
MAM_Data_add(computeData->res, NULL, computeData->n, MPI_DOUBLE, MAM_DATA_DISTRIBUTED, MAM_DATA_VARIABLE);
MAM_Data_add(computeData->z, NULL, computeData->n, MPI_DOUBLE, MAM_DATA_DISTRIBUTED, MAM_DATA_VARIABLE);
//FIXME The following values could be asynchronous
malleability_add_data(computeData->vlen, computeData->n, MAL_INT, 1+MAL_DATA_INDEPENDENT, 0, send_sync);
malleability_add_data(computeData->subm.vpos, computeData->n, MAL_INT, 1+MAL_DATA_DEPENDENT, 0, send_sync);
malleability_add_data(computeData->subm.vval, computeData->n, MAL_DOUBLE, 1+MAL_DATA_DEPENDENT, 0, send_sync);
MAM_Data_add(computeData->vlen, NULL, computeData->n, MPI_INT, MAM_DATA_DISTRIBUTED, MAM_DATA_CONSTANT);
}
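/* A minimal sketch (not part of this commit) of the MAM lifecycle this file
 * migrates to, using only calls that appear in this diff: MAM_Init,
 * MAM_Set_target_number, MAM_Data_add, MAM_Checkpoint, MAM_Data_get_pointer
 * and MAM_Finalize. The example_* names and sizes are hypothetical; consult
 * MAM.h for the authoritative signatures. */
static void example_callback(void *args) { (void)args; /* see user_func below */ }

static void example_lifecycle(char *argv0, int root, int *data, int n) {
    int state, new_group;
    void *value = NULL; size_t qty; MPI_Datatype type;
    MPI_Comm comm = MPI_COMM_WORLD;

    new_group = MAM_Init(root, &comm, argv0, example_callback, NULL);
    if (!new_group) {                 /* sources: declare what must survive    */
        MAM_Set_target_number(2);     /* e.g. reconfigure to two processes     */
        MAM_Data_add(&n, NULL, 1, MPI_INT, MAM_DATA_REPLICATED, MAM_DATA_CONSTANT);
        MAM_Data_add(data, NULL, n, MPI_INT, MAM_DATA_DISTRIBUTED, MAM_DATA_VARIABLE);
    } else {                          /* targets: recover the declared data    */
        MAM_Data_get_pointer(&value, 0, &qty, &type, MAM_DATA_REPLICATED, MAM_DATA_CONSTANT);
    }
    /* Inside the solver loop: trigger the reconfiguration and poll it.        */
    MAM_Checkpoint(&state, MAM_CHECK_COMPLETION, example_callback, NULL);
    if (state == MAM_PENDING)         /* block until redistribution completes  */
        MAM_Checkpoint(&state, MAM_WAIT_COMPLETION, example_callback, NULL);
    MAM_Finalize();
}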
/*
@@ -634,78 +673,174 @@ int dist_old(struct Dist_data *dist_data, Compute_data *computeData, int num_chi
*/
/*
 * Function called by a set of child processes.
 * Function called by MAM as a callback.
 *
 * First, the children obtain from the parents the initial
 * information needed to know the size of their vectors and
 * matrix, as well as how much data they will receive from each parent.
 * It performs the user-side data redistribution.
 * Since non-blocking communications are used, the first call
 * starts the communication and the following calls check
 * whether it has finished.
*/
void user_func(void *args) {
int local_flag, flag = 0;
mam_user_reconf_t user_reconf;
MAM_Get_Reconf_Info(&user_reconf);
user_redist_t *user_data = (user_redist_t *) args;
if(!user_data->initiated) {
MPI_Bcast(&user_data->start_time, 1, MPI_DOUBLE, 0, user_reconf.comm);
targets_distribution(user_reconf, user_data);
user_data->initiated = 1;
if(user_reconf.rank_state == MAM_PROC_NEW_RANK) {
MPI_Waitall(2, user_data->reqs, MPI_STATUSES_IGNORE);
flag = 1;
}
} else {
MPI_Testall(2, user_data->reqs, &local_flag, MPI_STATUSES_IGNORE);
MPI_Allreduce(&local_flag, &flag, 1, MPI_INT, MPI_MIN, user_data->comm);
}
if(flag) MAM_Resume_redistribution(NULL);
}
/*
 * Function in charge of performing the user data
 * redistribution.
 *
 * After this, they prepare to receive the data from the parents.
 * It computes the total number of elements to send/receive per
 * process and then calls the Ialltoallv function twice.
 *
 * It also initializes the memory for those processes that are
 * going to receive data.
*/
void dist_new(struct Dist_data *dist_data, Compute_data *computeData) {
int IONE = 1, i, is_synch;
size_t entry, entries;
void targets_distribution(mam_user_reconf_t user_reconf, user_redist_t *user_data) {
int i, n, offset, elems, numP, *vlen, *rank_states;
int *scounts, *rcounts, *sdispls, *rdispls;
size_t total_qty;
void *value = NULL;
is_synch = 1;
entry = 0;
struct Dist_data dist_data;
MPI_Datatype type;
int aux_int;
int *recv_vpos = &aux_int;
double aux_double;
double *recv_vval = &aux_double;
MPI_Comm_size(user_reconf.comm, &numP);
scounts = calloc(numP, sizeof(int));
sdispls = calloc(numP, sizeof(int));
rcounts = calloc(numP, sizeof(int));
rdispls = calloc(numP, sizeof(int));
offset = 0;
rank_states = (int *) malloc(numP * sizeof(int));
MPI_Allgather(&user_reconf.rank_state, 1, MPI_INT, rank_states, 1, MPI_INT, user_reconf.comm);
MAM_Data_get_pointer(&value, 0, &total_qty, &type, MAM_DATA_DISTRIBUTED, MAM_DATA_CONSTANT);
vlen = ((int *)value);
n = (int) total_qty;
if(user_reconf.rank_state != MAM_PROC_ZOMBIE) {
MPI_Comm_rank(user_data->comm, &dist_data.myId);
dist_data.numP = user_reconf.numT;
if(user_reconf.rank_state == MAM_PROC_NEW_RANK) {
user_data->array_vpos = &aux_int;
user_data->array_vval = &aux_double;
for(i=0; i<user_reconf.numS; i++) {
if(rank_states[i] == MAM_PROC_CONTINUE) {
dist_data.myId += user_reconf.numS;
break;
}
}
}
get_dist(n, dist_data.myId, dist_data.numP, &dist_data);
CreateSparseMatrixVptr(&user_data->other_subm, dist_data.tamBl, n, 0);
user_data->other_subm.vptr[0] = 0;
//memcpy(user_data->other_subm.vptr+1, vlen, dist_data.tamBl * sizeof(int));
for(i=0; i<dist_data.tamBl; i++) {
user_data->other_subm.vptr[i+1] = vlen[i];
}
TransformLengthtoHeader(user_data->other_subm.vptr, user_data->other_subm.dim1); // The array is converted from vlen to vptr
elems = user_data->other_subm.vptr[dist_data.tamBl];
CreateSparseMatrixValues(&user_data->other_subm, dist_data.tamBl, n, elems, 0);
recv_vpos = user_data->other_subm.vpos;
recv_vval = user_data->other_subm.vval;
prepare_redist_counts(rcounts, rdispls, user_reconf.numS, offset, dist_data, user_data->other_subm.vptr);
}
malleability_get_data(&value, 0, 1, 1);
if(user_reconf.rank_state != MAM_PROC_NEW_RANK) {
MPI_Comm_rank(user_data->comm, &dist_data.myId);
dist_data.numP = user_reconf.numS;
get_dist(n, dist_data.myId, dist_data.numP, &dist_data);
offset = (user_reconf.numS + user_reconf.numT) == numP ?
user_reconf.numS : 0;
prepare_redist_counts(scounts, sdispls, user_reconf.numT, offset, dist_data, user_data->array_vptr);
}
// DATA COMMUNICATION //
MPI_Ialltoallv(user_data->array_vpos, scounts, sdispls, MPI_INT, recv_vpos, rcounts, rdispls, MPI_INT, user_reconf.comm, &user_data->reqs[0]);
MPI_Ialltoallv(user_data->array_vval, scounts, sdispls, MPI_DOUBLE, recv_vval, rcounts, rdispls, MPI_DOUBLE, user_reconf.comm, &user_data->reqs[1]);
free(rank_states);
free(scounts); free(sdispls); free(rcounts); free(rdispls);
}
void targets_update(struct Dist_data *dist_data, Compute_data *computeData, user_redist_t *user_data) {
int IONE = 1, i;
size_t entry, total_qty;
double start_time;
void *value = NULL;
MPI_Datatype type;
MPI_Comm_size(dist_data->comm, &dist_data->numP);
MPI_Comm_rank(dist_data->comm, &dist_data->myId);
entry = 0;
MAM_Data_get_pointer(&value, entry++, &total_qty, &type, MAM_DATA_REPLICATED, MAM_DATA_CONSTANT);
computeData->n = *((int *)value);
malleability_get_data(&value, 1, 1, 1);
MAM_Data_get_pointer(&value, entry++, &total_qty, &type, MAM_DATA_REPLICATED, MAM_DATA_CONSTANT);
computeData->umbral = *((double *)value);
get_dist(computeData->n, dist_data->myId, dist_data->numP, dist_data);
get_rows_dist(computeData, dist_data->numP, computeData->n);
entry = 0;
MAM_Data_get_pointer(&value, entry++, &total_qty, &type, MAM_DATA_REPLICATED, MAM_DATA_VARIABLE);
computeData->iter = *((int *)value);
malleability_get_data(&value, 2, 1, 1);
MAM_Data_get_pointer(&value, entry++, &total_qty, &type, MAM_DATA_REPLICATED, MAM_DATA_VARIABLE);
computeData->tol = *((double *)value);
malleability_get_data(&value, 3, 1, 1);
MAM_Data_get_pointer(&value, entry++, &total_qty, &type, MAM_DATA_REPLICATED, MAM_DATA_VARIABLE);
computeData->beta = *((double *)value);
malleability_get_data(&value, 4, 1, 1);
computeData->umbral = *((double *)value);
//malleability_get_data(&value, 5, 1, 1);
//computeData->d_full = ((double *)value);
malleability_get_data(&value, entry++, 0, 1);
entry = 0;
MAM_Data_get_pointer(&value, entry++, &total_qty, &type, MAM_DATA_DISTRIBUTED, MAM_DATA_VARIABLE);
computeData->d = ((double *)value);
malleability_get_data(&value, entry++, 0, 1);
computeData->vec = ((double *)value);
malleability_get_data(&value, entry++, 0, 1);
computeData->res = ((double *)value);
malleability_get_data(&value, entry++, 0, 1);
computeData->z = ((double *)value);
get_dist(computeData->n, dist_data->myId, dist_data->numP, dist_data);
get_rows_dist(computeData, dist_data->numP, computeData->n);
//CreateDoubles(&computeData->d, dist_data->tamBl);
//rcopy (&(dist_data->tamBl), &(computeData->d_full[dist_data->ini]), &IONE, computeData->d, &IONE); // d = d_full[ini] to d_full[ini+tamBl]
CreateDoubles(&computeData->d_full, computeData->n);
rcopy (&(dist_data->tamBl), computeData->d, &IONE, &(computeData->d_full[dist_data->ini]), &IONE); // d_full[ini] to d_full[ini+tamBl] = d
malleability_get_entries(&entries, 0, 0); //Get if there is any asynch data to recover
if(entries) { is_synch=0; entry=0; }
MAM_Data_get_pointer(&value, entry++, &total_qty, &type, MAM_DATA_DISTRIBUTED, MAM_DATA_VARIABLE);
computeData->vec = ((double *)value);
MAM_Data_get_pointer(&value, entry++, &total_qty, &type, MAM_DATA_DISTRIBUTED, MAM_DATA_VARIABLE);
computeData->res = ((double *)value);
MAM_Data_get_pointer(&value, entry++, &total_qty, &type, MAM_DATA_DISTRIBUTED, MAM_DATA_VARIABLE);
computeData->z = ((double *)value);
malleability_get_data(&value, entry++, 0, is_synch);
MAM_Data_get_pointer(&value, 0, &total_qty, &type, MAM_DATA_DISTRIBUTED, MAM_DATA_CONSTANT);
computeData->vlen = ((int *)value);
CreateSparseMatrixVptr(&(computeData->subm), dist_data->tamBl, computeData->n, 0);
computeData->subm.vptr[0] = 0;
for(i=0; i<dist_data->tamBl; i++) {
computeData->subm.vptr[i+1] = computeData->vlen[i];
}
TransformLengthtoHeader(computeData->subm.vptr, computeData->subm.dim1); // The array is converted from vlen to vptr
malleability_get_data(&value, entry++, 0, is_synch);
computeData->subm.vpos = ((int *)value);
malleability_get_data(&value, entry++, 0, is_synch);
computeData->subm.vval = ((double *)value);
}
void update_dist_data(struct Dist_data *dist_data) {
int myId, numP;
get_malleability_user_comm(&(dist_data->comm));
MPI_Comm_size(dist_data->comm, &numP);
MPI_Comm_rank(dist_data->comm, &myId);
dist_data->myId = myId;
dist_data->numP = numP;
start_time = user_data->start_time;
computeData->subm = user_data->other_subm;
*user_data = empty_user_data;
user_data->start_time = start_time;
user_data->array_vptr = computeData->subm.vptr;
user_data->array_vpos = computeData->subm.vpos;
user_data->array_vval = computeData->subm.vval;
user_data->comm = dist_data->comm;
}
/*
@@ -747,15 +882,46 @@ void get_dist(int total_r, int id, int numP, struct Dist_data *dist_data) {
dist_data->tamBl = dist_data->fin - dist_data->ini;
}
void prepare_redist_counts(int *counts, int *displs, int numP_other, int offset, struct Dist_data dist_data, int *vptr) {
int idS[2], i, idS_zero;
int last_index, first_index;
getIds_intercomm(dist_data, numP_other, idS);
idS[0] += offset;
idS[1] += offset;
idS_zero = 0;
if(!idS[0]) {
set_counts(0, numP_other, dist_data, offset, counts);
idS_zero = 1;
}
for(i=idS[0] + idS_zero; i<idS[1]; i++) {
set_counts(i, numP_other, dist_data, offset, counts);
displs[i] = displs[i-1] + counts[i-1];
}
if(!idS[0]) {
last_index = counts[0];
first_index = 0;
counts[0] = vptr[last_index] - vptr[first_index];
}
for(i=idS[0] + idS_zero; i<idS[1]; i++) {
last_index = displs[i] + counts[i];
first_index = displs[i];
counts[i] = vptr[last_index] - vptr[first_index];
displs[i] = displs[i-1] + counts[i-1];
}
}
/*
 * Obtains, for a given process id, how many elements
 * process myId is going to send to / receive from it
*/
void set_counts(int id, int numP, struct Dist_data data_dist, int *sendcounts) {
void set_counts(int id, int numP, struct Dist_data data_dist, int offset, int *sendcounts) {
struct Dist_data other;
int biggest_ini, smallest_end, tot_rows;
get_dist(data_dist.tot_r, id, numP, &other);
get_dist(data_dist.tot_r, id-offset, numP, &other);
// If the value ranges do not overlap, skip to the next process
if(data_dist.ini >= other.fin || data_dist.fin <= other.ini) {
@@ -763,18 +929,10 @@ void set_counts(int id, int numP, struct Dist_data data_dist, int *sendcounts) {
}
// Take the larger ini of the two processes
if(data_dist.ini > other.ini) {
biggest_ini = data_dist.ini;
} else {
biggest_ini = other.ini;
}
biggest_ini = (data_dist.ini > other.ini) ? data_dist.ini : other.ini;
// Take the smaller fin of the two processes
if(data_dist.fin < other.fin) {
smallest_end = data_dist.fin;
} else {
smallest_end = other.fin;
}
smallest_end = (data_dist.fin < other.fin) ? data_dist.fin : other.fin;
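// e.g. if this process owns rows [100,200) and process id owns rows [150,250),
// the shared block is [150,200) and sendcounts[id] becomes 50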
sendcounts[id] = smallest_end - biggest_ini; // Number of elements to send to / receive from process id
}
@@ -785,7 +943,7 @@ void set_counts(int id, int numP, struct Dist_data data_dist, int *sendcounts) {
 * Returns the first and the last (excluded) identifier
 * with which to communicate.
*/
void getIds_intercomm(struct Dist_data dist_data, int numP_other, int **idS) {
void getIds_intercomm(struct Dist_data dist_data, int numP_other, int *idS) {
int idI, idE;
int tamOther = dist_data.tot_r / numP_other;
int remOther = dist_data.tot_r % numP_other;
@@ -805,18 +963,16 @@ void getIds_intercomm(struct Dist_data dist_data, int numP_other, int **idS) {
idE = ((dist_data.fin - middle) % tamOther > 0 && idE+1 <= numP_other) ? idE+1 : idE;
}
//free(*idS);
CreateInts(idS, 2);
(*idS)[0] = idI;
(*idS)[1] = idE;
idS[0] = idI;
idS[1] = idE;
}
void print_global_results() {
void print_global_results(double start_time) {
size_t i;
double sp_time, sy_time, asy_time, mall_time, global_time;
retrieve_results(&sp_time, &sy_time, &asy_time, &mall_time, &global_time);
global_time = MPI_Wtime() - global_time;
MAM_Retrieve_times(&sp_time, &sy_time, &asy_time, &mall_time);
global_time = MPI_Wtime() - start_time;
printf("T_spawn: %lf", sp_time);
printf("\nT_SR: %lf", sy_time);
printf("\nT_AR: %lf", asy_time);
@@ -5,6 +5,14 @@ C_FLAGS =
LD_FLAGS = -lm -pthread
DEF =
USE_MAL_SLURM ?= 1
USE_MAL_BARRIERS ?= 0
USE_MAL_DEBUG ?= 0
ifeq ($(USE_MAL_SLURM),1)
LD_FLAGS += -lslurm
endif
DEF = -DUSE_MAL_SLURM=$(USE_MAL_SLURM) -DUSE_MAL_BARRIERS=$(USE_MAL_BARRIERS) -DUSE_MAL_DEBUG=$(USE_MAL_DEBUG)
.PHONY : clean clear install install_slurm
@@ -59,8 +67,3 @@ clear:
install: $(BIN)
echo "Done"
# Builds target with slurm
install_slurm: LD_FLAGS += -lslurm
install_slurm: DEF += -DUSE_SLURM
install_slurm: install
@@ -4,19 +4,21 @@
#include <string.h>
#include "distribution_methods/block_distribution.h"
#include "CommDist.h"
#include "MAM_Configuration.h"
#include "malleabilityDataStructures.h"
void prepare_redistribution(int qty, int mal_type, int myId, int numP, int numO, int is_children_group, int is_intercomm, int is_sync, void **recv, struct Counts *s_counts, struct Counts *r_counts);
void prepare_redistribution(int qty, MPI_Datatype datatype, int numP, int numO, int is_children_group, void **recv, struct Counts *s_counts, struct Counts *r_counts);
void check_requests(struct Counts s_counts, struct Counts r_counts, MPI_Request **requests, size_t *request_qty);
void sync_point2point(void *send, void *recv, int mal_type, int is_intercomm, int myId, struct Counts s_counts, struct Counts r_counts, MPI_Comm comm);
void sync_point2point(void *send, void *recv, MPI_Datatype datatype, struct Counts s_counts, struct Counts r_counts, MPI_Comm comm);
void sync_rma(void *send, void *recv, MPI_Datatype datatype, struct Counts r_counts, int tamBl, MPI_Comm comm);
void sync_rma_lock(void *recv, MPI_Datatype datatype, struct Counts r_counts, MPI_Win win);
void sync_rma_lockall(void *recv, MPI_Datatype datatype, struct Counts r_counts, MPI_Win win);
void async_point2point(void *send, void *recv, int mal_type, struct Counts s_counts, struct Counts r_counts, MPI_Comm comm, MPI_Request *requests);
void perform_manual_communication(void *send, void *recv, int mal_type, int myId, struct Counts s_counts, struct Counts r_counts);
void *ind_send, *ind_recv; //FIXME Delete
void recalculate_counts(struct Counts *counts, int *array, void **recv, int mal_type);
int recalculate_elems(int *array, int ini, int fin);
void async_point2point(void *send, void *recv, MPI_Datatype datatype, struct Counts s_counts, struct Counts r_counts, MPI_Comm comm, MPI_Request *requests);
void async_rma(void *send, void *recv, MPI_Datatype datatype, struct Counts r_counts, int tamBl, MPI_Comm comm, MPI_Request *requests, MPI_Win *win);
void async_rma_lock(void *recv, MPI_Datatype datatype, struct Counts r_counts, MPI_Win win, MPI_Request *requests);
void async_rma_lockall(void *recv, MPI_Datatype datatype, struct Counts r_counts, MPI_Win win, MPI_Request *requests);
/*
 * Reserves memory for a vector of up to "qty" elements.
@@ -57,7 +59,6 @@ void malloc_comm_array(char **array, int qty, int myId, int numP) {
* - recv (OUT): Array where data will be written. A NULL value is allowed if the process is not going to receive data.
* If the process receives data and is NULL, the behaviour is undefined.
* - qty (IN): Sum of elements shared by all processes that will send data.
 * - myId (IN): Rank of the MPI process in the local communicator. For the parents it is not the rank obtained from "comm".
 * - numP (IN): Size of the local group. If it is a children group, this parameter must correspond to using
 * "MPI_Comm_size(comm)". For the parents it is not always the size obtained from "comm".
 * - numO (IN): Amount of processes in the remote group. For the parents it is the target quantity of processes after the
@@ -65,79 +66,37 @@
 * - is_children_group (IN): Indicates whether this MPI rank is a child (TRUE) or a parent (FALSE).
* - comm (IN): Communicator to use to perform the redistribution.
*
 * returns: An integer indicating if the operation has been completed (TRUE) or not (FALSE). //FIXME In this case it is always true...
*/
int sync_communication(void *send, void **recv, int qty, int mal_type, int dependency, int myId, int numP, int numO, int is_children_group, int red_method, MPI_Comm comm) {
int is_intercomm, aux_comm_used = 0;
void sync_communication(void *send, void **recv, int qty, MPI_Datatype datatype, int numP, int numO, int is_children_group, MPI_Comm comm) {
struct Counts s_counts, r_counts;
struct Dist_data dist_data;
MPI_Datatype datatype;
MPI_Comm aux_comm = MPI_COMM_NULL;
/* PREPARE COMMUNICATION */
MPI_Comm_test_inter(comm, &is_intercomm);
prepare_redistribution(qty, mal_type, myId, numP, numO, is_children_group, is_intercomm, 1, recv, &s_counts, &r_counts);
if(is_intercomm) {
MPI_Intercomm_merge(comm, is_children_group, &aux_comm);
aux_comm_used = 1;
} else { aux_comm = comm; }
if(mal_type == MAL_INT) {
datatype = MPI_INT;
} else if(mal_type == MAL_DOUBLE) {
datatype = MPI_DOUBLE;
} else if(mal_type == MAL_CHAR) {
datatype = MPI_CHAR;
} else {
printf("Malleability -- Redistribution type not recognised\n");
MPI_Abort(MPI_COMM_WORLD, -1);
}
prepare_redistribution(qty, datatype, numP, numO, is_children_group, recv, &s_counts, &r_counts);
if(dependency == 1+MAL_DATA_DEPENDENT) {
/* PERFORM COMMUNICATION */
switch(mall_conf->red_method) {
case MALL_RED_RMA_LOCKALL:
case MALL_RED_RMA_LOCK:
if(is_children_group) {
recalculate_counts(&r_counts, (int *) ind_recv, recv, mal_type);
//get_block_dist(qty, myId, numP, &dist_data);
//print_counts(dist_data, r_counts.counts, r_counts.displs, numO, 0, "Children C ");
dist_data.tamBl = 0;
} else {
recalculate_counts(&s_counts, (int *) ind_send, recv, mal_type);
//get_block_dist(qty, myId, numP, &dist_data);
//print_counts(dist_data, s_counts.counts, s_counts.displs, numO, 0, "Parents ");
if(!is_intercomm) {
recalculate_counts(&r_counts, (int *) ind_recv, recv, mal_type);
}
}
get_block_dist(qty, mall->myId, numO, &dist_data);
}
sync_rma(send, *recv, datatype, r_counts, dist_data.tamBl, comm);
break;
/* PERFORM COMMUNICATION */
switch(red_method) {
case MALL_RED_POINT:
sync_point2point(send, *recv, mal_type, is_intercomm, myId, s_counts, r_counts, aux_comm);
sync_point2point(send, *recv, datatype, s_counts, r_counts, comm);
break;
case MALL_RED_BASELINE:
default:
MPI_Alltoallv(send, s_counts.counts, s_counts.displs, datatype, *recv, r_counts.counts, r_counts.displs, datatype, aux_comm);
MPI_Alltoallv(send, s_counts.counts, s_counts.displs, datatype, *recv, r_counts.counts, r_counts.displs, datatype, comm);
break;
}
if(aux_comm_used) {
MPI_Comm_free(&aux_comm);
}
if(dependency == 1+MAL_DATA_INDEPENDENT) {
if(is_children_group) {
ind_recv = *recv;
} else {
ind_send = send;
if(!is_intercomm) {
ind_recv = *recv;
}
}
}
freeCounts(&s_counts);
freeCounts(&r_counts);
return 1; //FIXME In this case it is always true...
}
/*
@@ -147,9 +106,6 @@ int sync_communication(void *send, void **recv, int qty, int mal_type, int depen
* - send (IN): Array with the data to send. This value can not be NULL for parents.
* - recv (OUT): Array where data will be written. A NULL value is allowed if the process is not going to
* receive data. If the process receives data and is NULL, the behaviour is undefined.
 * - is_intercomm (IN): Indicates whether the communicator is an intercommunicator (TRUE) or an
 * intracommunicator (FALSE).
 * - myId (IN): Rank of the MPI process in the local communicator. For the parents it is not the rank obtained from "comm".
 * - s_counts (IN): Struct which describes how many elements this process will send to each child and
 * the displacements.
 * - r_counts (IN): Structure which describes how many elements this process will receive from each parent
@@ -157,32 +113,20 @@ int sync_communication(void *send, void **recv, int qty, int mal_type, int depen
* - comm (IN): Communicator to use to perform the redistribution.
*
*/
void sync_point2point(void *send, void *recv, int mal_type, int is_intercomm, int myId, struct Counts s_counts, struct Counts r_counts, MPI_Comm comm) {
int i, j, init, end, total_sends;
size_t datasize, offset;
void sync_point2point(void *send, void *recv, MPI_Datatype datatype, struct Counts s_counts, struct Counts r_counts, MPI_Comm comm) {
int i, j, init, end, total_sends, datasize;
size_t offset, offset2;
MPI_Request *sends;
MPI_Datatype datatype;
if(mal_type == MAL_INT) {
datatype = MPI_INT;
datasize = sizeof(int);
} else if(mal_type == MAL_DOUBLE) {
datatype = MPI_DOUBLE;
datasize = sizeof(double);
} else if(mal_type == MAL_CHAR) {
datatype = MPI_CHAR;
datasize = sizeof(char);
} else {
printf("Malleability -- Redistribution type not recognised\n");
MPI_Abort(MPI_COMM_WORLD, -1);
}
MPI_Type_size(datatype, &datasize);
init = s_counts.idI;
end = s_counts.idE;
if(!is_intercomm && (s_counts.idI == myId || s_counts.idE == myId + 1)) {
perform_manual_communication(send, recv, mal_type, myId, s_counts, r_counts);
if(mall_conf->spawn_method == MALL_SPAWN_MERGE && (s_counts.idI == mall->myId || s_counts.idE == mall->myId + 1)) {
offset = s_counts.displs[mall->myId] * datasize;
offset2 = r_counts.displs[mall->myId] * datasize;
memcpy(recv+offset2, send+offset, s_counts.counts[mall->myId] * datasize);
if(s_counts.idI == myId) init = s_counts.idI+1;
if(s_counts.idI == mall->myId) init = s_counts.idI+1;
else end = s_counts.idE-1;
}
@@ -200,9 +144,9 @@ void sync_point2point(void *send, void *recv, int mal_type, int is_intercomm, in
init = r_counts.idI;
end = r_counts.idE;
if(!is_intercomm) {
if(r_counts.idI == myId) init = r_counts.idI+1;
else if(r_counts.idE == myId + 1) end = r_counts.idE-1;
if(mall_conf->spawn_method == MALL_SPAWN_MERGE) {
if(r_counts.idI == mall->myId) init = r_counts.idI+1;
else if(r_counts.idE == mall->myId + 1) end = r_counts.idE-1;
}
for(i=init; i<end; i++) {
@@ -212,7 +156,102 @@ void sync_point2point(void *send, void *recv, int mal_type, int is_intercomm, in
if(total_sends > 0) {
MPI_Waitall(total_sends, sends, MPI_STATUSES_IGNORE);
free(sends);
}
}
/*
 * Performs synchronous MPI-RMA operations to redistribute an array in a block distribution. It should be called after calculating
* how data should be redistributed
*
* - send (IN): Array with the data to send. This value can be NULL for children.
* - recv (OUT): Array where data will be written. A NULL value is allowed if the process is not going to receive data.
* If the process receives data and is NULL, the behaviour is undefined.
 * - r_counts (IN): Structure which describes how many elements this process will receive from each parent and the
* displacements.
* - tamBl (IN): How many elements are stored in the parameter "send".
* - comm (IN): Communicator to use to perform the redistribution. Must be an intracommunicator as MPI-RMA requirements.
*
* FIXME: In libfabric one of these macros defines the maximum amount of BYTES that can be communicated in a SINGLE MPI_Get
* A window can have more bytes than the amount shown in those macros, therefore, if you want to read more than that amount
 * you need to perform multiple Gets.
* prov/psm3/psm3/psm_config.h:179:#define MQ_SHM_THRESH_RNDV 16000
* prov/psm3/psm3/ptl_am/am_config.h:62:#define PSMI_MQ_RV_THRESH_CMA 16000
* prov/psm3/psm3/ptl_am/am_config.h:65:#define PSMI_MQ_RV_THRESH_NO_KASSIST 16000
*/
void sync_rma(void *send, void *recv, MPI_Datatype datatype, struct Counts r_counts, int tamBl, MPI_Comm comm) {
int datasize;
MPI_Win win;
MPI_Type_size(datatype, &datasize);
MPI_Win_create(send, (MPI_Aint)tamBl * datasize, datasize, MPI_INFO_NULL, comm, &win);
#if USE_MAL_DEBUG >= 3
DEBUG_FUNC("Created Window for synchronous RMA communication", mall->myId, mall->numP); fflush(stdout); MPI_Barrier(comm);
#endif
switch(mall_conf->red_method) {
case MALL_RED_RMA_LOCKALL:
sync_rma_lockall(recv, datatype, r_counts, win);
break;
case MALL_RED_RMA_LOCK:
sync_rma_lock(recv, datatype, r_counts, win);
break;
}
#if USE_MAL_DEBUG >= 3
DEBUG_FUNC("Completed synchronous RMA communication", mall->myId, mall->numP); fflush(stdout); MPI_Barrier(comm);
#endif
MPI_Win_free(&win);
}
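/* A hedged sketch of the workaround suggested by the FIXME above: splitting one
 * large read into several MPI_Get calls that each stay under the transport
 * rendezvous threshold. This helper is hypothetical and not part of this
 * commit; it must be called inside a passive-target epoch, as in
 * sync_rma_lock/sync_rma_lockall above. */
static void chunked_get(void *recv, int count, MPI_Datatype datatype, int target,
                        int target_displ, MPI_Win win) {
    int datasize, done = 0, chunk;
    MPI_Type_size(datatype, &datasize);
    chunk = 16000 / datasize;              /* stay under the 16000 B threshold */
    while (done < count) {                 /* one Get per chunk                */
        int now = (count - done) < chunk ? (count - done) : chunk;
        /* the displacement is in window units (disp_unit == datasize above)   */
        MPI_Get((char *)recv + (size_t)done * datasize, now, datatype,
                target, target_displ + done, now, datatype, win);
        done += now;
    }
}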
/*
* Performs a passive MPI-RMA data redistribution for a single array using the passive epochs Lock/Unlock.
* - recv (OUT): Array where data will be written. A NULL value is allowed if the process is not going to receive data.
* If the process receives data and is NULL, the behaviour is undefined.
 * - r_counts (IN): Structure which describes how many elements this process will receive from each parent and the
* displacements.
* - win (IN): Window to use to perform the redistribution.
*
*/
void sync_rma_lock(void *recv, MPI_Datatype datatype, struct Counts r_counts, MPI_Win win) {
int i, target_displs, datasize;
size_t offset;
MPI_Type_size(datatype, &datasize);
target_displs = r_counts.first_target_displs; //TODO Check that datasize is not needed
for(i=r_counts.idI; i<r_counts.idE; i++) {
offset = r_counts.displs[i] * datasize;
MPI_Win_lock(MPI_LOCK_SHARED, i, MPI_MODE_NOCHECK, win);
MPI_Get(recv+offset, r_counts.counts[i], datatype, i, target_displs, r_counts.counts[i], datatype, win);
MPI_Win_unlock(i, win);
target_displs=0;
}
}
/*
* Performs a passive MPI-RMA data redistribution for a single array using the passive epochs Lockall/Unlockall.
* - recv (OUT): Array where data will be written. A NULL value is allowed if the process is not going to receive data.
* If the process receives data and is NULL, the behaviour is undefined.
 * - r_counts (IN): Structure which describes how many elements this process will receive from each parent and the
* displacements.
* - win (IN): Window to use to perform the redistribution.
*
*/
void sync_rma_lockall(void *recv, MPI_Datatype datatype, struct Counts r_counts, MPI_Win win) {
int i, target_displs, datasize;
size_t offset;
MPI_Type_size(datatype, &datasize);
target_displs = r_counts.first_target_displs; //TODO Check that datasize is not needed
MPI_Win_lock_all(MPI_MODE_NOCHECK, win);
for(i=r_counts.idI; i<r_counts.idE; i++) {
offset = r_counts.displs[i] * datasize;
MPI_Get(recv+offset, r_counts.counts[i], datatype, i, target_displs, r_counts.counts[i], datatype, win);
target_displs=0;
}
MPI_Win_unlock_all(win);
}
//================================================================================
@@ -222,7 +261,6 @@ void sync_point2point(void *send, void *recv, int mal_type, int is_intercomm, in
//================================================================================
/*
* //TODO Añadir estrategia IBARRIER
* Performs a communication to redistribute an array in a block distribution with non-blocking MPI functions.
* In the redistribution is differenciated parent group from the children and the values each group indicates can be
* different.
@@ -231,7 +269,6 @@ void sync_point2point(void *send, void *recv, int mal_type, int is_intercomm, in
* - recv (OUT): Array where data will be written. A NULL value is allowed if the process is not going to receive data.
* If the process receives data and is NULL, the behaviour is undefined.
* - qty (IN): Sum of elements shared by all processes that will send data.
 * - myId (IN): Rank of the MPI process in the local communicator. For the parents it is not the rank obtained from "comm".
 * - numP (IN): Size of the local group. If it is a children group, this parameter must correspond to using
 * "MPI_Comm_size(comm)". For the parents it is not always the size obtained from "comm".
 * - numO (IN): Amount of processes in the remote group. For the parents it is the target quantity of processes after the
@@ -243,87 +280,102 @@ void sync_point2point(void *send, void *recv, int mal_type, int is_intercomm, in
* - request_qty (OUT): Quantity of requests to be used. If a process sends and receives data, this value will be
* modified to the expected value.
*
 * returns: An integer indicating if the operation has been completed (TRUE) or not (FALSE). //FIXME In this case it is always false...
*/
int async_communication(void *send, void **recv, int qty, int mal_type, int dependency, int myId, int numP, int numO, int is_children_group, int red_method, int red_strategies, MPI_Comm comm, MPI_Request **requests, size_t *request_qty) {
int is_intercomm, aux_comm_used = 0;
void async_communication_start(void *send, void **recv, int qty, MPI_Datatype datatype, int numP, int numO, int is_children_group, MPI_Comm comm, MPI_Request **requests, size_t *request_qty, MPI_Win *win) {
struct Counts s_counts, r_counts;
MPI_Datatype datatype;
MPI_Comm aux_comm = MPI_COMM_NULL;
struct Dist_data dist_data;
/* PREPARE COMMUNICATION */
MPI_Comm_test_inter(comm, &is_intercomm);
prepare_redistribution(qty, mal_type, myId, numP, numO, is_children_group, is_intercomm, 1, recv, &s_counts, &r_counts);
prepare_redistribution(qty, datatype, numP, numO, is_children_group, recv, &s_counts, &r_counts);
check_requests(s_counts, r_counts, requests, request_qty);
if(is_intercomm) {
MPI_Intercomm_merge(comm, is_children_group, &aux_comm);
aux_comm_used = 1;
} else { aux_comm = comm; }
if(mal_type == MAL_INT) {
datatype = MPI_INT;
} else if(mal_type == MAL_DOUBLE) {
datatype = MPI_DOUBLE;
} else if(mal_type == MAL_CHAR) {
datatype = MPI_CHAR;
} else {
printf("Malleability -- Redistribution type not recognised\n");
MPI_Abort(MPI_COMM_WORLD, -1);
}
/* PERFORM COMMUNICATION */
switch(mall_conf->red_method) {
if(dependency == 1+MAL_DATA_DEPENDENT) {
case MALL_RED_RMA_LOCKALL:
case MALL_RED_RMA_LOCK:
if(is_children_group) {
recalculate_counts(&r_counts, (int *) ind_recv, recv, mal_type);
dist_data.tamBl = 0;
} else {
recalculate_counts(&s_counts, (int *) ind_send, recv, mal_type);
if(!is_intercomm) {
recalculate_counts(&r_counts, (int *) ind_recv, recv, mal_type);
}
get_block_dist(qty, mall->myId, numO, &dist_data);
}
}
/* PERFORM COMMUNICATION */
switch(red_method) {
async_rma(send, *recv, datatype, r_counts, dist_data.tamBl, comm, *requests, win);
break;
case MALL_RED_POINT:
async_point2point(send, *recv, mal_type, s_counts, r_counts, aux_comm, *requests);
async_point2point(send, *recv, datatype, s_counts, r_counts, comm, *requests);
break;
case MALL_RED_BASELINE:
default:
MPI_Ialltoallv(send, s_counts.counts, s_counts.displs, datatype, *recv, r_counts.counts, r_counts.displs, datatype, aux_comm, &((*requests)[0]));
MPI_Ialltoallv(send, s_counts.counts, s_counts.displs, datatype, *recv, r_counts.counts, r_counts.displs, datatype, comm, &((*requests)[0]));
break;
}
/* POST REQUESTS CHECKS */
if(is_children_group) {
MPI_Waitall(*request_qty, *requests, MPI_STATUSES_IGNORE);
}
freeCounts(&s_counts);
freeCounts(&r_counts);
}
if(malleability_red_contains_strat(red_strategies, MALL_RED_IBARRIER, NULL)) { //FIXME Strategy not fully implemented
MPI_Ibarrier(comm, &((*requests)[*request_qty-1]) ); //FIXME Not easy to read...
if(is_children_group) {
MPI_Wait(&((*requests)[*request_qty-1]), MPI_STATUSES_IGNORE); //FIXME Not easy to read...
}
}
/*
 * Checks whether a set of requests has been completed (1) or not (0).
*
 * - is_children_group (IN): Indicates whether this MPI rank is a child (TRUE) or a parent (FALSE).
* - requests (IN): Pointer to array of requests to be used to determine if the communication has ended.
* - request_qty (IN): Quantity of requests in "requests".
*
* returns: An integer indicating if the operation has been completed(TRUE) or not(FALSE).
*/
int async_communication_check(int is_children_group, MPI_Request *requests, size_t request_qty) {
int completed, req_completed, test_err;
size_t i;
completed = 1;
test_err = MPI_SUCCESS;
if(aux_comm_used) {
MPI_Comm_free(&aux_comm);
}
if (is_children_group) return 1; //FIXME Should return a negative number
if(dependency == 1+MAL_DATA_INDEPENDENT) {
if(is_children_group) {
ind_recv = *recv;
} else {
ind_send = send;
if(!is_intercomm) {
ind_recv = *recv;
}
for(i=0; i<request_qty; i++) {
test_err = MPI_Test(&(requests[i]), &req_completed, MPI_STATUS_IGNORE);
completed = completed && req_completed;
}
//test_err = MPI_Testall(request_qty, requests, &completed, MPI_STATUSES_IGNORE); //FIXME Some kind of bug with Mpich.
if (test_err != MPI_SUCCESS && test_err != MPI_ERR_PENDING) {
printf("P%d aborting -- Test Async\n", mall->myId);
MPI_Abort(MPI_COMM_WORLD, test_err);
}
freeCounts(&s_counts);
freeCounts(&r_counts);
return 0; //FIXME In this case it is always false...
return completed;
}
/*
* Waits until the completion of a set of requests. If the Ibarrier strategy
* is being used, the corresponding ibarrier is posted.
*
* - requests (IN): Pointer to array of requests to be used to determine if the communication has ended.
* - request_qty (IN): Quantity of requests in "requests".
*/
void async_communication_wait(MPI_Request *requests, size_t request_qty) {
MPI_Waitall(request_qty, requests, MPI_STATUSES_IGNORE);
#if USE_MAL_DEBUG >= 3
DEBUG_FUNC("Processes Waitall completed", mall->myId, mall->numP); fflush(stdout); MPI_Barrier(MPI_COMM_WORLD);
#endif
}
/*
* Frees Requests/Windows associated to a particular redistribution.
* Should be called for each output result of calling "async_communication_start".
*
* - requests (IN): Pointer to array of requests to be used to determine if the communication has ended.
* - request_qty (IN): Quantity of requests in "requests".
* - win (IN): Window to free.
*/
void async_communication_end(MPI_Request *requests, size_t request_qty, MPI_Win *win) {
//To disconnect both groups of processes it is necessary to tell MPI that this comm
//has finished, even though this point can only be reached once it has finished
if(MAM_Contains_strat(MAM_RED_STRATEGIES, MAM_STRAT_RED_WAIT_TARGETS, NULL)) { MPI_Waitall(request_qty, requests, MPI_STATUSES_IGNORE); }
if(mall_conf->red_method == MALL_RED_RMA_LOCKALL || mall_conf->red_method == MALL_RED_RMA_LOCK) { MPI_Win_free(win); }
}
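/* A minimal usage sketch (not part of this commit) of the asynchronous API
 * defined above: post the redistribution, poll it from the solver loop, then
 * wait and release the resources. Buffers and sizes are hypothetical; requests
 * is allocated by check_requests inside async_communication_start, so freeing
 * it here assumes <stdlib.h> is available. */
static void example_async_redist(double *send, int qty, int numP, int numO,
                                 int is_children_group, MPI_Comm comm) {
    void *recv = NULL;
    MPI_Request *requests = NULL;          /* allocated by check_requests      */
    size_t request_qty = 0;
    MPI_Win win = MPI_WIN_NULL;

    async_communication_start(send, &recv, qty, MPI_DOUBLE, numP, numO,
                              is_children_group, comm, &requests, &request_qty, &win);
    while (!async_communication_check(is_children_group, requests, request_qty)) {
        /* overlap computation with the ongoing redistribution here */
    }
    async_communication_wait(requests, request_qty);
    async_communication_end(requests, request_qty, &win);
    free(requests);
}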
/*
@@ -341,24 +393,10 @@ int async_communication(void *send, void **recv, int qty, int mal_type, int depe
* - requests (OUT): Pointer to array of requests to be used to determine if the communication has ended.
*
*/
void async_point2point(void *send, void *recv, int mal_type, struct Counts s_counts, struct Counts r_counts, MPI_Comm comm, MPI_Request *requests) {
int i, j = 0;
size_t datasize, offset;
MPI_Datatype datatype;
if(mal_type == MAL_INT) {
datatype = MPI_INT;
datasize = sizeof(int);
} else if(mal_type == MAL_DOUBLE) {
datatype = MPI_DOUBLE;
datasize = sizeof(double);
} else if(mal_type == MAL_CHAR) {
datatype = MPI_CHAR;
datasize = sizeof(char);
} else {
printf("Malleability -- Redistribution type not recognised\n");
MPI_Abort(MPI_COMM_WORLD, -1);
}
void async_point2point(void *send, void *recv, MPI_Datatype datatype, struct Counts s_counts, struct Counts r_counts, MPI_Comm comm, MPI_Request *requests) {
int i, j = 0, datasize;
size_t offset;
MPI_Type_size(datatype, &datasize);
for(i=s_counts.idI; i<s_counts.idE; i++) {
offset = s_counts.displs[i] * datasize;
@@ -373,6 +411,89 @@ void async_point2point(void *send, void *recv, int mal_type, struct Counts s_cou
}
}
/*
 * Performs asynchronous MPI-RMA operations to redistribute an array in a block distribution. It should be called after calculating
* how data should be redistributed.
*
* - send (IN): Array with the data to send. This value can be NULL for children.
* - recv (OUT): Array where data will be written. A NULL value is allowed if the process is not going to receive data.
* If the process receives data and is NULL, the behaviour is undefined.
 * - r_counts (IN): Structure which describes how many elements this process will receive from each parent and the
* displacements.
* - tamBl (IN): How many elements are stored in the parameter "send".
* - comm (IN): Communicator to use to perform the redistribution. Must be an intracommunicator as MPI-RMA requirements.
* - window (OUT): Pointer to a window object used for the RMA operations.
* - requests (OUT): Pointer to array of requests to be used to determine if the communication has ended.
*
*/
void async_rma(void *send, void *recv, MPI_Datatype datatype, struct Counts r_counts, int tamBl, MPI_Comm comm, MPI_Request *requests, MPI_Win *win) {
int datasize;
MPI_Type_size(datatype, &datasize);
MPI_Win_create(send, (MPI_Aint)tamBl * datasize, datasize, MPI_INFO_NULL, comm, win);
switch(mall_conf->red_method) {
case MALL_RED_RMA_LOCKALL:
async_rma_lockall(recv, datatype, r_counts, *win, requests);
break;
case MALL_RED_RMA_LOCK:
async_rma_lock(recv, datatype, r_counts, *win, requests);
break;
}
}
/*
* Performs an asynchronous and passive MPI-RMA data redistribution for a single array using the passive epochs Lock/Unlock.
* - recv (OUT): Array where data will be written. A NULL value is allowed if the process is not going to receive data.
* If the process receives data and is NULL, the behaviour is undefined.
 * - r_counts (IN): Structure which describes how many elements this process will receive from each parent and the
* displacements.
* - win (IN): Window to use to perform the redistribution.
* - requests (OUT): Pointer to array of requests to be used to determine if the communication has ended.
*
*/
void async_rma_lock(void *recv, MPI_Datatype datatype, struct Counts r_counts, MPI_Win win, MPI_Request *requests) {
int i, target_displs, j = 0, datasize;
size_t offset;
MPI_Type_size(datatype, &datasize);
target_displs = r_counts.first_target_displs; //TODO Check that datasize is not needed
for(i=r_counts.idI; i<r_counts.idE; i++) {
offset = r_counts.displs[i] * datasize;
MPI_Win_lock(MPI_LOCK_SHARED, i, MPI_MODE_NOCHECK, win);
MPI_Rget(recv+offset, r_counts.counts[i], datatype, i, target_displs, r_counts.counts[i], datatype, win, &(requests[j]));
MPI_Win_unlock(i, win);
target_displs=0;
j++;
}
}
/*
* Performs an asynchronous and passive MPI-RMA data redistribution for a single array using the passive epochs Lockall/Unlockall.
* - recv (OUT): Array where data will be written. A NULL value is allowed if the process is not going to receive data.
* If the process receives data and is NULL, the behaviour is undefined.
 * - r_counts (IN): Structure which describes how many elements this process will receive from each parent and the
* displacements.
* - win (IN): Window to use to perform the redistribution.
* - requests (OUT): Pointer to array of requests to be used to determine if the communication has ended.
*
*/
void async_rma_lockall(void *recv, MPI_Datatype datatype, struct Counts r_counts, MPI_Win win, MPI_Request *requests) {
int i, target_displs, j = 0, datasize;
size_t offset;
MPI_Type_size(datatype, &datasize);
target_displs = r_counts.first_target_displs; //TODO Check that datasize is not needed
MPI_Win_lock_all(MPI_MODE_NOCHECK, win);
for(i=r_counts.idI; i<r_counts.idE; i++) {
offset = r_counts.displs[i] * datasize;
MPI_Rget(recv+offset, r_counts.counts[i], datatype, i, target_displs, r_counts.counts[i], datatype, win, &(requests[j]));
target_displs=0;
j++;
}
MPI_Win_unlock_all(win);
}
/*
* ========================================================================================
* ========================================================================================
@@ -386,65 +507,64 @@ void async_point2point(void *send, void *recv, int mal_type, struct Counts s_cou
 * how many elements it sends/receives to other processes for the new group.
*
* - qty (IN): Sum of elements shared by all processes that will send data.
 * - myId (IN): Rank of the MPI process in the local communicator. For the parents it is not the rank obtained from "comm".
 * - numP (IN): Size of the local group. If it is a children group, this parameter must correspond to using
 * "MPI_Comm_size(comm)". For the parents it is not always the size obtained from "comm".
 * - numO (IN): Amount of processes in the remote group. For the parents it is the target quantity of processes after the
 * resize, while for the children it is the amount of parents.
 * - is_children_group (IN): Indicates whether this MPI rank is a child (TRUE) or a parent (FALSE).
 * - is_intercomm (IN): Indicates whether the used communicator is an intercommunicator (TRUE) or an intracommunicator (FALSE).
* - recv (OUT): Array where data will be written. A NULL value is allowed if the process is not going to receive data.
 * If the process receives data and is NULL, the behaviour is undefined.
 * - s_counts (OUT): Struct indicating how many elements this process sends to processes in the new group.
 * - r_counts (OUT): Struct indicating how many elements this process receives from other processes in the previous group.
*
*/
void prepare_redistribution(int qty, int mal_type, int myId, int numP, int numO, int is_children_group, int is_intercomm, int is_sync, void **recv, struct Counts *s_counts, struct Counts *r_counts) {
void prepare_redistribution(int qty, MPI_Datatype datatype, int numP, int numO, int is_children_group, void **recv, struct Counts *s_counts, struct Counts *r_counts) {
int array_size = numO;
int offset_ids = 0;
size_t datasize;
int datasize;
struct Dist_data dist_data;
if(mal_type == MAL_INT) {
datasize = sizeof(int);
} else if(mal_type == MAL_DOUBLE) {
datasize = sizeof(double);
} else if(mal_type == MAL_CHAR) {
datasize = sizeof(char);
} else {
printf("Malleability -- Redistribution type not recognised\n");
MPI_Abort(MPI_COMM_WORLD, -1);
}
if(is_intercomm && is_sync) {
offset_ids = numP; //FIXME Modify only if active?
if(mall_conf->spawn_method == MALL_SPAWN_BASELINE) {
offset_ids = MAM_Contains_strat(MAM_SPAWN_STRATEGIES, MAM_STRAT_SPAWN_INTERCOMM, NULL) ?
0 : numP;
} else {
array_size = numP > numO ? numP : numO;
}
mallocCounts(s_counts, array_size+offset_ids);
mallocCounts(r_counts, array_size+offset_ids);
MPI_Type_size(datatype, &datasize); //FIXME Right now derived datatypes are not ensured to work
if(is_children_group) {
offset_ids = 0;
prepare_comm_alltoall(myId, numP, numO, qty, offset_ids, r_counts);
prepare_comm_alltoall(mall->myId, numP, numO, qty, offset_ids, r_counts);
// Obtain the distribution for this child
get_block_dist(qty, myId, numP, &dist_data);
get_block_dist(qty, mall->myId, numP, &dist_data);
*recv = malloc(dist_data.tamBl * datasize);
//get_block_dist(qty, myId, numP, &dist_data);
//print_counts(dist_data, r_counts->counts, r_counts->displs, numO+offset_ids, 1, "Children C ");
#if USE_MAL_DEBUG >= 4
get_block_dist(qty, mall->myId, numP, &dist_data);
print_counts(dist_data, r_counts->counts, r_counts->displs, numO+offset_ids, 0, "Targets Recv");
#endif
} else {
//get_block_dist(qty, myId, numP, &dist_data);
#if USE_MAL_DEBUG >= 4
get_block_dist(qty, mall->myId, numP, &dist_data);
#endif
prepare_comm_alltoall(myId, numP, numO, qty, offset_ids, s_counts);
if(!is_intercomm && myId < numO) {
prepare_comm_alltoall(myId, numO, numP, qty, offset_ids, r_counts);
prepare_comm_alltoall(mall->myId, numP, numO, qty, offset_ids, s_counts);
if(mall_conf->spawn_method == MALL_SPAWN_MERGE && mall->myId < numO) {
prepare_comm_alltoall(mall->myId, numO, numP, qty, offset_ids, r_counts);
// Obtain the distribution for this child and allocate the receive buffer
get_block_dist(qty, myId, numO, &dist_data);
get_block_dist(qty, mall->myId, numO, &dist_data);
*recv = malloc(dist_data.tamBl * datasize);
//print_counts(dist_data, r_counts->counts, r_counts->displs, array_size, 1, "Children P ");
#if USE_MAL_DEBUG >= 4
print_counts(dist_data, r_counts->counts, r_counts->displs, array_size, 0, "Sources&Targets Recv");
#endif
}
//print_counts(dist_data, s_counts->counts, s_counts->displs, numO+offset_ids, 1, "Parents ");
#if USE_MAL_DEBUG >= 4
print_counts(dist_data, s_counts->counts, s_counts->displs, numO+offset_ids, 0, "Sources Send");
#endif
}
}
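/*
 * Sketch of how the computed counts are typically consumed (a minimal
 * example assuming the baseline collective path; the buffer and
 * communicator names are illustrative):
 *
 *   MPI_Request req;
 *   MPI_Ialltoallv(send_buf, s_counts.counts, s_counts.displs, MPI_DOUBLE,
 *                  recv_buf, r_counts.counts, r_counts.displs, MPI_DOUBLE,
 *                  comm, &req);
 *   MPI_Wait(&req, MPI_STATUS_IGNORE);
 */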
......@@ -455,21 +575,28 @@ void prepare_redistribution(int qty, int mal_type, int myId, int numP, int numO,
*
 * - s_counts (IN): Struct indicating how many elements this process sends to processes in the new group.
 * - r_counts (IN): Struct indicating how many elements this process receives from other processes in the previous group.
 * - requests (OUT): Pointer to an array of requests used to determine whether the communication has ended. If the pointer
 * - requests (IN/OUT): Pointer to an array of requests used to determine whether the communication has ended. If the pointer
 *   is NULL or not enough space has been reserved, the pointer is allocated/reallocated.
 * - request_qty (OUT): Quantity of requests to be used. If the value is smaller than the amount of communication
 * - request_qty (IN/OUT): Quantity of requests to be used. If the value is smaller than the amount of communication
 *   calls to perform, it is modified to the minimum value.
*/
void check_requests(struct Counts s_counts, struct Counts r_counts, MPI_Request **requests, size_t *request_qty) {
size_t i, sum;
MPI_Request *aux;
switch(mall_conf->red_method) {
case MALL_RED_BASELINE:
sum = 1;
break;
case MALL_RED_POINT:
default:
sum = (size_t) s_counts.idE - s_counts.idI;
sum += (size_t) r_counts.idE - r_counts.idI;
break;
}
if (*requests != NULL && sum <= *request_qty) return; // Expected amount of requests
// FIXME If the Ibarrier strategy is used, how is it accounted for in the total??
if (*requests == NULL) {
*requests = (MPI_Request *) malloc(sum * sizeof(MPI_Request));
} else { // Array exists, but is too small
......@@ -487,95 +614,3 @@ void check_requests(struct Counts s_counts, struct Counts r_counts, MPI_Request
}
*request_qty = sum;
}
/*
 * Special case that performs a manual copy of data when a process has to send data to itself. Only used
 * when the MPI communication is not able to handle this situation. An example is when using point-to-point
 * communications and the process has to perform a Send and a Recv to itself.
 * - send (IN): Array with the data to send. This value can not be NULL.
 * - recv (OUT): Array where data will be written. This value can not be NULL.
 * - myId (IN): Rank of the MPI process in the local communicator. For the parents it is not the rank obtained from "comm".
 * - s_counts (IN): Struct indicating how many elements this process sends to processes in the new group.
 * - r_counts (IN): Struct indicating how many elements this process receives from other processes in the previous group.
 */
void perform_manual_communication(void *send, void *recv, int mal_type, int myId, struct Counts s_counts, struct Counts r_counts) {
int i;
if(mal_type == MAL_INT) {
int *new_recv, *new_send;
new_recv = (int *) recv;
new_send = (int *) send;
for(i=0; i<s_counts.counts[myId];i++) {
new_recv[i+r_counts.displs[myId]] = new_send[i+s_counts.displs[myId]];
}
} else if(mal_type == MAL_DOUBLE) {
double *new_recv, *new_send;
new_recv = (double *) recv;
new_send = (double *) send;
for(i=0; i<s_counts.counts[myId];i++) {
new_recv[i+r_counts.displs[myId]] = new_send[i+s_counts.displs[myId]];
}
} else if(mal_type == MAL_CHAR) {
char *new_recv, *new_send;
new_recv = (char *) recv;
new_send = (char *) send;
for(i=0; i<s_counts.counts[myId];i++) {
new_recv[i+r_counts.displs[myId]] = new_send[i+s_counts.displs[myId]];
}
} else {
printf("Malleability -- Redistribution type not recognised\n");
MPI_Abort(MPI_COMM_WORLD, -1);
}
}
/*
 * Function to check whether, among the chosen strategies, the strategy
 * passed as second argument is used.
 *
 * Returns in "result" 1 (True) if the strategy is used, 0 (False)
 * otherwise.
 */
int malleability_red_contains_strat(int comm_strategies, int strategy, int *result) {
int value = comm_strategies % strategy ? 0 : 1;
if(result != NULL) *result = value;
return value;
}
void recalculate_counts(struct Counts *counts, int *array, void **recv, int mal_type) {
int i, ini, fin;
ini = 0;
fin = counts->counts[counts->idI];
counts->counts[counts->idI] = recalculate_elems(array, ini, fin);
for(i=counts->idI+1; i<counts->idE; i++) {
fin = counts->displs[i] + counts->counts[i];
ini = counts->displs[i];
counts->counts[i] = recalculate_elems(array, ini, fin);
counts->displs[i] = counts->displs[i-1] + counts->counts[i-1];
}
if(*recv != NULL) {
int datasize, qty;
if(mal_type == MAL_INT) {
datasize = sizeof(int);
} else if(mal_type == MAL_DOUBLE) {
datasize = sizeof(double);
} else if(mal_type == MAL_CHAR) {
datasize = sizeof(char);
} else {
printf("Malleability -- Redistribution type not recognised\n");
MPI_Abort(MPI_COMM_WORLD, -1);
}
qty = counts->counts[counts->idE-1] + counts->displs[counts->idE-1];
free(*recv);
*recv = malloc(qty * datasize);
}
}
int recalculate_elems(int *array, int ini, int fin) {
int i, sol = 0;
for(i=ini; i<fin; i++) {
sol += array[i];
}
return sol;
}
......@@ -7,22 +7,13 @@
#include <string.h>
#include "malleabilityStates.h"
//#define MAL_COMM_COMPLETED 0
//#define MAL_COMM_UNINITIALIZED 2
//#define MAL_ASYNC_PENDING 1
void sync_communication(void *send, void **recv, int qty, MPI_Datatype datatype, int numP, int numO, int is_children_group, MPI_Comm comm);
//#define MAL_USE_NORMAL 0
//#define MAL_USE_IBARRIER 1
//#define MAL_USE_POINT 2
//#define MAL_USE_THREAD 3
int sync_communication(void *send, void **recv, int qty, int mal_type, int dependency, int myId, int numP, int numO, int is_children_group, int comm_type, MPI_Comm comm);
int async_communication(void *send, void **recv, int qty, int mal_type, int dependency, int myId, int numP, int numO, int is_children_group, int red_method, int red_strategies, MPI_Comm comm, MPI_Request **requests, size_t *request_qty);
int send_async(char *array, int qty, int mal_type, int myId, int numP, MPI_Comm intercomm, int numP_child, MPI_Request **comm_req, int red_method, int red_strategies);
void recv_async(char **array, int qty, int mal_type, int myId, int numP, MPI_Comm intercomm, int numP_parents, int red_method, int red_strategies);
void async_communication_start(void *send, void **recv, int qty, MPI_Datatype datatype, int numP, int numO, int is_children_group, MPI_Comm comm, MPI_Request **requests, size_t *request_qty, MPI_Win *win);
int async_communication_check(int is_children_group, MPI_Request *requests, size_t request_qty);
void async_communication_wait(MPI_Request *requests, size_t request_qty);
void async_communication_end(MPI_Request *requests, size_t request_qty, MPI_Win *win);
void malloc_comm_array(char **array, int qty, int myId, int numP);
int malleability_red_contains_strat(int comm_strategies, int strategy, int *result);
#endif
#ifndef MAM_H
#define MAM_H
#include "malleabilityStates.h"
#include "malleabilityManager.h"
#include "MAM_Configuration.h"
#endif
#include "MAM_Configuration.h"
#include "MAM_Init_Configuration.h"
#include "malleabilityDataStructures.h"
#include <limits.h>
typedef struct {
unsigned int *value, default_value;
int config_max_length;
union {
int (*set_config_simple)(unsigned int, unsigned int *);
int (*set_config_complex)(unsigned int);
};
char *env_name;
} mam_config_setting_t;
int MAM_I_set_method(unsigned int new_method, unsigned int *method);
int MAM_I_set_spawn_strat(unsigned int strategy, unsigned int *strategies);
int MAM_I_set_red_strat(unsigned int strategy, unsigned int *strategies);
int MAM_I_set_target_number(unsigned int new_numC);
int MAM_I_configuration_get_defaults();
int MAM_I_contains_strat(unsigned int comm_strategies, unsigned int strategy);
int MAM_I_add_strat(unsigned int *comm_strategies, unsigned int strategy);
int MAM_I_remove_strat(unsigned int *comm_strategies, unsigned int strategy);
mam_config_setting_t configSettings[] = {
{NULL, MALL_SPAWN_MERGE, MAM_METHODS_SPAWN_LEN, {.set_config_simple = MAM_I_set_method }, MAM_SPAWN_METHOD_ENV},
{NULL, MAM_STRAT_SPAWN_CLEAR, MAM_STRATS_SPAWN_LEN, {.set_config_simple = MAM_I_set_spawn_strat }, MAM_SPAWN_STRATS_ENV},
{NULL, MALL_DIST_COMPACT, MAM_METHODS_PHYSICAL_DISTRIBUTION_LEN, {.set_config_simple = MAM_I_set_method }, MAM_PHYSICAL_DISTRIBUTION_METHOD_ENV},
{NULL, MALL_RED_BASELINE, MAM_METHODS_RED_LEN, {.set_config_simple = MAM_I_set_method }, MAM_RED_METHOD_ENV},
{NULL, MAM_STRAT_RED_CLEAR, MAM_STRATS_RED_LEN, {.set_config_simple = MAM_I_set_red_strat }, MAM_RED_STRATS_ENV},
{NULL, 1, INT_MAX, {.set_config_complex = MAM_I_set_target_number }, MAM_NUM_TARGETS_ENV}
};
unsigned int masks_spawn[] = {MAM_STRAT_CLEAR_VALUE, MAM_MASK_PTHREAD, MAM_MASK_SPAWN_SINGLE, MAM_MASK_SPAWN_INTERCOMM};
unsigned int masks_red[] = {MAM_STRAT_CLEAR_VALUE, MAM_MASK_PTHREAD, MAM_MASK_RED_WAIT_SOURCES, MAM_MASK_RED_WAIT_TARGETS};
/**
* @brief Set configuration parameters for MAM.
*
* This function allows setting various configuration parameters for MAM
* such as spawn method, spawn strategies, spawn physical distribution,
* redistribution method, and red strategies.
*
* @param spawn_method The spawn method reconfiguration.
* @param spawn_strategies The spawn strategies reconfiguration.
* @param spawn_dist The spawn physical distribution method reconfiguration.
* @param red_method The redistribution method reconfiguration.
 * @param red_strategies The redistribution strategies for reconfiguration.
*/
void MAM_Set_configuration(int spawn_method, int spawn_strategies, int spawn_dist, int red_method, int red_strategies) {
int i, aux;
int aux_array[] = {spawn_method, spawn_strategies, spawn_dist, red_method, red_strategies};
if(state > MALL_NOT_STARTED) return;
mam_config_setting_t *config = NULL;
for (i = 0; i < MAM_KEY_COUNT-1; i++) { //FIXME Magic number to avoid changing num_targets
aux = aux_array[i];
config = &configSettings[i];
if (0 <= aux && aux < config->config_max_length) {
if(i == MAM_NUM_TARGETS) {
config->set_config_complex(aux);
} else {
config->set_config_simple(aux, config->value);
}
}
}
}
/*
* @brief Set the configuration value for a specific key in MAM.
*
* Modifies the configuration value associated with the given key
* to the specified "required" value. The final value set is returned in the
* "provided" parameter.
*
* @param key The key for which the configuration value is to be modified.
* @param required The required value to set for the specified key.
* @param provided Pointer to an integer where the final value set will be stored.
* This parameter is updated with the actual value after modification.
* For strategy keys the value is "MAM_STRATS_ADDED" if "required" has
* been added, or "MAM_STRATS_MODIFIED" if multiple strategies of the
* key have been modified.
*/
void MAM_Set_key_configuration(int key, int required, int *provided) {
int i, aux;
if(provided == NULL) provided = &aux;
*provided = MALL_DENIED;
if(required < 0 || state > MALL_NOT_STARTED) return;
mam_config_setting_t *config = NULL;
for (i = 0; i < MAM_KEY_COUNT; i++) {
if (key == i) {
config = &configSettings[i];
break;
}
}
if (config != NULL) {
if (required < config->config_max_length) {
if(i == MAM_NUM_TARGETS) {
*provided = config->set_config_complex(required);
} else {
*provided = config->set_config_simple(required, config->value);
}
} else {*provided = *(config->value); }
} else { printf("MAM: Key %d does not exist\n", key); }
}
/*
 * Returns whether a strategy is used or not.
 */
int MAM_Contains_strat(int key, unsigned int strategy, int *result) {
int strategies, aux = MAM_OK;
unsigned int len = 0, mask;
  switch(key) {
    case MAM_SPAWN_STRATEGIES:
      strategies = mall_conf->spawn_strategies;
      len = MAM_STRATS_SPAWN_LEN;
      break;
    case MAM_RED_STRATEGIES:
      strategies = mall_conf->red_strategies;
      len = MAM_STRATS_RED_LEN;
      break;
    default:
      aux = MALL_DENIED;
      break;
  }
  if(aux == MAM_OK && strategy < len) {
    // Index the mask arrays only after the bounds check to avoid an out-of-bounds read
    mask = key == MAM_SPAWN_STRATEGIES ? masks_spawn[strategy] : masks_red[strategy];
    aux = MAM_I_contains_strat(strategies, mask);
  } else {
    aux = 0;
  }
if(result != NULL) *result = aux;
return aux;
}
/*
 * //TODO
 * Must be called after setting the configuration.
 */
int MAM_Set_target_number(unsigned int numC){
return MAM_I_set_target_number(numC);
}
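/*
 * Usage sketch (illustrative): as noted above, the target number has to be
 * set after the rest of the configuration.
 *
 *   MAM_Set_key_configuration(MAM_SPAWN_METHOD, MALL_SPAWN_MERGE, NULL);
 *   MAM_Set_key_configuration(MAM_RED_METHOD, MALL_RED_BASELINE, NULL);
 *   MAM_Set_target_number(new_numC); // Last, before the reconfiguration starts
 */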
//======================================================||
//===============MAM_INIT FUNCTIONS=====================||
//======================================================||
//======================================================||
void MAM_Init_configuration() {
if(mall == NULL || mall_conf == NULL) {
printf("MAM FATAL ERROR: Setting initial config without previous mallocs\n");
fflush(stdout);
MPI_Abort(MPI_COMM_WORLD, -50);
}
configSettings[MAM_SPAWN_METHOD].value = &mall_conf->spawn_method;
configSettings[MAM_SPAWN_STRATEGIES].value = &mall_conf->spawn_strategies;
configSettings[MAM_PHYSICAL_DISTRIBUTION].value = &mall_conf->spawn_dist;
configSettings[MAM_RED_METHOD].value = &mall_conf->red_method;
configSettings[MAM_RED_STRATEGIES].value = &mall_conf->red_strategies;
}
void MAM_Set_initial_configuration() {
int not_filled = 1;
not_filled = MAM_I_configuration_get_defaults();
if(not_filled) {
if(mall->myId == mall->root) printf("MAM WARNING: Starting configuration not set\n");
fflush(stdout);
MPI_Abort(mall->comm, -50);
}
#if USE_MAL_DEBUG >= 2
if(mall->myId == mall->root) {
DEBUG_FUNC("Initial configuration settled", mall->myId, mall->numP);
fflush(stdout);
}
#endif
}
void MAM_Check_configuration() {
if(mall->numC == mall->numP) { // Migrate
MAM_Set_key_configuration(MAM_SPAWN_METHOD, MALL_SPAWN_BASELINE, NULL);
}
if(mall_conf->spawn_method == MALL_SPAWN_MERGE) {
if(MAM_I_contains_strat(mall_conf->spawn_strategies, MAM_MASK_SPAWN_INTERCOMM)) {
MAM_I_remove_strat(&mall_conf->spawn_strategies, MAM_MASK_SPAWN_INTERCOMM);
}
if(mall->numP > mall->numC && MAM_I_contains_strat(mall_conf->spawn_strategies, MAM_MASK_SPAWN_SINGLE)) {
MAM_I_remove_strat(&mall_conf->spawn_strategies, MAM_MASK_SPAWN_SINGLE);
}
}
if(mall_conf->red_method == MALL_RED_RMA_LOCK || mall_conf->red_method == MALL_RED_RMA_LOCKALL) {
if(MAM_I_contains_strat(mall_conf->spawn_strategies, MAM_MASK_SPAWN_INTERCOMM)) {
MAM_I_remove_strat(&mall_conf->spawn_strategies, MAM_MASK_SPAWN_INTERCOMM);
}
if(!MAM_I_contains_strat(mall_conf->red_strategies, MAM_MASK_RED_WAIT_TARGETS) &&
!MAM_I_contains_strat(mall_conf->red_strategies, MAM_MASK_PTHREAD)) {
MAM_I_set_red_strat(MAM_STRAT_RED_WAIT_TARGETS, &mall_conf->red_strategies);
}
}
}
//======================================================||
//================PRIVATE FUNCTIONS=====================||
//======================================================||
//======================================================||
int MAM_I_configuration_get_defaults() {
size_t i;
int set_value;
char *tmp = NULL;
mam_config_setting_t *config = NULL;
for (i = 0; i < MAM_KEY_COUNT; i++) {
config = &configSettings[i];
tmp = getenv(config->env_name);
if(tmp != NULL) {
set_value = atoi(tmp);
} else {
set_value = config->default_value;
}
if (0 <= set_value && set_value < config->config_max_length) {
if(i == MAM_NUM_TARGETS) {
config->set_config_complex(set_value);
} else {
config->set_config_simple(set_value, config->value);
}
}
tmp = NULL;
}
return 0;
}
int MAM_I_set_method(unsigned int new_method, unsigned int *method) {
*method = new_method;
return *method;
}
int MAM_I_set_spawn_strat(unsigned int strategy, unsigned int *strategies) {
int result = 0;
int strat_removed = 0;
switch(strategy) {
case MAM_STRAT_SPAWN_CLEAR:
*strategies = MAM_STRAT_CLEAR_VALUE;
result = MAM_STRATS_MODIFIED;
break;
case MAM_STRAT_SPAWN_PTHREAD:
result = MAM_I_add_strat(strategies, MAM_MASK_PTHREAD);
break;
case MAM_STRAT_SPAWN_SINGLE:
result = MAM_I_add_strat(strategies, MAM_MASK_SPAWN_SINGLE);
break;
case MAM_STRAT_SPAWN_INTERCOMM:
result = MAM_I_add_strat(strategies, MAM_MASK_SPAWN_INTERCOMM);
break;
default:
//Unknown strategy
result = MALL_DENIED;
break;
}
if(strat_removed) {
result = MAM_STRATS_MODIFIED;
}
return result;
}
int MAM_I_set_red_strat(unsigned int strategy, unsigned int *strategies) {
int result = 0;
int strat_removed = 0;
switch(strategy) {
case MAM_STRAT_RED_CLEAR:
*strategies = MAM_STRAT_CLEAR_VALUE;
result = MAM_STRATS_MODIFIED;
break;
case MAM_STRAT_RED_PTHREAD: //TODO - IMPROVEMENT - This could be done with a single operation instead of 3.
result = MAM_I_add_strat(strategies, MAM_MASK_PTHREAD);
if(result == MAM_STRATS_ADDED) {
strat_removed += MAM_I_remove_strat(strategies, MAM_MASK_RED_WAIT_SOURCES);
strat_removed += MAM_I_remove_strat(strategies, MAM_MASK_RED_WAIT_TARGETS);
}
break;
case MAM_STRAT_RED_WAIT_SOURCES:
result = MAM_I_add_strat(strategies, MAM_MASK_RED_WAIT_SOURCES);
if(result == MAM_STRATS_ADDED) {
strat_removed += MAM_I_remove_strat(strategies, MAM_MASK_RED_WAIT_TARGETS);
strat_removed += MAM_I_remove_strat(strategies, MAM_MASK_PTHREAD);
}
break;
case MAM_STRAT_RED_WAIT_TARGETS:
result = MAM_I_add_strat(strategies, MAM_MASK_RED_WAIT_TARGETS);
if(result == MAM_STRATS_ADDED) {
strat_removed += MAM_I_remove_strat(strategies, MAM_MASK_RED_WAIT_SOURCES);
strat_removed += MAM_I_remove_strat(strategies, MAM_MASK_PTHREAD);
}
break;
default:
//Unknown strategy
result = MALL_DENIED;
break;
}
if(strat_removed) {
result = MAM_STRATS_MODIFIED;
}
return result;
}
int MAM_I_set_target_number(unsigned int new_numC) {
if(state > MALL_NOT_STARTED || new_numC == 0) return MALL_DENIED;
mall->numC = (int) new_numC;
return new_numC;
}
/*
 * Returns non-zero if the strategy is applied, 0 otherwise
 */
int MAM_I_contains_strat(unsigned int comm_strategies, unsigned int strategy) {
return comm_strategies & strategy;
}
int MAM_I_add_strat(unsigned int *comm_strategies, unsigned int strategy) {
if(MAM_I_contains_strat(*comm_strategies, strategy)) return MAM_OK;
*comm_strategies |= strategy;
return MAM_STRATS_ADDED;
}
int MAM_I_remove_strat(unsigned int *comm_strategies, unsigned int strategy) {
if(!MAM_I_contains_strat(*comm_strategies, strategy)) return MAM_OK;
*comm_strategies &= ~strategy;
return MAM_STRATS_MODIFIED;
}
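/*
 * The three helpers above implement plain bitmask arithmetic. A minimal
 * trace (values follow the masks defined in MAM_Configuration.h):
 *
 *   unsigned int strats = MAM_STRAT_CLEAR_VALUE;    // 0x00
 *   MAM_I_add_strat(&strats, MAM_MASK_PTHREAD);     // strats == 0x01, returns MAM_STRATS_ADDED
 *   MAM_I_add_strat(&strats, MAM_MASK_PTHREAD);     // Already set, returns MAM_OK
 *   MAM_I_contains_strat(strats, MAM_MASK_PTHREAD); // Non-zero: applied
 *   MAM_I_remove_strat(&strats, MAM_MASK_PTHREAD);  // strats == 0x00, returns MAM_STRATS_MODIFIED
 */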
#ifndef MAM_CONFIGURATION_H
#define MAM_CONFIGURATION_H
#include <mpi.h>
#include "malleabilityStates.h"
#define MAM_STRAT_CLEAR_VALUE 0
#define MAM_STRATS_ADDED 1
#define MAM_STRATS_MODIFIED 2
#define MAM_MASK_PTHREAD 0x01
#define MAM_MASK_SPAWN_SINGLE 0x02
#define MAM_MASK_SPAWN_INTERCOMM 0x04
#define MAM_MASK_RED_WAIT_SOURCES 0x02
#define MAM_MASK_RED_WAIT_TARGETS 0x04
int MAM_Contains_strat(int key, unsigned int strategy, int *result);
void MAM_Set_configuration(int spawn_method, int spawn_strategies, int spawn_dist, int red_method, int red_strategies);
void MAM_Set_key_configuration(int key, int required, int *provided);
int MAM_Set_target_number(unsigned int numC);
#endif
#ifndef MAM_INIT_CONFIGURATION_H
#define MAM_INIT_CONFIGURATION_H
#include <mpi.h>
#include "malleabilityStates.h"
void MAM_Init_configuration();
void MAM_Set_initial_configuration();
void MAM_Check_configuration();
#endif
......@@ -104,12 +104,8 @@ void get_block_dist(int qty, int id, int numP, struct Dist_data *dist_data) {
dist_data->fin = (id+1) * dist_data->tamBl + rem;
}
if(dist_data->fin > qty) {
dist_data->fin = qty;
}
if(dist_data->ini > dist_data->fin) {
dist_data->ini = dist_data->fin;
}
if(dist_data->fin > qty) { dist_data->fin = qty; }
if(dist_data->ini > dist_data->fin) { dist_data->ini = dist_data->fin; }
dist_data->tamBl = dist_data->fin - dist_data->ini;
}
......@@ -131,18 +127,10 @@ void set_interblock_counts(int id, int numP, struct Dist_data data_dist, int off
}
// Obtains the largest ini between the two processes
if(data_dist.ini > other.ini) {
biggest_ini = data_dist.ini;
} else {
biggest_ini = other.ini;
}
biggest_ini = (data_dist.ini > other.ini) ? data_dist.ini : other.ini;
// Obtains the smallest fin between the two processes
if(data_dist.fin < other.fin) {
smallest_end = data_dist.fin;
} else {
smallest_end = other.fin;
}
smallest_end = (data_dist.fin < other.fin) ? data_dist.fin : other.fin;
sendcounts[id] = smallest_end - biggest_ini; // Numero de elementos a enviar/recibir del proceso Id
}
......
#include "malleabilityDataStructures.h"
int state = MALL_UNRESERVED;
/*
 * Creates a derived datatype to send the two main
 * structures of MaM.
 */
void MAM_Def_main_datatype() {
int i, counts = 10;
int blocklengths[counts];
MPI_Aint displs[counts];
MPI_Datatype types[counts];
for(i=0; i<5; i++) {
blocklengths[i] = 1;
types[i] = MPI_UNSIGNED;
}
for(i=5; i<counts; i++) {
blocklengths[i] = 1;
types[i] = MPI_INT;
}
// Obtain the absolute base addresses
MPI_Get_address(&(mall_conf->spawn_method), &displs[0]);
MPI_Get_address(&(mall_conf->spawn_strategies), &displs[1]);
MPI_Get_address(&(mall_conf->spawn_dist), &displs[2]);
MPI_Get_address(&(mall_conf->red_method), &displs[3]);
MPI_Get_address(&(mall_conf->red_strategies), &displs[4]);
MPI_Get_address(&(mall->root_parents), &displs[5]);
MPI_Get_address(&(mall->num_parents), &displs[6]); //TODO Add only when Intercomm strat active?
MPI_Get_address(&(mall->num_cpus), &displs[7]);
MPI_Get_address(&(mall->num_nodes), &displs[8]);
MPI_Get_address(&(mall->nodelist_len), &displs[9]);
MPI_Type_create_struct(counts, blocklengths, displs, types, &mall->struct_type);
MPI_Type_commit(&mall->struct_type);
}
void MAM_Free_main_datatype() {
if(mall->struct_type != MPI_DATATYPE_NULL) {
MPI_Type_free(&mall->struct_type);
}
}
/*
 * Communicates the required data of the main MAM
 * structures from sources to targets.
 */
void MAM_Comm_main_structures(int rootBcast) {
MPI_Bcast(MPI_BOTTOM, 1, mall->struct_type, rootBcast, mall->intercomm);
if(mall->nodelist == NULL) {
mall->nodelist = calloc(mall->nodelist_len+1, sizeof(char));
mall->nodelist[mall->nodelist_len] = '\0';
}
MPI_Bcast(mall->nodelist, mall->nodelist_len, MPI_CHAR, rootBcast, mall->intercomm);
}
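/*
 * Since MAM_Def_main_datatype stores absolute addresses in the
 * displacements, the committed type is always used with MPI_BOTTOM as the
 * buffer argument, as done above. A minimal standalone sketch of the same
 * pattern (illustrative; "root" and "comm" are assumptions):
 *
 *   int a; double b;
 *   int bl[2] = {1, 1};
 *   MPI_Aint dp[2];
 *   MPI_Datatype tp[2] = {MPI_INT, MPI_DOUBLE}, dtype;
 *   MPI_Get_address(&a, &dp[0]);
 *   MPI_Get_address(&b, &dp[1]);
 *   MPI_Type_create_struct(2, bl, dp, tp, &dtype);
 *   MPI_Type_commit(&dtype);
 *   MPI_Bcast(MPI_BOTTOM, 1, dtype, root, comm); // Sends a and b in one call
 *   MPI_Type_free(&dtype);
 */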
......@@ -4,27 +4,63 @@
/*
 * Shows available data structures for inner usage.
*/
#include <stdlib.h>
#include <stdio.h>
#include <mpi.h>
#include <pthread.h>
#include "malleabilityStates.h"
/* --- SPAWN STRUCTURES --- */
struct physical_dist {
int num_cpus, num_nodes;
char *nodelist;
int target_qty, already_created;
int dist_type, info_type;
};
#define DEBUG_FUNC(debug_string, rank, numP) printf("MaM [P%d/%d]: %s -- %s:%s:%d\n", rank, numP, debug_string, __FILE__, __func__, __LINE__)
/* --- TIME CAPTURE STRUCTURE --- */
typedef struct {
int myId, root, root_parents;
int spawn_qty, initial_qty, target_qty;
int already_created;
int spawn_method, spawn_is_single, spawn_is_async;
char *cmd; //Executable name
MPI_Info mapping;
MPI_Datatype dtype;
struct physical_dist dist; // Used to create mapping var
MPI_Comm comm, returned_comm;
} Spawn_data;
// Spawn, Sync and Async time
double spawn_start, spawn_time;
double sync_start, sync_end;
double async_start, async_end;
double malleability_start, malleability_end;
MPI_Datatype times_type;
} malleability_times_t;
/* --- GLOBAL STRUCTURES --- */
typedef struct {
unsigned int spawn_method;
unsigned int spawn_dist;
unsigned int spawn_strategies;
unsigned int red_method;
unsigned int red_strategies;
malleability_times_t *times;
} malleability_config_t;
typedef struct {
int myId, numP, numC, zombie;
int root, root_collectives;
int num_parents, root_parents;
pthread_t async_thread;
MPI_Comm comm, thread_comm;
MPI_Comm intercomm, tmp_comm;
MPI_Comm *user_comm;
MPI_Datatype struct_type;
// Specific vars for Wait_targets strat
int wait_targets_posted;
MPI_Request wait_targets;
char *name_exec, *nodelist;
int num_cpus, num_nodes, nodelist_len;
} malleability_t;
/* --- VARIABLES --- */
malleability_config_t *mall_conf;
malleability_t *mall;
extern int state;
/* --- FUNCTIONS --- */
void MAM_Def_main_datatype();
void MAM_Free_main_datatype();
void MAM_Comm_main_structures(int rootBcast);
#endif
#include <pthread.h>
#include <string.h>
#include "malleabilityManager.h"
//#include "malleabilityManager.h"
#include "MAM.h"
#include "malleabilityStates.h"
#include "malleabilityDataStructures.h"
#include "malleabilityTypes.h"
#include "malleabilityZombies.h"
#include "malleabilityTimes.h"
#include "malleabilityRMS.h"
#include "MAM_Init_Configuration.h"
#include "spawn_methods/GenericSpawn.h"
#include "CommDist.h"
#define MALLEABILITY_USE_SYNCHRONOUS 0
#define MALLEABILITY_USE_ASYNCHRONOUS 1
void MAM_Commit(int *mam_state);
void send_data(int numP_children, malleability_data_t *data_struct, int is_asynchronous);
void recv_data(int numP_parents, malleability_data_t *data_struct, int is_asynchronous);
void Children_init();
int MAM_St_rms(int *mam_state);
int MAM_St_spawn_start();
int MAM_St_spawn_pending(int wait_completed);
int MAM_St_red_start();
int MAM_St_red_pending(int *mam_state, int wait_completed);
int MAM_St_user_pending(int *mam_state, int wait_completed, void (*user_function)(void *), void *user_args);
int MAM_St_user_completed();
int MAM_St_spawn_adapt_pending(int wait_completed);
int MAM_St_spawn_adapted(int *mam_state);
int MAM_St_red_completed(int *mam_state);
int MAM_St_completed(int *mam_state);
void Children_init(void (*user_function)(void *), void *user_args);
int spawn_step();
int start_redistribution();
int check_redistribution();
int check_redistribution(int wait_completed);
int end_redistribution();
int shrink_redistribution();
void comm_node_data(int rootBcast, int is_child_group);
void def_nodeinfo_type(MPI_Datatype *node_type);
int thread_creation();
int thread_check();
int thread_check(int wait_completed);
void* thread_async_work();
void print_comms_state();
void malleability_comms_update(MPI_Comm comm);
void comm_results(int root, int compute); //FIXME Delete
void def_malleability_results(MPI_Datatype *new_type); //FIXME DELETE
typedef struct {
int spawn_method;
int spawn_dist;
int spawn_strategies;
int red_method;
int red_strategies;
int grp;
double exec_start;
double spawn_start, spawn_time;
double sync_time, sync_end;
double async_time, async_end;
double malleability_end, malleability_time;
} malleability_config_t;
typedef struct { //FIXME numC_spawned is not being used
int myId, numP, numC, numC_spawned, root, root_parents;
pthread_t async_thread;
MPI_Comm comm, thread_comm;
MPI_Comm intercomm;
MPI_Comm user_comm;
int dup_user_comm;
char *name_exec, *nodelist;
int num_cpus, num_nodes, nodelist_len;
} malleability_t;
int state = MALL_UNRESERVED; //FIXME Move somewhere else
int dep_not_send = 1; //FIXME DELETE
malleability_config_t *mall_conf;
malleability_t *mall;
int MAM_I_convert_key(char *key);
void MAM_I_create_user_struct(int is_children_group);
malleability_data_t *rep_s_data;
malleability_data_t *dist_s_data;
malleability_data_t *rep_a_data;
malleability_data_t *dist_a_data;
mam_user_reconf_t *user_reconf;
/*
 * Initializes the memory reservation for the malleability module,
 * creating all the required structures and communicator copies
......@@ -84,60 +68,72 @@ malleability_data_t *dist_a_data;
 * the communication, the child processes are ready to execute the
 * application.
*/
int init_malleability(int myId, int numP, int root, MPI_Comm comm, char *name_exec, char *nodelist, int num_cpus, int num_nodes) {
int MAM_Init(int root, MPI_Comm *comm, char *name_exec, void (*user_function)(void *), void *user_args) {
MPI_Comm dup_comm, thread_comm;
mall_conf = (malleability_config_t *) malloc(sizeof(malleability_config_t));
mall = (malleability_t *) malloc(sizeof(malleability_t));
user_reconf = (mam_user_reconf_t *) malloc(sizeof(mam_user_reconf_t));
MPI_Comm_rank(*comm, &(mall->myId));
MPI_Comm_size(*comm, &(mall->numP));
#if USE_MAL_DEBUG
DEBUG_FUNC("Initializing MaM", mall->myId, mall->numP); fflush(stdout); MPI_Barrier(*comm);
#endif
rep_s_data = (malleability_data_t *) malloc(sizeof(malleability_data_t));
dist_s_data = (malleability_data_t *) malloc(sizeof(malleability_data_t));
rep_a_data = (malleability_data_t *) malloc(sizeof(malleability_data_t));
dist_a_data = (malleability_data_t *) malloc(sizeof(malleability_data_t));
mall->dup_user_comm = 0;
MPI_Comm_dup(comm, &dup_comm);
MPI_Comm_dup(comm, &thread_comm);
MPI_Comm_set_name(dup_comm, "MPI_COMM_MALL");
MPI_Comm_set_name(thread_comm, "MPI_COMM_MALL_THREAD");
MPI_Comm_dup(*comm, &dup_comm);
MPI_Comm_dup(*comm, &thread_comm);
MPI_Comm_set_name(dup_comm, "MAM_MAIN");
MPI_Comm_set_name(thread_comm, "MAM_THREAD");
mall->myId = myId;
mall->numP = numP;
mall->root = root;
mall->root_parents = root;
mall->zombie = 0;
mall->comm = dup_comm;
mall->thread_comm = thread_comm;
mall->user_comm = comm;
mall->tmp_comm = MPI_COMM_NULL;
mall->name_exec = name_exec;
mall->nodelist = nodelist;
mall->num_cpus = num_cpus;
mall->num_nodes = num_nodes;
mall->nodelist = NULL;
mall->nodelist_len = 0;
rep_s_data->entries = 0;
rep_a_data->entries = 0;
dist_s_data->entries = 0;
dist_a_data->entries = 0;
mall_conf->spawn_time = 0; mall_conf->sync_time = 0; mall_conf->async_time = 0; mall_conf->malleability_time = 0;
mall_conf->spawn_start = 0; mall_conf->sync_end = 0; mall_conf->async_end = 0; mall_conf->malleability_end = 0;
state = MALL_NOT_STARTED;
MAM_Init_configuration();
zombies_service_init();
init_malleability_times();
MAM_Def_main_datatype();
// If this is the first group of processes, they obtain the data from the parents
MPI_Comm_get_parent(&(mall->intercomm));
if(mall->intercomm != MPI_COMM_NULL ) {
Children_init();
if(mall->intercomm != MPI_COMM_NULL) {
Children_init(user_function, user_args);
return MALLEABILITY_CHILDREN;
}
if(nodelist != NULL) { //TODO To be deprecated by using Slurm or the else branch
mall->nodelist_len = strlen(nodelist);
} else { // If no nodelist is detected, get it from the actual run
mall->nodelist = malloc(MPI_MAX_PROCESSOR_NAME * sizeof(char));
MPI_Get_processor_name(mall->nodelist, &mall->nodelist_len);
//TODO Get name of each process and create real nodelist
}
MAM_check_hosts();
MAM_Set_initial_configuration();
#if USE_MAL_BARRIERS && USE_MAL_DEBUG
if(mall->myId == mall->root)
printf("MaM: Using barriers to record times.\n");
#endif
#if USE_MAL_DEBUG
DEBUG_FUNC("MaM has been initialized correctly as parents", mall->myId, mall->numP); fflush(stdout); MPI_Barrier(*comm);
#endif
return MALLEABILITY_NOT_CHILDREN;
}
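/*
 * Typical start-up with the new interface (a minimal sketch; ROOT,
 * user_func and user_data are assumptions of this example):
 *
 *   MPI_Init(&argc, &argv);
 *   MPI_Comm comm = MPI_COMM_WORLD;
 *   int role = MAM_Init(ROOT, &comm, argv[0], user_func, &user_data);
 *   if(role == MALLEABILITY_CHILDREN) {
 *     // Spawned processes resume here once MaM has redistributed the data
 *   }
 */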
......@@ -147,7 +143,7 @@ int init_malleability(int myId, int numP, int root, MPI_Comm comm, char *name_ex
 * of malleability and makes sure that zombies,
 * if any, are awakened.
*/
void free_malleability() {
void MAM_Finalize() {
free_malleability_data_struct(rep_s_data);
free_malleability_data_struct(rep_a_data);
free_malleability_data_struct(dist_s_data);
......@@ -157,11 +153,16 @@ void free_malleability() {
free(rep_a_data);
free(dist_s_data);
free(dist_a_data);
if(mall->nodelist != NULL) free(mall->nodelist);
if(mall->comm != MPI_COMM_WORLD) MPI_Comm_free(&(mall->comm));
if(mall->thread_comm != MPI_COMM_WORLD) MPI_Comm_free(&(mall->thread_comm));
MAM_Free_main_datatype();
free_malleability_times();
if(mall->comm != MPI_COMM_WORLD && mall->comm != MPI_COMM_NULL) MPI_Comm_free(&(mall->comm));
if(mall->thread_comm != MPI_COMM_WORLD && mall->thread_comm != MPI_COMM_NULL) MPI_Comm_free(&(mall->thread_comm));
if(mall->intercomm != MPI_COMM_WORLD && mall->intercomm != MPI_COMM_NULL) { MPI_Comm_disconnect(&(mall->intercomm)); } //FIXME Error en OpenMPI + Merge
free(mall);
free(mall_conf);
free(user_reconf);
zombies_awake();
zombies_service_free();
......@@ -171,261 +172,305 @@ void free_malleability() {
/*
 * TODO Rewrite
 * The process resizing is performed by the parents.
 *
 * The new processes are created with the chosen physical distribution
 * and the information is then transmitted to them.
 *
 * If there is asynchronous data to transmit, its transfer is started
 * first and the function returns. It has to be checked, by calling
 * the function again, that the sends have finished.
 * Checks the state of the malleability and tries to advance it
 * if possible. It works as a state machine.
 * Returns the concrete malleability state and sets the argument
 * "mam_state" to a generic one.
 *
 * If there is also synchronous data to send, it is not sent yet.
 * The argument "wait_completed" is used to wait for the completion of
 * the tasks carried out by MAM.
 *
 * If there is only synchronous data, it is sent after the processes are created
 * and finally the two groups of processes are disconnected.
 */
int malleability_checkpoint() {
double end_real_time;
int MAM_Checkpoint(int *mam_state, int wait_completed, void (*user_function)(void *), void *user_args) {
int call_checkpoint = 0;
//TODO This could be changed to an array with the functions to call in each case
switch(state) {
case MALL_UNRESERVED:
*mam_state = MAM_UNRESERVED;
break;
case MALL_NOT_STARTED:
// Check whether a resize has to be performed
//MPI_Barrier(mall->comm);
mall_conf->malleability_time = MPI_Wtime();
state = spawn_step();
if (state == MALL_SPAWN_COMPLETED || state == MALL_SPAWN_ADAPT_POSTPONE){
malleability_checkpoint();
}
call_checkpoint = MAM_St_rms(mam_state);
break;
case MALL_RMS_COMPLETED:
call_checkpoint = MAM_St_spawn_start();
break;
case MALL_SPAWN_PENDING: // Checks whether the spawn has finished and starts the redistribution
case MALL_SPAWN_PENDING: // Checks whether the spawn has finished
case MALL_SPAWN_SINGLE_PENDING:
state = check_spawn_state(&(mall->intercomm), mall->comm, &end_real_time);
if (state == MALL_SPAWN_COMPLETED || state == MALL_SPAWN_ADAPTED) {
//MPI_Barrier(mall->comm);
mall_conf->spawn_time = MPI_Wtime() - mall_conf->spawn_start;
malleability_checkpoint();
}
call_checkpoint = MAM_St_spawn_pending(wait_completed);
break;
case MALL_SPAWN_ADAPT_POSTPONE:
case MALL_SPAWN_COMPLETED:
state = start_redistribution();
malleability_checkpoint();
call_checkpoint = MAM_St_red_start();
break;
case MALL_DIST_PENDING:
if(malleability_red_contains_strat(mall_conf->red_strategies, MALL_RED_THREAD, NULL)) {
state = thread_check();
} else {
state = check_redistribution();
}
if(state != MALL_DIST_PENDING) {
malleability_checkpoint();
}
call_checkpoint = MAM_St_red_pending(mam_state, wait_completed);
break;
case MALL_SPAWN_ADAPT_PENDING:
//MPI_Barrier(mall->comm);
mall_conf->spawn_start = MPI_Wtime();
unset_spawn_postpone_flag(state);
state = check_spawn_state(&(mall->intercomm), mall->comm, &end_real_time);
case MALL_USER_PENDING:
call_checkpoint = MAM_St_user_pending(mam_state, wait_completed, user_function, user_args);
break;
if(!malleability_spawn_contains_strat(mall_conf->spawn_strategies, MALL_SPAWN_PTHREAD, NULL)) {
//MPI_Barrier(mall->comm);
mall_conf->spawn_time = MPI_Wtime() - mall_conf->spawn_start;
malleability_checkpoint();
}
case MALL_USER_COMPLETED:
call_checkpoint = MAM_St_user_completed();
break;
case MALL_SPAWN_ADAPTED:
state = shrink_redistribution();
malleability_checkpoint();
case MALL_SPAWN_ADAPT_PENDING:
call_checkpoint = MAM_St_spawn_adapt_pending(wait_completed);
break;
case MALL_DIST_COMPLETED: //TODO Isn't this quite ugly?
//MPI_Barrier(mall->comm);
mall_conf->malleability_end = MPI_Wtime();
state = MALL_COMPLETED;
case MALL_SPAWN_ADAPTED:
case MALL_DIST_COMPLETED:
call_checkpoint = MAM_St_completed(mam_state);
break;
}
return state;
}
// Functions only needed by the benchmark
//-------------------------------------------------------------------------------------------------------------
void set_benchmark_grp(int grp) {
mall_conf->grp = grp;
if(call_checkpoint) { MAM_Checkpoint(mam_state, wait_completed, user_function, user_args); }
if(state > MALL_NOT_STARTED && state < MALL_COMPLETED) *mam_state = MAM_PENDING;
return state;
}
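/*
 * Typical driver loop (a minimal sketch; user_func and user_data are
 * assumptions): the state machine is advanced once per call until
 * "mam_state" reports completion.
 *
 *   int mam_state = MAM_NOT_STARTED;
 *   MAM_Checkpoint(&mam_state, 0, user_func, &user_data);
 *   while(mam_state == MAM_PENDING) {
 *     // Overlap application work with the pending spawn/redistribution
 *     MAM_Checkpoint(&mam_state, 0, user_func, &user_data);
 *   }
 */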
//-------------------------------------------------------------------------------------------------------------
void set_malleability_configuration(int spawn_method, int spawn_strategies, int spawn_dist, int red_method, int red_strategies) {
mall_conf->spawn_method = spawn_method;
mall_conf->spawn_strategies = spawn_strategies;
mall_conf->spawn_dist = spawn_dist;
mall_conf->red_method = red_method;
mall_conf->red_strategies = red_strategies;
/*
* TODO
*/
void MAM_Resume_redistribution(int *mam_state) {
state = MALL_USER_COMPLETED;
if(mam_state != NULL) *mam_state = MAM_PENDING;
}
/*
* To be deprecated
 * Must be called after setting the config
* TODO
*/
void set_children_number(int numC){
if((mall_conf->spawn_method == MALL_SPAWN_MERGE) && (numC >= mall->numP)) {
mall->numC = numC;
mall->numC_spawned = numC - mall->numP;
void MAM_Commit(int *mam_state) {
int zombies = 0;
#if USE_MAL_DEBUG
if(mall->myId == mall->root){ DEBUG_FUNC("Trying to commit", mall->myId, mall->numP); } fflush(stdout);
#endif
// Get times before committing
if(mall_conf->spawn_method == MALL_SPAWN_BASELINE) {
// This communication is only needed when a root process will become a zombie
malleability_times_broadcast(mall->root_collectives);
}
if(numC == mall->numP) { // Migrate
mall->numC_spawned = numC;
mall_conf->spawn_method = MALL_SPAWN_BASELINE;
// Free unneeded communicators
if(mall->tmp_comm != MPI_COMM_WORLD && mall->tmp_comm != MPI_COMM_NULL) MPI_Comm_free(&(mall->tmp_comm));
if(*(mall->user_comm) != MPI_COMM_WORLD && *(mall->user_comm) != MPI_COMM_NULL) MPI_Comm_free(mall->user_comm);
// Zombies treatment
if(mall_conf->spawn_method == MALL_SPAWN_MERGE) {
MPI_Allreduce(&mall->zombie, &zombies, 1, MPI_INT, MPI_MAX, mall->comm);
if(zombies) {
zombies_collect_suspended(mall->comm);
}
} else {
mall->numC = numC;
mall->numC_spawned = numC;
}
}
/*
* TODO
*/
void get_malleability_user_comm(MPI_Comm *comm) {
if(mall->dup_user_comm) {
if(mall->user_comm != MPI_COMM_WORLD) MPI_Comm_free(&(mall->user_comm));
MPI_Comm_dup(mall->comm, &(mall->user_comm));
MPI_Comm_set_name(mall->user_comm, "MPI_COMM_MALL_USER");
mall->dup_user_comm = 0;
// Zombies KILL
if(mall->zombie) {
#if USE_MAL_DEBUG >= 2
DEBUG_FUNC("Is terminating as zombie", mall->myId, mall->numP); fflush(stdout);
#endif
MAM_Finalize();
MPI_Finalize();
exit(0);
}
*comm = mall->user_comm;
// Reset/Free communicators
if(mall_conf->spawn_method == MALL_SPAWN_MERGE) { malleability_comms_update(mall->intercomm); }
if(mall->intercomm != MPI_COMM_NULL && mall->intercomm != MPI_COMM_WORLD) { MPI_Comm_disconnect(&(mall->intercomm)); } //FIXME Error en OpenMPI + Merge
MPI_Comm_rank(mall->comm, &mall->myId);
MPI_Comm_size(mall->comm, &mall->numP);
mall->root = mall_conf->spawn_method == MALL_SPAWN_BASELINE ? mall->root : mall->root_parents;
mall->root_parents = mall->root;
state = MALL_NOT_STARTED;
if(mam_state != NULL) *mam_state = MAM_COMPLETED;
// Set new communicator
if(mall_conf->spawn_method == MALL_SPAWN_BASELINE) { *(mall->user_comm) = MPI_COMM_WORLD; }
else if(mall_conf->spawn_method == MALL_SPAWN_MERGE) { MPI_Comm_dup(mall->comm, mall->user_comm); }
#if USE_MAL_DEBUG
if(mall->myId == mall->root) DEBUG_FUNC("Reconfiguration has been commited", mall->myId, mall->numP); fflush(stdout);
#endif
#if USE_MAL_BARRIERS
MPI_Barrier(mall->comm);
#endif
mall_conf->times->malleability_end = MPI_Wtime();
}
/*
 * Adds to the chosen concrete data structure
 * the new data set "data" with a total of "total_qty" elements.
 *
 * Variable data has to be added when it is meant to be sent, not before.
 *
 * More information in the function "add_data".
 *
 * //FIXME If it is constant it should go to asynchronous, not synchronous
* This function adds data to a data structure based on whether the operation is synchronous or asynchronous,
* and whether the data is replicated or distributed. It takes the following parameters:
* - data: a pointer to the data to be added
* - index: a pointer to a size_t variable where the index of the added data will be stored
* - total_qty: the amount of elements in data
* - type: the MPI datatype of the data
* - is_replicated: a flag indicating whether the data is replicated (MAM_DATA_REPLICATED) or not (MAM_DATA_DISTRIBUTED)
* - is_constant: a flag indicating whether the operation is asynchronous (MAM_DATA_CONSTANT) or synchronous (MAM_DATA_VARIABLE)
* Finally, it updates the index with the index of the last added data if index is not NULL.
*/
void malleability_add_data(void *data, size_t total_qty, int type, int dependency, int is_replicated, int is_constant) {
size_t total_reqs = 0;
void MAM_Data_add(void *data, size_t *index, size_t total_qty, MPI_Datatype type, int is_replicated, int is_constant) {
size_t total_reqs = 0, returned_index;
if(is_constant) {
if(is_constant) { //Async
if(is_replicated) {
add_data(data, total_qty, type, dependency, total_reqs, rep_s_data);
} else {
add_data(data, total_qty, type, dependency, total_reqs, dist_s_data);
}
} else {
if(is_replicated) {
add_data(data, total_qty, type, dependency, total_reqs, rep_a_data); //FIXME total_reqs==0 ???
total_reqs = 1;
add_data(data, total_qty, type, total_reqs, rep_a_data);
returned_index = rep_a_data->entries-1;
} else {
if(mall_conf->red_method == MALL_RED_BASELINE) {
total_reqs = 1;
} else if(mall_conf->red_method == MALL_RED_IBARRIER) { //TODO This is a strategy, not a method
total_reqs = 2;
} else if(mall_conf->red_method == MALL_RED_POINT) {
} else if(mall_conf->red_method == MALL_RED_POINT || mall_conf->red_method == MALL_RED_RMA_LOCK || mall_conf->red_method == MALL_RED_RMA_LOCKALL) {
total_reqs = mall->numC;
}
add_data(data, total_qty, type, dependency, total_reqs, dist_a_data);
add_data(data, total_qty, type, total_reqs, dist_a_data);
returned_index = dist_a_data->entries-1;
}
} else { //Sync
if(is_replicated) {
add_data(data, total_qty, type, total_reqs, rep_s_data);
returned_index = rep_s_data->entries-1;
} else {
add_data(data, total_qty, type, total_reqs, dist_s_data);
returned_index = dist_s_data->entries-1;
}
}
if(index != NULL) *index = returned_index;
}
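/*
 * Usage sketch (illustrative; "vec" and "n_global" are assumptions):
 * registering a distributed array for the asynchronous (constant) path
 * and keeping its index for retrieval after the reconfiguration.
 *
 *   size_t idx;
 *   MAM_Data_add(vec, &idx, n_global, MPI_DOUBLE, MAM_DATA_DISTRIBUTED, MAM_DATA_CONSTANT);
 */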
/*
 * Modifies, in the chosen concrete data structure, the entry at index "index"
 * with the data set "data" of a total of "total_qty" elements.
 *
 * Variable data has to be modified when it is meant to be sent, not before.
 *
 * More information in the function "modify_data".
 * //FIXME If it is constant it should go to asynchronous, not synchronous
* This function modifies a data entry to a data structure based on whether the operation is synchronous or asynchronous,
* and whether the data is replicated or distributed. It takes the following parameters:
* - data: a pointer to the data to be added
* - index: a value indicating which entry will be modified
* - total_qty: the amount of elements in data
* - type: the MPI datatype of the data
* - is_replicated: a flag indicating whether the data is replicated (MAM_DATA_REPLICATED) or not (MAM_DATA_DISTRIBUTED)
* - is_constant: a flag indicating whether the operation is asynchronous (MAM_DATA_CONSTANT) or synchronous (MAM_DATA_VARIABLE)
*/
void malleability_modify_data(void *data, size_t index, size_t total_qty, int type, int dependency, int is_replicated, int is_constant) {
void MAM_Data_modify(void *data, size_t index, size_t total_qty, MPI_Datatype type, int is_replicated, int is_constant) {
size_t total_reqs = 0;
if(is_constant) {
if(is_replicated) {
modify_data(data, index, total_qty, type, dependency, total_reqs, rep_s_data);
} else {
modify_data(data, index, total_qty, type, dependency, total_reqs, dist_s_data);
}
} else {
if(is_replicated) {
modify_data(data, index, total_qty, type, dependency, total_reqs, rep_a_data); //FIXME total_reqs==0 ???
total_reqs = 1;
modify_data(data, index, total_qty, type, total_reqs, rep_a_data); //FIXME total_reqs==0 ???
} else {
if(mall_conf->red_method == MALL_RED_BASELINE) {
total_reqs = 1;
} else if(mall_conf->red_method == MALL_RED_IBARRIER) { //TODO This is a strategy, not a method
total_reqs = 2;
} else if(mall_conf->red_method == MALL_RED_POINT) {
} else if(mall_conf->red_method == MALL_RED_POINT || mall_conf->red_method == MALL_RED_RMA_LOCK || mall_conf->red_method == MALL_RED_RMA_LOCKALL) {
total_reqs = mall->numC;
}
modify_data(data, index, total_qty, type, dependency, total_reqs, dist_a_data);
modify_data(data, index, total_qty, type, total_reqs, dist_a_data);
}
} else {
if(is_replicated) {
modify_data(data, index, total_qty, type, total_reqs, rep_s_data);
} else {
modify_data(data, index, total_qty, type, total_reqs, dist_s_data);
}
}
}
/*
 * Returns the number of entries for the chosen data
 * description structure.
 * //FIXME If it is constant it should go to asynchronous, not synchronous
* This functions returns how many data entries are available for one of the specific data structures.
* It takes the following parameters:
* - is_replicated: a flag indicating whether the structure is replicated (MAM_DATA_REPLICATED) or not (MAM_DATA_DISTRIBUTED)
* - is_constant: a flag indicating whether the operation is asynchronous (MAM_DATA_CONSTANT) or synchronous (MAM_DATA_VARIABLE)
* - entries: a pointer where the amount of entries will be stored
*/
void malleability_get_entries(size_t *entries, int is_replicated, int is_constant){
void MAM_Data_get_entries(int is_replicated, int is_constant, size_t *entries){
if(is_constant) {
if(is_replicated) {
*entries = rep_s_data->entries;
*entries = rep_a_data->entries;
} else {
*entries = dist_s_data->entries;
*entries = dist_a_data->entries;
}
} else {
if(is_replicated) {
*entries = rep_a_data->entries;
*entries = rep_s_data->entries;
} else {
*entries = dist_a_data->entries;
*entries = dist_s_data->entries;
}
}
}
/*
 * Returns the element at position "index" of the list to the user.
 * Elements are returned in the same order in which the parents added them
 * with the function "malleability_add_data()".
 * It is the user's job to know the type of that data.
 * TODO Refactor so it is automatic
 * //FIXME If it is constant it should go to asynchronous, not synchronous
* This function returns a data entry to a data structure based on whether the operation is synchronous or asynchronous,
* and whether the data is replicated or distributed. It takes the following parameters:
* - index: a value indicating which entry will be modified
* - is_replicated: a flag indicating whether the data is replicated (MAM_DATA_REPLICATED) or not (MAM_DATA_DISTRIBUTED)
* - is_constant: a flag indicating whether the operation is asynchronous (MAM_DATA_CONSTANT) or synchronous (MAM_DATA_VARIABLE)
* - data: a pointer where the data will be stored. The user must free it
 * - total_qty: a pointer where the amount of elements in data for all ranks will be stored
 * - type: a pointer where the MPI datatype of the data will be stored
*/
void malleability_get_data(void **data, size_t index, int is_replicated, int is_constant) {
void MAM_Data_get_pointer(void **data, size_t index, size_t *total_qty, MPI_Datatype *type, int is_replicated, int is_constant) {
malleability_data_t *data_struct;
if(is_constant) {
if(is_replicated) {
data_struct = rep_s_data;
data_struct = rep_a_data;
} else {
data_struct = dist_s_data;
data_struct = dist_a_data;
}
} else {
if(is_replicated) {
data_struct = rep_a_data;
data_struct = rep_s_data;
} else {
data_struct = dist_a_data;
data_struct = dist_s_data;
}
}
*data = data_struct->arrays[index];
*total_qty = data_struct->qty[index];
*type = data_struct->types[index];
//get_block_dist(qty, mall->myId, mall->numP, &dist_data); //FIXME Make sure numP is correct
}
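/*
 * Usage sketch (illustrative; "idx" is the index returned by MAM_Data_add):
 *
 *   void *raw; size_t qty; MPI_Datatype type;
 *   MAM_Data_get_pointer(&raw, idx, &qty, &type, MAM_DATA_DISTRIBUTED, MAM_DATA_CONSTANT);
 *   double *vec = (double *) raw; // qty holds the element count for all ranks
 */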
/*
* @brief Returns a structure to perform data redistribution during a reconfiguration.
*
* This function is intended to be called when the state of MaM is MALL_USER_PENDING only.
* It is designed to provide the necessary information for the user to perform data redistribution.
*
* Parameters:
* - mam_user_reconf_t *reconf_info: A pointer to a mam_user_reconf_t structure where the function will store the required information for data redistribution.
*
* Return Value:
* - MAM_OK: If the function successfully retrieves the reconfiguration information.
* - MALL_DENIED: If the function is called when the state of the MaM is not MALL_USER_PENDING.
*/
int MAM_Get_Reconf_Info(mam_user_reconf_t *reconf_info) {
if(state != MALL_USER_PENDING) return MALL_DENIED;
*reconf_info = *user_reconf;
return MAM_OK;
}
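/*
 * Sketch of a user redistribution callback (a minimal example; the body of
 * the callback is an assumption): it queries the reconfiguration info,
 * moves its own data using the fields of mam_user_reconf_t, and then hands
 * control back to MaM.
 *
 *   void user_func(void *args) {
 *     mam_user_reconf_t info;
 *     if(MAM_Get_Reconf_Info(&info) != MAM_OK) return;
 *     // ... redistribute user-managed data with the fields of "info" ...
 *     MAM_Resume_redistribution(NULL);
 *   }
 */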
/*
* @brief Returns the times used for the different steps of last reconfiguration.
*
* This function is intended to be called when a reconfiguration has ended.
* It is designed to provide the necessary information for the user to perform data redistribution.
*
* Parameters:
* - double *sp_time: A pointer where the spawn time will be saved.
 * - double *sy_time: A pointer where the synchronous data redistribution time will be saved.
 * - double *asy_time: A pointer where the asynchronous data redistribution time will be saved.
* - double *mall_time: A pointer where the malleability time will be saved.
*/
void MAM_Retrieve_times(double *sp_time, double *sy_time, double *asy_time, double *mall_time) {
MAM_I_retrieve_times(sp_time, sy_time, asy_time, mall_time);
}
//======================================================||
//================PRIVATE FUNCTIONS=====================||
......@@ -433,7 +478,6 @@ void malleability_get_data(void **data, size_t index, int is_replicated, int is_
//======================================================||
//======================================================||
/*
 * Generalized function to send data to the children.
 * The asynchronicity refers to whether the parent and child threads do it
......@@ -444,19 +488,18 @@ void send_data(int numP_children, malleability_data_t *data_struct, int is_async
void *aux_send, *aux_recv;
if(is_asynchronous) {
i= dep_not_send ? 0 : data_struct->entries - 2; //FIXME DELETE
for(; i < data_struct->entries; i++) {
if(data_struct->dependencies[i] == 1+MAL_DATA_DEPENDENT && dep_not_send && mall_conf->spawn_method == MALL_SPAWN_MERGE) break; //FIXME DELETE dep_not_send
for(i=0; i < data_struct->entries; i++) {
aux_send = data_struct->arrays[i];
aux_recv = NULL;
async_communication(aux_send, &aux_recv, data_struct->qty[i], data_struct->types[i], data_struct->dependencies[i], mall->myId, mall->numP, numP_children, MALLEABILITY_NOT_CHILDREN, mall_conf->red_method, mall_conf->red_strategies, mall->intercomm, &(data_struct->requests[i]), &(data_struct->request_qty[i]));
async_communication_start(aux_send, &aux_recv, data_struct->qty[i], data_struct->types[i], mall->numP, numP_children, MALLEABILITY_NOT_CHILDREN,
mall->intercomm, &(data_struct->requests[i]), &(data_struct->request_qty[i]), &(data_struct->windows[i]));
if(aux_recv != NULL) data_struct->arrays[i] = aux_recv;
}
} else {
for(i=0; i < data_struct->entries; i++) {
aux_send = data_struct->arrays[i];
aux_recv = NULL;
sync_communication(aux_send, &aux_recv, data_struct->qty[i], data_struct->types[i], data_struct->dependencies[i], mall->myId, mall->numP, numP_children, MALLEABILITY_NOT_CHILDREN, mall_conf->red_method, mall->intercomm);
sync_communication(aux_send, &aux_recv, data_struct->qty[i], data_struct->types[i], mall->numP, numP_children, MALLEABILITY_NOT_CHILDREN, mall->intercomm);
if(aux_recv != NULL) data_struct->arrays[i] = aux_recv;
}
}
......@@ -474,91 +517,287 @@ void recv_data(int numP_parents, malleability_data_t *data_struct, int is_asynch
if(is_asynchronous) {
for(i=0; i < data_struct->entries; i++) {
aux = data_struct->arrays[i];
async_communication(aux_s, &aux, data_struct->qty[i], data_struct->types[i], data_struct->dependencies[i], mall->myId, mall->numP, numP_parents, MALLEABILITY_CHILDREN, mall_conf->red_method, mall_conf->red_strategies, mall->intercomm, &(data_struct->requests[i]), &(data_struct->request_qty[i]));
async_communication_start(aux_s, &aux, data_struct->qty[i], data_struct->types[i], mall->numP, numP_parents, MALLEABILITY_CHILDREN,
mall->intercomm, &(data_struct->requests[i]), &(data_struct->request_qty[i]), &(data_struct->windows[i]));
data_struct->arrays[i] = aux;
}
} else {
for(i=0; i < data_struct->entries; i++) {
aux = data_struct->arrays[i];
sync_communication(aux_s, &aux, data_struct->qty[i], data_struct->types[i], data_struct->dependencies[i], mall->myId, mall->numP, numP_parents, MALLEABILITY_CHILDREN, mall_conf->red_method, mall->intercomm);
sync_communication(aux_s, &aux, data_struct->qty[i], data_struct->types[i], mall->numP, numP_parents, MALLEABILITY_CHILDREN, mall->intercomm);
data_struct->arrays[i] = aux;
}
}
}
//======================================================||
//================PRIVATE FUNCTIONS=====================||
//====================MAM STAGES========================||
//======================================================||
//======================================================||
//======================================================||
//======================================================||
//======================================================||
//======================================================||
int MAM_St_rms(int *mam_state) {
reset_malleability_times();
#if USE_MAL_BARRIERS
MPI_Barrier(mall->comm);
#endif
mall_conf->times->malleability_start = MPI_Wtime();
*mam_state = MAM_NOT_STARTED;
state = MALL_RMS_COMPLETED;
MAM_Check_configuration();
mall->wait_targets_posted = 0;
//if(CHECK_RMS()) {return MALL_DENIED;}
return 1;
}
int MAM_St_spawn_start() {
mall->num_parents = mall->numP;
state = spawn_step();
//FIXME This is necessary but ugly
if(mall_conf->spawn_method == MALL_SPAWN_MERGE && mall->myId >= mall->numC){ mall->zombie = 1; }
else if(mall_conf->spawn_method == MALL_SPAWN_BASELINE){ mall->zombie = 1; }
if (state == MALL_SPAWN_COMPLETED || state == MALL_SPAWN_ADAPT_POSTPONE){
return 1;
}
return 0;
}
int MAM_St_spawn_pending(int wait_completed) {
state = check_spawn_state(&(mall->intercomm), mall->comm, wait_completed);
if (state == MALL_SPAWN_COMPLETED || state == MALL_SPAWN_ADAPTED) {
#if USE_MAL_BARRIERS
MPI_Barrier(mall->comm);
#endif
mall_conf->times->spawn_time = MPI_Wtime() - mall_conf->times->malleability_start;
return 1;
}
return 0;
}
int MAM_St_red_start() {
if(MAM_Contains_strat(MAM_SPAWN_STRATEGIES, MAM_STRAT_SPAWN_INTERCOMM, NULL)) {
mall->root_collectives = mall->myId == mall->root ? MPI_ROOT : MPI_PROC_NULL;
} else {
mall->root_collectives = mall->root;
}
state = start_redistribution();
return 1;
}
int MAM_St_red_pending(int *mam_state, int wait_completed) {
if(MAM_Contains_strat(MAM_RED_STRATEGIES, MAM_STRAT_RED_PTHREAD, NULL)) {
state = thread_check(wait_completed);
} else {
state = check_redistribution(wait_completed);
}
if(state != MALL_DIST_PENDING) {
if(MAM_Contains_strat(MAM_SPAWN_STRATEGIES, MAM_STRAT_SPAWN_INTERCOMM, NULL)) {
MPI_Intercomm_merge(mall->intercomm, MALLEABILITY_NOT_CHILDREN, &mall->tmp_comm); //The group passing 0 goes first
} else {
MPI_Comm_dup(mall->intercomm, &mall->tmp_comm);
}
MPI_Comm_set_name(mall->tmp_comm, "MAM_USER_TMP");
state = MALL_USER_PENDING;
*mam_state = MAM_USER_PENDING;
return 1;
}
return 0;
}
int MAM_St_user_pending(int *mam_state, int wait_completed, void (*user_function)(void *), void *user_args) {
#if USE_MAL_DEBUG
if(mall->myId == mall->root) DEBUG_FUNC("Starting USER redistribution", mall->myId, mall->numP); fflush(stdout);
#endif
if(user_function != NULL) {
MAM_I_create_user_struct(MALLEABILITY_NOT_CHILDREN);
do {
user_function(user_args);
} while(wait_completed && state == MALL_USER_PENDING);
} else {
MAM_Resume_redistribution(mam_state);
}
if(state != MALL_USER_PENDING) {
#if USE_MAL_DEBUG
if(mall->myId == mall->root) DEBUG_FUNC("Ended USER redistribution", mall->myId, mall->numP); fflush(stdout);
#endif
return 1;
}
return 0;
}
int MAM_St_user_completed() {
state = end_redistribution();
return 1;
}
int MAM_St_spawn_adapt_pending(int wait_completed) {
wait_completed = 1;
#if USE_MAL_BARRIERS
MPI_Barrier(mall->comm);
#endif
mall_conf->times->spawn_start = MPI_Wtime();
unset_spawn_postpone_flag(state);
state = check_spawn_state(&(mall->intercomm), mall->comm, wait_completed);
/* TODO Describe the problem; basically state that it is not possible in the
* current form. Moreover, it only affects an operation we have seen to be
* "extremely" fast
if(!MAM_Contains_strat(MAM_SPAWN_STRATEGIES, MAM_STRAT_SPAWN_PTHREAD, NULL)) {
#if USE_MAL_BARRIERS
MPI_Barrier(mall->comm);
#endif
mall_conf->times->spawn_time = MPI_Wtime() - mall_conf->times->spawn_start;
return 1;
}
return 0;
*/
#if USE_MAL_BARRIERS
MPI_Barrier(mall->comm);
#endif
mall_conf->times->spawn_time = MPI_Wtime() - mall_conf->times->spawn_start;
return 1;
}
int MAM_St_completed(int *mam_state) {
MAM_Commit(mam_state);
return 0;
}
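/*
* Summary of the stage chain the handlers above implement (a sketch derived
* from the handlers themselves; the dispatcher that calls them is not shown
* in this hunk): MAM_St_rms -> MAM_St_spawn_start -> MAM_St_spawn_pending
* -> MAM_St_red_start -> MAM_St_red_pending -> MAM_St_user_pending
* -> MAM_St_user_completed [-> MAM_St_spawn_adapt_pending on Merge Shrink]
* -> MAM_St_completed.
*/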
//======================================================||
//================PRIVATE FUNCTIONS=====================||
//=====================CHILDREN=========================||
//======================================================||
//======================================================||
//======================================================||
//======================================================||
//======================================================||
//======================================================||
/*
* Initialization of the children's data.
* Here, data is received from the parents: the configuration
* of the execution to perform, and the data to receive from them,
* whether synchronously, asynchronously, or both.
*/
void Children_init(void (*user_function)(void *), void *user_args) {
size_t i;
#if USE_MAL_DEBUG
DEBUG_FUNC("MaM will now initialize spawned processes", mall->myId, mall->numP); fflush(stdout); MPI_Barrier(MPI_COMM_WORLD);
#endif
malleability_connect_children(mall->comm, &(mall->intercomm));
if(mall_conf->spawn_method == MALL_SPAWN_MERGE) { // For Merge Method, these processes will be added
MPI_Comm_rank(mall->intercomm, &mall->myId);
MPI_Comm_size(mall->intercomm, &mall->numP);
}
mall->root_collectives = mall->root_parents;
#if USE_MAL_DEBUG
DEBUG_FUNC("Spawned have completed spawn step", mall->myId, mall->numP); fflush(stdout); MPI_Barrier(MPI_COMM_WORLD);
#endif
comm_data_info(rep_a_data, dist_a_data, MALLEABILITY_CHILDREN);
if(dist_a_data->entries || rep_a_data->entries) { // Receive asynchronous data
#if USE_MAL_DEBUG >= 2
DEBUG_FUNC("Spawned start asynchronous redistribution", mall->myId, mall->numP); fflush(stdout); MPI_Barrier(MPI_COMM_WORLD);
#endif
#if USE_MAL_BARRIERS
MPI_Barrier(mall->intercomm);
#endif
if(MAM_Contains_strat(MAM_RED_STRATEGIES, MAM_STRAT_RED_PTHREAD, NULL)) {
recv_data(mall->num_parents, dist_a_data, MALLEABILITY_USE_SYNCHRONOUS);
for(i=0; i<rep_a_data->entries; i++) {
MPI_Bcast(rep_a_data->arrays[i], rep_a_data->qty[i], rep_a_data->types[i], mall->root_collectives, mall->intercomm);
}
} else {
recv_data(mall->num_parents, dist_a_data, MALLEABILITY_USE_ASYNCHRONOUS);
for(i=0; i<rep_a_data->entries; i++) {
MPI_Ibcast(rep_a_data->arrays[i], rep_a_data->qty[i], rep_a_data->types[i], mall->root_collectives, mall->intercomm, &(rep_a_data->requests[i][0]));
}
#if USE_MAL_DEBUG >= 2
DEBUG_FUNC("Spawned started asynchronous redistribution", mall->myId, mall->numP); fflush(stdout); MPI_Barrier(MPI_COMM_WORLD);
#endif
for(i=0; i<rep_a_data->entries; i++) {
async_communication_wait(rep_a_data->requests[i], rep_a_data->request_qty[i]);
}
for(i=0; i<dist_a_data->entries; i++) {
async_communication_wait(dist_a_data->requests[i], dist_a_data->request_qty[i]);
}
if(MAM_Contains_strat(MAM_RED_STRATEGIES, MAM_STRAT_RED_WAIT_TARGETS, NULL)) {
MPI_Ibarrier(mall->intercomm, &mall->wait_targets);
mall->wait_targets_posted = 1;
MPI_Wait(&mall->wait_targets, MPI_STATUS_IGNORE);
}
#if USE_MAL_DEBUG >= 2
DEBUG_FUNC("Spawned waited for all asynchronous redistributions", mall->myId, mall->numP); fflush(stdout); MPI_Barrier(MPI_COMM_WORLD);
#endif
for(i=0; i<dist_a_data->entries; i++) {
async_communication_end(dist_a_data->requests[i], dist_a_data->request_qty[i], &(dist_a_data->windows[i]));
}
for(i=0; i<rep_a_data->entries; i++) {
async_communication_end(rep_a_data->requests[i], rep_a_data->request_qty[i], &(rep_a_data->windows[i]));
}
}
#if USE_MAL_BARRIERS
MPI_Barrier(mall->intercomm);
#endif
mall_conf->times->async_end= MPI_Wtime(); // Timestamp for the end of the asynchronous communication
}
#if USE_MAL_DEBUG
DEBUG_FUNC("Spawned have completed asynchronous data redistribution step", mall->myId, mall->numP); fflush(stdout); MPI_Barrier(MPI_COMM_WORLD);
#endif
if(MAM_Contains_strat(MAM_SPAWN_STRATEGIES, MAM_STRAT_SPAWN_INTERCOMM, NULL)) {
MPI_Intercomm_merge(mall->intercomm, MALLEABILITY_CHILDREN, &mall->tmp_comm); //The group passing 0 as "high" goes first
} else {
MPI_Comm_dup(mall->intercomm, &mall->tmp_comm);
}
MPI_Comm_set_name(mall->tmp_comm, "MAM_USER_TMP");
if(user_function != NULL) {
state = MALL_USER_PENDING;
MAM_I_create_user_struct(MALLEABILITY_CHILDREN);
user_function(user_args);
}
comm_data_info(rep_s_data, dist_s_data, MALLEABILITY_CHILDREN);
if(dist_s_data->entries || rep_s_data->entries) { // Receive synchronous data
#if USE_MAL_BARRIERS
MPI_Barrier(mall->intercomm);
#endif
recv_data(mall->num_parents, dist_s_data, MALLEABILITY_USE_SYNCHRONOUS);
for(i=0; i<rep_s_data->entries; i++) {
MPI_Bcast(rep_s_data->arrays[i], rep_s_data->qty[i], rep_s_data->types[i], mall->root_collectives, mall->intercomm);
}
#if USE_MAL_BARRIERS
MPI_Barrier(mall->intercomm);
#endif
mall_conf->times->sync_end = MPI_Wtime(); // Timestamp for the end of the synchronous communication
}
MPI_Comm_disconnect(&(mall->intercomm)); //FIXME Error with OpenMPI + Merge
#if USE_MAL_DEBUG
DEBUG_FUNC("Targets have completed synchronous data redistribution step", mall->myId, mall->numP); fflush(stdout); MPI_Barrier(MPI_COMM_WORLD);
#endif
MAM_Commit(NULL);
#if USE_MAL_DEBUG
DEBUG_FUNC("MaM has been initialized correctly for new ranks", mall->myId, mall->numP); fflush(stdout); MPI_Barrier(MPI_COMM_WORLD);
#endif
}
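/*
* Illustrative sketch (not part of this commit) of the user_function hook
* that Children_init and MAM_St_user_pending invoke. Only MAM_Get_Reconf_Info,
* mam_user_reconf_t and MAM_Resume_redistribution come from the MAM interface
* declared further below; the function name and the redistribution body are
* placeholders.
*/
void example_user_redist(void *user_args) {
int mam_state;
mam_user_reconf_t reconf;
MAM_Get_Reconf_Info(&reconf);
// Redistribute application-owned data from reconf.numS sources to
// reconf.numT targets over reconf.comm (named "MAM_USER_TMP" above).
if(reconf.rank_state != MAM_PROC_ZOMBIE) {
// ...post the application's sends/receives on reconf.comm...
}
MAM_Resume_redistribution(&mam_state); // Hand control back to MaM
}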
//======================================================||
@@ -566,20 +805,26 @@ void Children_init() {
//=====================PARENTS==========================||
//======================================================||
//======================================================||
//======================================================||
//======================================================||
/*
* Performs the creation of the child processes.
* If requested in the background, it returns the current state.
*/
int spawn_step(){
#if USE_MAL_BARRIERS
MPI_Barrier(mall->comm);
#endif
mall_conf->times->spawn_start = MPI_Wtime();
state = init_spawn(mall->thread_comm, &(mall->intercomm));
if(!MAM_Contains_strat(MAM_SPAWN_STRATEGIES, MAM_STRAT_SPAWN_PTHREAD, NULL)) {
#if USE_MAL_BARRIERS
MPI_Barrier(mall->comm);
#endif
mall_conf->times->spawn_time = MPI_Wtime() - mall_conf->times->malleability_start;
}
return state;
}
@@ -600,40 +845,36 @@ int spawn_step(){
* process groups.
*/
int start_redistribution() {
size_t i;
if(mall->intercomm == MPI_COMM_NULL) {
// If no communicator has been created, it is because the Spawn was postponed
// and this is the Merge Shrink spawn
MPI_Comm_dup(mall->comm, &(mall->intercomm));
}
comm_data_info(rep_a_data, dist_a_data, MALLEABILITY_NOT_CHILDREN);
if(dist_a_data->entries || rep_a_data->entries) { // Send asynchronous data
#if USE_MAL_BARRIERS
MPI_Barrier(mall->intercomm);
#endif
mall_conf->times->async_start = MPI_Wtime();
if(MAM_Contains_strat(MAM_RED_STRATEGIES, MAM_STRAT_RED_PTHREAD, NULL)) {
return thread_creation();
} else {
send_data(mall->numC, dist_a_data, MALLEABILITY_USE_ASYNCHRONOUS);
for(i=0; i<rep_a_data->entries; i++) {
MPI_Ibcast(rep_a_data->arrays[i], rep_a_data->qty[i], rep_a_data->types[i], mall->root_collectives, mall->intercomm, &(rep_a_data->requests[i][0]));
}
if(mall->zombie && MAM_Contains_strat(MAM_RED_STRATEGIES, MAM_STRAT_RED_WAIT_TARGETS, NULL)) {
MPI_Ibarrier(mall->intercomm, &mall->wait_targets);
mall->wait_targets_posted = 1;
}
return MALL_DIST_PENDING;
}
}
return MALL_USER_PENDING;
}
@@ -649,57 +890,90 @@ int start_redistribution() {
* finished when the parents finish sending.
* If the "MAL_USE_IBARRIER" mode is used, it is considered finished once
* the children have finished receiving.
*/
int check_redistribution(int wait_completed) {
int completed, local_completed, all_completed;
size_t i, req_qty;
MPI_Request *req_completed;
MPI_Win window;
local_completed = 1;
#if USE_MAL_DEBUG >= 2
DEBUG_FUNC("Sources are testing for all asynchronous redistributions", mall->myId, mall->numP); fflush(stdout); MPI_Barrier(MPI_COMM_WORLD);
#endif
if(wait_completed) {
if(MAM_Contains_strat(MAM_RED_STRATEGIES, MAM_STRAT_RED_WAIT_TARGETS, NULL) && !mall->wait_targets_posted) {
MPI_Ibarrier(mall->intercomm, &mall->wait_targets);
mall->wait_targets_posted = 1;
}
for(i=0; i<dist_a_data->entries; i++) {
req_completed = dist_a_data->requests[i];
req_qty = dist_a_data->request_qty[i];
async_communication_wait(req_completed, req_qty);
}
for(i=0; i<rep_a_data->entries; i++) {
req_completed = rep_a_data->requests[i];
req_qty = rep_a_data->request_qty[i];
async_communication_wait(req_completed, req_qty);
}
if(MAM_Contains_strat(MAM_RED_STRATEGIES, MAM_STRAT_RED_WAIT_TARGETS, NULL)) { MPI_Wait(&mall->wait_targets, MPI_STATUS_IGNORE); }
} else {
if(mall->wait_targets_posted) {
MPI_Test(&mall->wait_targets, &local_completed, MPI_STATUS_IGNORE);
} else {
for(i=0; i<dist_a_data->entries; i++) {
req_completed = dist_a_data->requests[i];
req_qty = dist_a_data->request_qty[i];
completed = async_communication_check(MALLEABILITY_NOT_CHILDREN, req_completed, req_qty);
local_completed = local_completed && completed;
}
for(i=0; i<rep_a_data->entries; i++) {
req_completed = rep_a_data->requests[i];
req_qty = rep_a_data->request_qty[i];
completed = async_communication_check(MALLEABILITY_NOT_CHILDREN, req_completed, req_qty);
local_completed = local_completed && completed;
}
if(local_completed && MAM_Contains_strat(MAM_RED_STRATEGIES, MAM_STRAT_RED_WAIT_TARGETS, NULL)) {
MPI_Ibarrier(mall->intercomm, &mall->wait_targets);
mall->wait_targets_posted = 1;
MPI_Test(&mall->wait_targets, &local_completed, MPI_STATUS_IGNORE); //TODO - Figure out if the last process benefits from calling this here
}
}
#if USE_MAL_DEBUG >= 2
DEBUG_FUNC("Sources will now check a global decision", mall->myId, mall->numP); fflush(stdout); MPI_Barrier(MPI_COMM_WORLD);
#endif
MPI_Allreduce(&local_completed, &all_completed, 1, MPI_INT, MPI_MIN, mall->comm);
if(!all_completed) return MALL_DIST_PENDING; // Continue only if asynchronous send has ended
}
#if USE_MAL_DEBUG >= 2
DEBUG_FUNC("Sources sent asynchronous redistributions", mall->myId, mall->numP); fflush(stdout); MPI_Barrier(MPI_COMM_WORLD);
#endif
//To disconnect both process groups, MPI must be informed that this
//communication has finished, even though this point can only be reached
//once it has already finished
for(i=0; i<dist_a_data->entries; i++) {
req_completed = dist_a_data->requests[i];
req_qty = dist_a_data->request_qty[i];
window = dist_a_data->windows[i];
async_communication_end(req_completed, req_qty, &window);
}
for(i=0; i<rep_a_data->entries; i++) {
req_completed = rep_a_data->requests[i];
req_qty = rep_a_data->request_qty[i];
window = rep_a_data->windows[i];
async_communication_end(req_completed, req_qty, &window);
}
#if USE_MAL_BARRIERS
MPI_Barrier(mall->intercomm);
#endif
if(mall_conf->spawn_method == MALL_SPAWN_MERGE) mall_conf->times->async_end = MPI_Wtime(); // Merge method only
return MALL_USER_PENDING;
}
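/*
* Note on MAM_STRAT_RED_WAIT_TARGETS: the sources post an MPI_Ibarrier on
* the intercommunicator once their local asynchronous operations complete
* (the targets post it after receiving, see Children_init), so testing the
* barrier request provides a cheap global "targets have received" check.
*/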
/*
* Finishes the data redistribution with the children, checking
* whether iterations with background communications have been performed
@@ -710,137 +984,34 @@ int check_redistribution() {
*/
int end_redistribution() {
size_t i;
int local_state;
comm_data_info(rep_s_data, dist_s_data, MALLEABILITY_NOT_CHILDREN);
if(dist_s_data->entries || rep_s_data->entries) { // Send synchronous data
#if USE_MAL_BARRIERS
MPI_Barrier(mall->intercomm);
#endif
mall_conf->times->sync_start = MPI_Wtime();
send_data(mall->numC, dist_s_data, MALLEABILITY_USE_SYNCHRONOUS);
for(i=0; i<rep_s_data->entries; i++) {
MPI_Bcast(rep_s_data->arrays[i], rep_s_data->qty[i], rep_s_data->types[i], mall->root_collectives, mall->intercomm);
}
#if USE_MAL_BARRIERS
MPI_Barrier(mall->intercomm);
#endif
if(mall_conf->spawn_method == MALL_SPAWN_MERGE) mall_conf->times->sync_end = MPI_Wtime(); // Merge method only
}
local_state = MALL_DIST_COMPLETED;
if(mall_conf->spawn_method == MALL_SPAWN_MERGE && mall->numP > mall->numC) { // Merge Shrink
local_state = MALL_SPAWN_ADAPT_PENDING;
}
if(mall->intercomm != MPI_COMM_NULL && mall->intercomm != MPI_COMM_WORLD) {
MPI_Comm_disconnect(&(mall->intercomm)); //FIXME Error with OpenMPI + Merge
}
return local_state;
}
//======================================================||
//================PRIVATE FUNCTIONS=====================||
@@ -869,23 +1040,32 @@ int thread_creation() {
*
* The communication state is returned when the function finishes.
*/
int thread_check(int wait_completed) {
int all_completed = 0;
if(wait_completed && comm_state == MALL_DIST_PENDING) {
if(pthread_join(mall->async_thread, NULL)) {
printf("Error al esperar al hilo\n");
MPI_Abort(MPI_COMM_WORLD, -1);
return -2;
}
}
// Check that all threads have finished the distribution (same value in commAsync)
MPI_Allreduce(&comm_state, &all_completed, 1, MPI_INT, MPI_MAX, mall->comm);
if(all_completed != MALL_DIST_COMPLETED) return MALL_DIST_PENDING; // Continue only if asynchronous send has ended
//FIXME The MALL_APP_ENDED state is not taken into account
if(pthread_join(mall->async_thread, NULL)) {
printf("Error while waiting for the thread\n");
MPI_Abort(MPI_COMM_WORLD, -1);
return -2;
}
#if USE_MAL_BARRIERS
MPI_Barrier(mall->intercomm);
#endif
if(mall_conf->spawn_method == MALL_SPAWN_MERGE) mall_conf->times->async_end = MPI_Wtime(); // Merge method only
return MALL_USER_PENDING;
}
@@ -898,7 +1078,12 @@ int thread_check() {
* by the value "commAsync".
*/
void* thread_async_work() {
size_t i;
send_data(mall->numC, dist_a_data, MALLEABILITY_USE_SYNCHRONOUS);
for(i=0; i<rep_a_data->entries; i++) {
MPI_Bcast(rep_a_data->arrays[i], rep_a_data->qty[i], rep_a_data->types[i], mall->root_collectives, mall->intercomm);
}
comm_state = MALL_DIST_COMPLETED;
pthread_exit(NULL);
}
@@ -914,8 +1099,8 @@ void print_comms_state() {
MPI_Comm_get_name(mall->comm, test, &tester);
printf("P%d Comm=%d Name=%s\n", mall->myId, mall->comm, test);
MPI_Comm_get_name(*(mall->user_comm), test, &tester);
printf("P%d Comm=%d Name=%s\n", mall->myId, *(mall->user_comm), test);
if(mall->intercomm != MPI_COMM_NULL) {
MPI_Comm_get_name(mall->intercomm, test, &tester);
printf("P%d Comm=%d Name=%s\n", mall->myId, mall->intercomm, test);
@@ -923,71 +1108,34 @@
free(test);
}
/*
* Function only needed for Merge
*/
void malleability_comms_update(MPI_Comm comm) {
if(mall->thread_comm != MPI_COMM_WORLD) MPI_Comm_free(&(mall->thread_comm));
if(mall->comm != MPI_COMM_WORLD) MPI_Comm_free(&(mall->comm));
MPI_Comm_dup(comm, &(mall->thread_comm));
MPI_Comm_dup(comm, &(mall->comm));
MPI_Comm_set_name(mall->thread_comm, "MAM_THREAD");
MPI_Comm_set_name(mall->comm, "MAM_MAIN");
}
void MAM_I_create_user_struct(int is_children_group) {
user_reconf->comm = mall->tmp_comm;
if(is_children_group) {
user_reconf->rank_state = MAM_PROC_NEW_RANK;
user_reconf->numS = mall->num_parents;
user_reconf->numT = mall->numP;
} else {
user_reconf->numS = mall->numP;
user_reconf->numT = mall->numC;
if(mall->zombie) user_reconf->rank_state = MAM_PROC_ZOMBIE;
else user_reconf->rank_state = MAM_PROC_CONTINUE;
}
}
@@ -9,21 +9,25 @@
#include <mpi.h>
#include "malleabilityStates.h"
typedef struct {
int numS, numT;
int rank_state;
MPI_Comm comm;
} mam_user_reconf_t;
int MAM_Init(int root, MPI_Comm *comm, char *name_exec, void (*user_function)(void *), void *user_args);
void MAM_Finalize();
int MAM_Checkpoint(int *mam_state, int wait_completed, void (*user_function)(void *), void *user_args);
void MAM_Resume_redistribution(int *mam_state);
int MAM_Get_Reconf_Info(mam_user_reconf_t *reconf_info);
void MAM_Data_add(void *data, size_t *index, size_t total_qty, MPI_Datatype type, int is_replicated, int is_constant);
void MAM_Data_modify(void *data, size_t index, size_t total_qty, MPI_Datatype type, int is_replicated, int is_constant);
void MAM_Data_get_entries(int is_replicated, int is_constant, size_t *entries);
void MAM_Data_get_pointer(void **data, size_t index, size_t *total_qty, MPI_Datatype *type, int is_replicated, int is_constant);
void MAM_Retrieve_times(double *sp_time, double *sy_time, double *asy_time, double *mall_time);
#endif
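/*
* Illustrative usage sketch of the public interface above (not part of this
* commit, hence guarded out with #if 0). The main() skeleton and the
* iteration body are assumptions; only the MAM_* calls, states and constants
* come from the declarations above.
*/
#if 0
#include <mpi.h>
#include "MAM.h"

int main(int argc, char *argv[]) {
int mam_state;
size_t index;
double data[100] = {0};
MPI_Comm comm = MPI_COMM_WORLD;

MPI_Init(&argc, &argv);
MAM_Init(MALLEABILITY_ROOT, &comm, argv[0], NULL, NULL);
MAM_Data_add(data, &index, 100, MPI_DOUBLE, MAM_DATA_DISTRIBUTED, MAM_DATA_CONSTANT);

MAM_Checkpoint(&mam_state, MAM_CHECK_COMPLETION, NULL, NULL); // Request a reconfiguration
while(mam_state == MAM_PENDING || mam_state == MAM_USER_PENDING) {
// ...application iterations overlapped with the background redistribution...
MAM_Checkpoint(&mam_state, MAM_CHECK_COMPLETION, NULL, NULL);
}

MAM_Finalize();
MPI_Finalize();
return 0;
}
#endif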
#define _GNU_SOURCE
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sched.h>
#include <mpi.h>
#include "malleabilityRMS.h"
#include "malleabilityDataStructures.h"
#if USE_MAL_SLURM
#include <slurm/slurm.h>
int MAM_I_slurm_getenv_hosts_info();
int MAM_I_slurm_getjob_hosts_info();
#endif
int MAM_I_get_hosts_info();
int GetCPUCount();
void MAM_check_hosts() {
int not_filled = 1;
#if USE_MAL_SLURM
not_filled = MAM_I_slurm_getenv_hosts_info();
if(not_filled) {
if(mall->nodelist != NULL) {
free(mall->nodelist);
mall->nodelist = NULL;
}
not_filled = MAM_I_slurm_getjob_hosts_info();
}
#endif
if(not_filled) {
if(mall->nodelist != NULL) {
free(mall->nodelist);
mall->nodelist = NULL;
}
not_filled = MAM_I_get_hosts_info();
}
if(not_filled) {
if(mall->myId == mall->root) printf("MAM FATAL ERROR: It has not been possible to obtain the nodelist\n");
fflush(stdout);
MPI_Abort(mall->comm, -50);
}
#if USE_MAL_DEBUG >= 2
if(mall->myId == mall->root) {
DEBUG_FUNC("Obtained Nodelist", mall->myId, mall->numP);
printf("NODELIST: %s\nNODE_COUNT: %d NUM_CPUS_PER_NODE: %d\n", mall->nodelist, mall->num_nodes, mall->num_cpus);
fflush(stdout);
}
#endif
}
/*
* TODO
* FIXME Does not consider heterogeneous machines for num_cpus
* FIXME Always returns 0... -- Perform error checking?
*/
int MAM_I_get_hosts_info() {
int i, j, name_len, max_name_len, unique_count, *unique_hosts;
char *my_host, *all_hosts, *confirmed_host, *tested_host;
all_hosts = NULL;
my_host = (char *) malloc(MPI_MAX_PROCESSOR_NAME * sizeof(char));
MPI_Get_processor_name(my_host, &name_len);
MPI_Allreduce(&name_len, &max_name_len, 1, MPI_INT, MPI_MAX, mall->comm);
my_host[max_name_len] = '\0';
max_name_len++; // Len does not consider terminating character
if(mall->myId == mall->root) {
all_hosts = (char *) malloc(mall->numP * max_name_len * sizeof(char));
unique_hosts = (int *) malloc(mall->numP * sizeof(int));
unique_hosts[0] = 0; //First host will always be unique
unique_count = 1;
}
MPI_Gather(my_host, max_name_len, MPI_CHAR, all_hosts, max_name_len, MPI_CHAR, mall->root, mall->comm);
if(mall->myId == mall->root) {
for (i = 1; i < mall->numP; i++) {
int unique = 1;
tested_host = all_hosts + (i * max_name_len);
for (j = 0; j < unique_count; j++) {
confirmed_host = all_hosts + (unique_hosts[j] * max_name_len);
if (strcmp(tested_host, confirmed_host) == 0) { // Host already registered
unique = 0;
break;
}
}
if (unique) { // First rank seen on this host
unique_hosts[unique_count] = i;
unique_count++;
}
}
mall->num_nodes = unique_count;
mall->num_cpus = GetCPUCount();
mall->nodelist_len = unique_count*max_name_len;
mall->nodelist = (char *) malloc(mall->nodelist_len * sizeof(char));
strcpy(mall->nodelist, ""); //FIXME Strcat can be very inefficient...
for (i = 0; i < unique_count; i++) {
confirmed_host = all_hosts + (unique_hosts[i] * max_name_len);
strcat(mall->nodelist, confirmed_host);
if (i < unique_count - 1) {
strcat(mall->nodelist, ",");
}
}
free(all_hosts);
free(unique_hosts);
}
free(my_host);
return 0;
}
/*
* @brief Get the total number of CPUs available to the process.
*
* This function uses sched_getaffinity to obtain the CPU affinity of the current process
* and counts the number of CPUs in the affinity set. It adjusts the loop based on the
* maximum number of CPUs allowed on the system.
*
* @return The total number of CPUs available to the process.
*
* Code obtained from: https://stackoverflow.com/questions/4586405/how-to-get-the-number-of-cpus-in-linux-using-c
* The code has been slightly modified.
*/
int GetCPUCount() {
cpu_set_t cs;
CPU_ZERO(&cs);
sched_getaffinity(0, sizeof(cs), &cs);
int count = 0;
int max_cpus = sysconf(_SC_NPROCESSORS_ONLN);
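// NOTE: the early break below assumes the affinity mask is a contiguous
// prefix (CPUs 0..count-1); a mask with gaps would be undercounted.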
for (int i = 0; i < max_cpus; i++) {
if (CPU_ISSET(i, &cs)) {
count++;
} else {
break;
}
}
return count;
}
#if USE_MAL_SLURM
/*
* TODO
*/
int MAM_I_slurm_getenv_hosts_info() {
char *tmp = NULL, *tmp_copy, *token;
int cpus, count;
//int i, *cpus_counts, *nodes_counts, *aux;
tmp = getenv("SLURM_JOB_NUM_NODES");
if(tmp == NULL) return 1;
mall->num_nodes = atoi(tmp);
tmp = NULL;
tmp = getenv("SLURM_JOB_NODELIST");
if(tmp == NULL) return 1;
mall->nodelist_len = strlen(tmp)+1;
mall->nodelist = (char *) malloc(mall->nodelist_len * sizeof(char));
strcpy(mall->nodelist, tmp);
tmp = NULL;
tmp = getenv("SLURM_JOB_CPUS_PER_NODE");
if(tmp == NULL) return 1;
tmp_copy = (char *) malloc((strlen(tmp)+1) * sizeof(char));
strcpy(tmp_copy, tmp);
token = strtok(tmp_copy, ",");
//TODO When MaM considers heterogeneous allocations, these will be needed instead of num_cpus.
//cpus_counts = (int *) malloc(mall->num_nodes * sizeof(int));
//nodes_counts = (int *) malloc(mall->num_nodes * sizeof(int));
//i = 0;
mall->num_cpus = 0;
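// SLURM_JOB_CPUS_PER_NODE looks like "20(x2),10": two nodes with 20 CPUs
// followed by one node with 10; the "(xN)" repeat count may be absent.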
while (token != NULL) {
count = 1; // The count is not present when there is only 1 node.
if (sscanf(token, "%d(x%d)", &cpus, &count) >= 1) {
mall->num_cpus = cpus; // num_cpus stores the amount of cores per cpu
//cpus_per_node[i] = cpus;
//nodes_count[i] = count;
//i++;
}
token = strtok(NULL, ",");
}
/*
if(i < mall->num_nodes) {
aux = (int *) realloc(cpus_per_node, i * sizeof(int));
if(cpus_per_node != aux && cpus_per_node != NULL) free(cpus_per_node);
cpus_per_node = aux;
aux = (int *) realloc(nodes_counts, i * sizeof(int));
if(nodes_count != aux && nodes_count != NULL) free(nodes_count);
nodes_count = aux;
}
*/
free(tmp_copy);
return 0;
}
/*
* TODO
* FIXME Does not consider heterogeneous machines
*/
int MAM_I_slurm_getjob_hosts_info() {
int jobId, err;
char *tmp = NULL;
job_info_msg_t *j_info;
slurm_job_info_t last_record;
tmp = getenv("SLURM_JOB_ID");
if(tmp == NULL) return 1;
jobId = atoi(tmp);
err = slurm_load_job(&j_info, jobId, 1);
if(err) return err;
last_record = j_info->job_array[j_info->record_count - 1];
mall->num_nodes = last_record.num_nodes;
mall->num_cpus = last_record.num_cpus;
mall->nodelist_len = strlen(last_record.nodes)+1;
mall->nodelist = (char *) malloc(mall->nodelist_len * sizeof(char));
strcpy(mall->nodelist, last_record.nodes);
slurm_free_job_info_msg(j_info);
return 0;
}
#endif
//TODO Refactor for when it communicates with the RMS
// Get Slurm job info
//int jobId;
//char *tmp;
//job_info_msg_t *j_info;
//slurm_job_info_t last_record;
//tmp = getenv("SLURM_JOB_ID");
//jobId = atoi(tmp);
//slurm_load_job(&j_info, jobId, 1);
//last_record = j_info->job_array[j_info->record_count - 1];
// Free JOB INFO
//slurm_free_job_info_msg(j_info);
#ifndef MALLEABILITY_RMS_H
#define MALLEABILITY_RMS_H
void MAM_check_hosts();
#endif
@@ -6,32 +6,46 @@
//States
#define MALL_DENIED -1
#define MAM_OK 0
enum mall_inner_states{MALL_UNRESERVED, MALL_NOT_STARTED, MALL_RMS_COMPLETED, MALL_SPAWN_PENDING, MALL_SPAWN_SINGLE_PENDING,
MALL_SPAWN_SINGLE_COMPLETED, MALL_SPAWN_ADAPT_POSTPONE, MALL_SPAWN_COMPLETED, MALL_DIST_PENDING, MALL_DIST_COMPLETED,
MALL_SPAWN_ADAPT_PENDING, MALL_USER_PENDING, MALL_USER_COMPLETED, MALL_SPAWN_ADAPTED, MALL_COMPLETED};
enum mam_states{MAM_UNRESERVED, MAM_NOT_STARTED, MAM_PENDING, MAM_USER_PENDING, MAM_COMPLETED};
enum mam_proc_states{MAM_PROC_CONTINUE, MAM_PROC_NEW_RANK, MAM_PROC_ZOMBIE};
enum mall_spawn_methods{MALL_SPAWN_BASELINE, MALL_SPAWN_MERGE, MAM_METHODS_SPAWN_LEN};
enum mam_spawn_strategies{MAM_STRAT_SPAWN_CLEAR, MAM_STRAT_SPAWN_PTHREAD, MAM_STRAT_SPAWN_SINGLE, MAM_STRAT_SPAWN_INTERCOMM, MAM_STRATS_SPAWN_LEN};
enum mam_phy_dist_methods{MALL_DIST_SPREAD = 1, MALL_DIST_COMPACT, MAM_METHODS_PHYSICAL_DISTRIBUTION_LEN}; //FIXME Cambiar nombres a PHY_DIST?
enum mam_phy_info_methods{MALL_DIST_STRING = 1, MALL_DIST_HOSTFILE}; //FIXME Cambiar nombres a PHY_DIST?
enum mall_redistribution_methods{MALL_RED_BASELINE, MALL_RED_POINT, MALL_RED_RMA_LOCK, MALL_RED_RMA_LOCKALL, MAM_METHODS_RED_LEN};
enum mam_red_strategies{MAM_STRAT_RED_CLEAR, MAM_STRAT_RED_PTHREAD, MAM_STRAT_RED_WAIT_SOURCES, MAM_STRAT_RED_WAIT_TARGETS, MAM_STRATS_RED_LEN};
/* KEYS & VALUES for config*/
enum mam_key_values{MAM_SPAWN_METHOD=0, MAM_SPAWN_STRATEGIES, MAM_PHYSICAL_DISTRIBUTION, MAM_RED_METHOD, MAM_RED_STRATEGIES, MAM_NUM_TARGETS, MAM_KEY_COUNT};
#define MAM_SPAWN_METHOD_ENV "MAM_SPAWN_METHOD"
#define MAM_SPAWN_STRATS_ENV "MAM_SPAWN_STRATS"
#define MAM_PHYSICAL_DISTRIBUTION_METHOD_ENV "MAM_PHYSICAL_DISTRIBUTION_METHOD"
#define MAM_RED_METHOD_ENV "MAM_RED_METHOD"
#define MAM_RED_STRATS_ENV "MAM_RED_STRATS"
#define MAM_NUM_TARGETS_ENV "MAM_NUM_TARGETS"
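/* Illustrative shell usage of the configuration keys above (assumed to be
* read from the environment, as the *_ENV names suggest), mapping to the
* enum values, e.g.:
*   export MAM_SPAWN_METHOD=1   # MALL_SPAWN_MERGE
*   export MAM_NUM_TARGETS=40
*/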
#define MALLEABILITY_ROOT 0
#define MAL_APP_EXECUTING 0
#define MAL_APP_ENDED 1
////////////////
#define MAM_CHECK_COMPLETION 0
#define MAM_WAIT_COMPLETION 1
#define MALLEABILITY_CHILDREN 1
#define MALLEABILITY_NOT_CHILDREN 0
#define MAM_DATA_DISTRIBUTED 0
#define MAM_DATA_REPLICATED 1
#define MAM_DATA_VARIABLE 0
#define MAM_DATA_CONSTANT 1
#endif
#include "malleabilityTimes.h"
#include "malleabilityDataStructures.h"
void def_malleability_times(MPI_Datatype *new_type);
void init_malleability_times() {
#if USE_MAL_DEBUG
DEBUG_FUNC("Initializing recording structure", mall->myId, mall->numP); fflush(stdout); MPI_Barrier(mall->comm);
#endif
mall_conf->times = (malleability_times_t *) malloc(sizeof(malleability_times_t));
if(mall_conf->times == NULL) {
perror("Error al crear la estructura de tiempos interna para maleabilidad\n");
MPI_Abort(MPI_COMM_WORLD, -5);
}
reset_malleability_times();
def_malleability_times(&mall_conf->times->times_type);
#if USE_MAL_DEBUG
DEBUG_FUNC("Initialized recording structure", mall->myId, mall->numP); fflush(stdout); MPI_Barrier(mall->comm);
#endif
}
void reset_malleability_times() {
malleability_times_t *times = mall_conf->times;
times->spawn_start = 0; times->sync_start = 0; times->async_start = 0; times->malleability_start = 0;
times->sync_end = 0; times->async_end = 0; times->malleability_end = 0;
times->spawn_time = 0;
}
void free_malleability_times() {
#if USE_MAL_DEBUG
DEBUG_FUNC("Freeing recording structure", mall->myId, mall->numP); fflush(stdout);
#endif
if(mall_conf->times != NULL) {
if(mall_conf->times->times_type != MPI_DATATYPE_NULL) {
MPI_Type_free(&mall_conf->times->times_type);
mall_conf->times->times_type = MPI_DATATYPE_NULL;
}
free(mall_conf->times);
}
#if USE_MAL_DEBUG
DEBUG_FUNC("Freed recording structure", mall->myId, mall->numP); fflush(stdout);
#endif
}
void malleability_times_broadcast(int root) {
MPI_Bcast(mall_conf->times, 1, mall_conf->times->times_type, root, mall->intercomm);
}
void MAM_I_retrieve_times(double *sp_time, double *sy_time, double *asy_time, double *mall_time) {
malleability_times_t *times = mall_conf->times;
*sp_time = times->spawn_time;
*sy_time = times->sync_end - times->sync_start;
*asy_time = times->async_end - times->async_start;
*mall_time = times->malleability_end - times->malleability_start;
}
void def_malleability_times(MPI_Datatype *new_type) {
int i, counts = 4;
int blocklengths[counts];
MPI_Aint displs[counts], dir;
MPI_Datatype types[counts];
blocklengths[0] = blocklengths[1] = blocklengths[2] = blocklengths[3] = 1;
types[0] = types[1] = types[2] = types[3] = MPI_DOUBLE;
// The struct is passed via the address of "mall_conf"
// Fill in the displs vector
MPI_Get_address(mall_conf->times, &dir);
// Get the base address
MPI_Get_address(&(mall_conf->times->spawn_time), &displs[0]);
MPI_Get_address(&(mall_conf->times->sync_start), &displs[1]);
MPI_Get_address(&(mall_conf->times->async_start), &displs[2]);
MPI_Get_address(&(mall_conf->times->malleability_start), &displs[3]);
for(i=0;i<counts;i++) displs[i] -= dir;
MPI_Type_create_struct(counts, blocklengths, displs, types, new_type);
MPI_Type_commit(new_type);
}
#ifndef MALLEABILITY_TIMES_H
#define MALLEABILITY_TIMES_H
#include <mpi.h>
void init_malleability_times();
void reset_malleability_times();
void free_malleability_times();
void malleability_times_broadcast(int root);
void MAM_I_retrieve_times(double *sp_time, double *sy_time, double *asy_time, double *mall_time);
#endif