Commit 4f344a7b authored by iker_martin
Browse files

Arreglado error de memoria en spawn background. Sigue sin funcionar si hay...

Arreglado error de memoria en spawn background. Sigue sin funcionar si hay comunicación en primer plano.
parent c437c212
...@@ -169,7 +169,6 @@ int checkpoint(int iter, int state, MPI_Request **comm_req) { ...@@ -169,7 +169,6 @@ int checkpoint(int iter, int state, MPI_Request **comm_req) {
group->numS = config_file->procs[group->grp +1]; group->numS = config_file->procs[group->grp +1];
int comm_type = COMM_SPAWN_PTHREAD; // TODO Pasar a CONFIG int comm_type = COMM_SPAWN_PTHREAD; // TODO Pasar a CONFIG
if(group->myId == ROOT) { printf("Malleability\n");}
state = TC(group->numS, comm_type); state = TC(group->numS, comm_type);
if (state == COMM_FINISHED){ if (state == COMM_FINISHED){
......
...@@ -22,11 +22,11 @@ struct Slurm_data { ...@@ -22,11 +22,11 @@ struct Slurm_data {
int type_creation; int type_creation;
}; };
struct Creation_data { typedef struct {
char **argv; char **argv;
int numP_childs, myId, root, type_dist; int numP_childs, myId, root, type_dist;
MPI_Comm comm; MPI_Comm comm;
}; }Creation_data;
//--------------PRIVATE SPAWN TYPE DECLARATIONS---------------// //--------------PRIVATE SPAWN TYPE DECLARATIONS---------------//
...@@ -76,6 +76,9 @@ int init_slurm_comm(char **argv, int myId, int numP, int root, int type_dist, in ...@@ -76,6 +76,9 @@ int init_slurm_comm(char **argv, int myId, int numP, int root, int type_dist, in
if(myId == root) { if(myId == root) {
processes_dist(argv, numP, type_dist); processes_dist(argv, numP, type_dist);
} else {
slurm_data->cmd = malloc(1 * sizeof(char));
slurm_data->info = MPI_INFO_NULL;
} }
create_processes(myId, root, child, comm); create_processes(myId, root, child, comm);
free(slurm_data); free(slurm_data);
...@@ -84,7 +87,7 @@ int init_slurm_comm(char **argv, int myId, int numP, int root, int type_dist, in ...@@ -84,7 +87,7 @@ int init_slurm_comm(char **argv, int myId, int numP, int root, int type_dist, in
} else if(type_creation == COMM_SPAWN_PTHREAD) { } else if(type_creation == COMM_SPAWN_PTHREAD) {
commSlurm = COMM_IN_PROGRESS; commSlurm = COMM_IN_PROGRESS;
struct Creation_data *creation_data = malloc(sizeof(struct Creation_Data*)); Creation_data *creation_data = (Creation_data *) malloc(sizeof(Creation_data));
creation_data->argv = argv; creation_data->argv = argv;
creation_data->numP_childs = numP; creation_data->numP_childs = numP;
creation_data->myId = myId; creation_data->myId = myId;
...@@ -92,7 +95,7 @@ int init_slurm_comm(char **argv, int myId, int numP, int root, int type_dist, in ...@@ -92,7 +95,7 @@ int init_slurm_comm(char **argv, int myId, int numP, int root, int type_dist, in
creation_data->type_dist = type_dist; creation_data->type_dist = type_dist;
creation_data->comm = comm; creation_data->comm = comm;
if(pthread_create(&slurm_thread, NULL, thread_work, creation_data)) { if(pthread_create(&slurm_thread, NULL, thread_work, (void *)creation_data)) {
printf("Error al crear el hilo de contacto con SLURM\n"); printf("Error al crear el hilo de contacto con SLURM\n");
MPI_Abort(MPI_COMM_WORLD, -1); MPI_Abort(MPI_COMM_WORLD, -1);
return -1; return -1;
...@@ -107,7 +110,7 @@ int init_slurm_comm(char **argv, int myId, int numP, int root, int type_dist, in ...@@ -107,7 +110,7 @@ int init_slurm_comm(char **argv, int myId, int numP, int root, int type_dist, in
* y en caso de que lo este, se devuelve el communicador a estos nuevos procesos. * y en caso de que lo este, se devuelve el communicador a estos nuevos procesos.
*/ */
int check_slurm_comm(int myId, int root, MPI_Comm *child) { int check_slurm_comm(int myId, int root, MPI_Comm *child) {
int state; int state=-10;
if(slurm_data->type_creation == COMM_SPAWN_PTHREAD) { if(slurm_data->type_creation == COMM_SPAWN_PTHREAD) {
MPI_Allreduce(&commSlurm, &state, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD); MPI_Allreduce(&commSlurm, &state, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
...@@ -123,9 +126,12 @@ int check_slurm_comm(int myId, int root, MPI_Comm *child) { ...@@ -123,9 +126,12 @@ int check_slurm_comm(int myId, int root, MPI_Comm *child) {
return -10; return -10;
} }
commSlurm = COMM_FINISHED;
*child = *returned_comm; *child = *returned_comm;
if(myId == root) {
MPI_Info_free(&(slurm_data->info));
}
free(slurm_data->cmd);
free(slurm_data); free(slurm_data);
return commSlurm; return commSlurm;
...@@ -141,18 +147,20 @@ int check_slurm_comm(int myId, int root, MPI_Comm *child) { ...@@ -141,18 +147,20 @@ int check_slurm_comm(int myId, int root, MPI_Comm *child) {
* se avisa al hilo maestro. * se avisa al hilo maestro.
*/ */
void* thread_work(void* creation_data_arg) { void* thread_work(void* creation_data_arg) {
struct Creation_data *creation_data = (struct Creation_data*) creation_data_arg; Creation_data *creation_data = (Creation_data*) creation_data_arg;
returned_comm = (MPI_Comm *) malloc(sizeof(MPI_Comm)); returned_comm = (MPI_Comm *) malloc(sizeof(MPI_Comm));
if(creation_data->myId == creation_data->root) { if(creation_data->myId == creation_data->root) {
//if(creation_data->myId == creation_data->root) { printf("WORKD SPAWN 1\n");} fflush(stdout);
processes_dist(creation_data->argv, creation_data->numP_childs, creation_data->type_dist); processes_dist(creation_data->argv, creation_data->numP_childs, creation_data->type_dist);
} else {
slurm_data->cmd = malloc(1 * sizeof(char));
slurm_data->info = MPI_INFO_NULL;
} }
create_processes(creation_data->myId, creation_data->root, returned_comm, creation_data->comm); create_processes(creation_data->myId, creation_data->root, returned_comm, creation_data->comm);
commSlurm = COMM_FINISHED; commSlurm = COMM_FINISHED;
//free(creation_data); //FIXME No se libera bien free(creation_data);
pthread_exit(NULL); pthread_exit(NULL);
} }
...@@ -215,18 +223,12 @@ void processes_dist(char *argv[], int numP_childs, int type) { ...@@ -215,18 +223,12 @@ void processes_dist(char *argv[], int numP_childs, int type) {
* "processes_dist()". * "processes_dist()".
*/ */
int create_processes(int myId, int root, MPI_Comm *child, MPI_Comm comm) { int create_processes(int myId, int root, MPI_Comm *child, MPI_Comm comm) {
//if(myId == root) { printf("WORKD SPAWN 2.1 cmd=%s pr=%d\n", slurm_data->cmd, slurm_data->qty_procs);} fflush(stdout);
int spawn_err = MPI_Comm_spawn(slurm_data->cmd, MPI_ARGV_NULL, slurm_data->qty_procs, slurm_data->info, root, comm, child, MPI_ERRCODES_IGNORE); int spawn_err = MPI_Comm_spawn(slurm_data->cmd, MPI_ARGV_NULL, slurm_data->qty_procs, slurm_data->info, root, comm, child, MPI_ERRCODES_IGNORE);
if(spawn_err != MPI_SUCCESS) { if(spawn_err != MPI_SUCCESS) {
printf("Error creating new set of %d procs.\n", slurm_data->qty_procs); printf("Error creating new set of %d procs.\n", slurm_data->qty_procs);
} }
if(myId == root) {
MPI_Info_free(&(slurm_data->info));
free(slurm_data->cmd);
}
return spawn_err; return spawn_err;
} }
...@@ -276,7 +278,7 @@ void node_dist(slurm_job_info_t job_record, int type, int total_procs, int **qty ...@@ -276,7 +278,7 @@ void node_dist(slurm_job_info_t job_record, int type, int total_procs, int **qty
procs[i] += total_procs - asigCores; procs[i] += total_procs - asigCores;
(*used_nodes)++; (*used_nodes)++;
} }
if(*used_nodes > job_record.num_nodes) *used_nodes = job_record.num_nodes; if(*used_nodes > job_record.num_nodes) *used_nodes = job_record.num_nodes; //FIXME Si ocurre esto no es un error?
} }
*qty = calloc(*used_nodes, sizeof(int)); // Numero de procesos por nodo *qty = calloc(*used_nodes, sizeof(int)); // Numero de procesos por nodo
......
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment