Commit 32a4ca93 authored by Iker Martín Álvarez

Merge branch 'malleability-refactor' into 'dev'

Malleability focus refactor of Proteo

See merge request martini/malleability_benchmark!5
parents f1511cb4 06573694
#ifndef MAM_DATA_STRUCTURES_H
#define MAM_DATA_STRUCTURES_H
/*
* Shows the data structures available for internal use.
*/
#include <stdlib.h>
#include <stdio.h>
#include <mpi.h>
#include <pthread.h>
#include "MAM_Constants.h"
#define DEBUG_FUNC(debug_string, rank, numP) printf("MaM [P%d/%d]: %s -- %s:%s:%d\n", rank, numP, debug_string, __FILE__, __func__, __LINE__)
/* --- MAM INNER CONSTANTS --- */
#define MAM_ROOT 0
enum mam_inner_states{MAM_I_UNRESERVED, MAM_I_NOT_STARTED, MAM_I_RMS_COMPLETED, MAM_I_SPAWN_PENDING, MAM_I_SPAWN_SINGLE_PENDING,
MAM_I_SPAWN_SINGLE_COMPLETED, MAM_I_SPAWN_ADAPT_POSTPONE, MAM_I_SPAWN_COMPLETED, MAM_I_DIST_PENDING, MAM_I_DIST_COMPLETED,
MAM_I_SPAWN_ADAPT_PENDING, MAM_I_USER_START, MAM_I_USER_PENDING, MAM_I_USER_COMPLETED, MAM_I_SPAWN_ADAPTED, MAM_I_COMPLETED};
#define MAM_USE_VALGRIND 1
#define MAM_USE_EXTRAE 2
#define MAM_VALGRIND_SCRIPT "./worker_valgrind.sh"
#define MAM_EXTRAE_SCRIPT "./worker_extrae.sh"
/* --- TIME CAPTURE STRUCTURE --- */
typedef struct {
// Spawn, Sync and Async time
double spawn_start, spawn_time;
double sync_start, sync_end;
double async_start, async_end;
double user_start, user_end;
double malleability_start, malleability_end;
MPI_Datatype times_type;
} malleability_times_t;
/* --- GLOBAL STRUCTURES --- */
typedef struct {
unsigned int spawn_method;
unsigned int spawn_dist;
unsigned int spawn_strategies;
unsigned int red_method;
unsigned int red_strategies;
int external_usage; // Whether Spawn should launch a different application, and which one
malleability_times_t *times;
} malleability_config_t;
typedef struct {
int myId, numP, numC, zombie;
int root, root_collectives;
int num_parents, root_parents;
pthread_t async_thread;
MPI_Comm comm, thread_comm, original_comm;
MPI_Comm intercomm, tmp_comm;
MPI_Comm *user_comm;
MPI_Datatype struct_type;
// Specific vars for the Wait_targets strategy
int wait_targets_posted;
MPI_Request wait_targets;
char *name_exec, *nodelist;
int num_cpus, num_nodes, nodelist_len;
int internode_group;
} malleability_t;
/* --- VARIABLES --- */
extern malleability_config_t *mall_conf;
extern malleability_t *mall;
extern int state;
/* --- FUNCTIONS --- */
void MAM_Def_main_datatype();
void MAM_Free_main_datatype();
void MAM_Comm_main_structures(MPI_Comm comm, int rootBcast);
void MAM_print_comms_state();
void MAM_comms_update(MPI_Comm comm);
#endif
#ifndef MAM_INIT_CONFIGURATION_H
#define MAM_INIT_CONFIGURATION_H
#include <mpi.h>
#include "MAM_Constants.h"
void MAM_Init_configuration();
void MAM_Set_initial_configuration();
void MAM_Check_configuration();
#endif
#include <pthread.h>
#include <string.h>
#include "MAM.h"
#include "MAM_Constants.h"
#include "MAM_DataStructures.h"
#include "MAM_Types.h"
#include "MAM_Zombies.h"
#include "MAM_Times.h"
#include "MAM_RMS.h"
#include "MAM_Init_Configuration.h"
#include "spawn_methods/GenericSpawn.h"
#include "distribution_methods/Distributed_CommDist.h"
#define MAM_USE_SYNCHRONOUS 0
#define MAM_USE_ASYNCHRONOUS 1
void MAM_Commit(int *mam_state);
void send_data(int numP_children, malleability_data_t *data_struct, int is_asynchronous);
void recv_data(int numP_parents, malleability_data_t *data_struct, int is_asynchronous);
int MAM_St_rms(int *mam_state);
int MAM_St_spawn_start();
int MAM_St_spawn_pending(int wait_completed);
int MAM_St_red_start();
int MAM_St_red_pending(int wait_completed);
int MAM_St_user_start(int *mam_state);
int MAM_St_user_pending(int *mam_state, int wait_completed, void (*user_function)(void *), void *user_args);
int MAM_St_user_completed();
int MAM_St_spawn_adapt_pending(int wait_completed);
int MAM_St_spawn_adapted(int *mam_state);
int MAM_St_red_completed(int *mam_state);
int MAM_St_completed(int *mam_state);
void Children_init(void (*user_function)(void *), void *user_args);
int spawn_step();
int start_redistribution();
int check_redistribution(int wait_completed);
int end_redistribution();
int shrink_redistribution();
int thread_creation();
int thread_check(int wait_completed);
void* thread_async_work();
int MAM_I_convert_key(char *key);
void MAM_I_create_user_struct(int is_children_group);
malleability_data_t *rep_s_data;
malleability_data_t *dist_s_data;
malleability_data_t *rep_a_data;
malleability_data_t *dist_a_data;
mam_user_reconf_t *user_reconf;
/*
* Initializes the memory reservation for the malleability module,
* creating all the required structures and copies of communicators
* so as not to interfere with the application.
*
* If it is called by a group of dynamically created processes, they
* initialize the communication with their parents. In that case, once
* the communication finishes, the child processes are ready to run
* the application.
*/
int MAM_Init(int root, MPI_Comm *comm, char *name_exec, void (*user_function)(void *), void *user_args) {
MPI_Comm dup_comm, thread_comm, original_comm;
mall_conf = (malleability_config_t *) malloc(sizeof(malleability_config_t));
mall = (malleability_t *) malloc(sizeof(malleability_t));
user_reconf = (mam_user_reconf_t *) malloc(sizeof(mam_user_reconf_t));
MPI_Comm_rank(*comm, &(mall->myId));
MPI_Comm_size(*comm, &(mall->numP));
#if MAM_DEBUG
DEBUG_FUNC("Initializing MaM", mall->myId, mall->numP); fflush(stdout); MPI_Barrier(*comm);
#endif
rep_s_data = (malleability_data_t *) malloc(sizeof(malleability_data_t));
dist_s_data = (malleability_data_t *) malloc(sizeof(malleability_data_t));
rep_a_data = (malleability_data_t *) malloc(sizeof(malleability_data_t));
dist_a_data = (malleability_data_t *) malloc(sizeof(malleability_data_t));
MPI_Comm_dup(*comm, &dup_comm);
MPI_Comm_dup(*comm, &thread_comm);
MPI_Comm_dup(*comm, &original_comm);
MPI_Comm_set_name(dup_comm, "MAM_MAIN");
MPI_Comm_set_name(thread_comm, "MAM_THREAD");
MPI_Comm_set_name(original_comm, "MAM_ORIGINAL");
mall->root = root;
mall->root_parents = root;
mall->zombie = 0;
mall->comm = dup_comm;
mall->thread_comm = thread_comm;
mall->original_comm = original_comm;
mall->user_comm = comm;
mall->tmp_comm = MPI_COMM_NULL;
mall->name_exec = name_exec;
mall->nodelist = NULL;
mall->nodelist_len = 0;
rep_s_data->entries = 0;
rep_a_data->entries = 0;
dist_s_data->entries = 0;
dist_a_data->entries = 0;
state = MAM_I_NOT_STARTED;
MAM_Init_configuration();
MAM_Zombies_service_init();
init_malleability_times();
MAM_Def_main_datatype();
// If this group was spawned dynamically, obtain the data from the parents
MPI_Comm_get_parent(&(mall->intercomm));
if(mall->intercomm != MPI_COMM_NULL) {
Children_init(user_function, user_args);
return MAM_TARGETS;
}
//TODO Check potential improvement - If check_hosts does not use slurm, internode_group could be obtained there
MAM_check_hosts();
mall->internode_group = MAM_Is_internode_group();
MAM_Set_initial_configuration();
#if MAM_USE_BARRIERS && MAM_DEBUG
if(mall->myId == mall->root)
printf("MaM: Using barriers to record times.\n");
#endif
#if MAM_DEBUG
DEBUG_FUNC("MaM has been initialized correctly as parents", mall->myId, mall->numP); fflush(stdout); MPI_Barrier(*comm);
#endif
return MAM_SOURCES;
}
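/*
* Illustrative usage sketch (editor's addition, not part of MaM): how an
* application could initialize the module right after MPI_Init, inside main.
* "my_reconf_fn" and "reconf_data" are hypothetical user symbols.
*
*   MPI_Comm comm = MPI_COMM_WORLD;
*   int role = MAM_Init(0, &comm, argv[0], my_reconf_fn, &reconf_data);
*   if(role == MAM_TARGETS) {
*     // Spawned processes arrive here with their data already redistributed
*   }
*/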
/*
* Frees all the memory reserved by the malleability
* module and ensures that zombie processes, if any,
* are woken up.
*/
int MAM_Finalize() {
int request_abort;
free_malleability_data_struct(rep_s_data);
free_malleability_data_struct(rep_a_data);
free_malleability_data_struct(dist_s_data);
free_malleability_data_struct(dist_a_data);
free(rep_s_data);
free(rep_a_data);
free(dist_s_data);
free(dist_a_data);
if(mall->nodelist != NULL) free(mall->nodelist);
MAM_Free_main_datatype();
request_abort = MAM_Zombies_service_free();
free_malleability_times();
if(mall->comm != MPI_COMM_WORLD && mall->comm != MPI_COMM_NULL) MPI_Comm_disconnect(&(mall->comm));
if(mall->thread_comm != MPI_COMM_WORLD && mall->thread_comm != MPI_COMM_NULL) MPI_Comm_disconnect(&(mall->thread_comm));
if(mall->intercomm != MPI_COMM_WORLD && mall->intercomm != MPI_COMM_NULL) { MPI_Comm_disconnect(&(mall->intercomm)); } //FIXME Error en OpenMPI + Merge
if(mall->original_comm != MPI_COMM_WORLD && mall->original_comm != MPI_COMM_NULL) MPI_Comm_free(&(mall->original_comm));
free(mall);
free(mall_conf);
free(user_reconf);
state = MAM_I_UNRESERVED;
return request_abort;
}
/*
* TODO Rewrite
* Checks the state of the malleability and tries to advance it
* if possible. It works as a state machine.
* It returns the specific malleability state and sets the argument
* "mam_state" to a generic one.
*
* The "wait_completed" argument is used to wait for the completion of
* the tasks carried out by MAM.
*/
int MAM_Checkpoint(int *mam_state, int wait_completed, void (*user_function)(void *), void *user_args) {
int call_checkpoint = 0;
//TODO This could be changed to an array with the functions to call in each case
switch(state) {
case MAM_I_UNRESERVED:
*mam_state = MAM_UNRESERVED;
break;
case MAM_I_NOT_STARTED:
call_checkpoint = MAM_St_rms(mam_state);
break;
case MAM_I_RMS_COMPLETED:
call_checkpoint = MAM_St_spawn_start();
break;
case MAM_I_SPAWN_PENDING: // Check whether the spawn has finished
case MAM_I_SPAWN_SINGLE_PENDING:
call_checkpoint = MAM_St_spawn_pending(wait_completed);
break;
case MAM_I_SPAWN_ADAPT_POSTPONE:
case MAM_I_SPAWN_COMPLETED:
call_checkpoint = MAM_St_red_start();
break;
case MAM_I_DIST_PENDING:
call_checkpoint = MAM_St_red_pending(wait_completed);
break;
case MAM_I_USER_START:
call_checkpoint = MAM_St_user_start(mam_state);
break;
case MAM_I_USER_PENDING:
call_checkpoint = MAM_St_user_pending(mam_state, wait_completed, user_function, user_args);
break;
case MAM_I_USER_COMPLETED:
call_checkpoint = MAM_St_user_completed();
break;
case MAM_I_SPAWN_ADAPT_PENDING:
call_checkpoint = MAM_St_spawn_adapt_pending(wait_completed);
break;
case MAM_I_SPAWN_ADAPTED:
case MAM_I_DIST_COMPLETED:
call_checkpoint = MAM_St_completed(mam_state);
break;
}
if(call_checkpoint) { MAM_Checkpoint(mam_state, wait_completed, user_function, user_args); }
if(state > MAM_I_NOT_STARTED && state < MAM_I_COMPLETED) *mam_state = MAM_PENDING;
return state;
}
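/*
* Illustrative usage sketch (editor's addition): driving the state machine
* from an iterative application loop, assuming a zero "wait_completed" value
* performs non-blocking progress checks. "compute_iteration" is a hypothetical
* user function.
*
*   int mam_state = MAM_UNRESERVED;
*   MAM_Checkpoint(&mam_state, 0, my_reconf_fn, &reconf_data);
*   while(mam_state == MAM_PENDING) {
*     compute_iteration();
*     MAM_Checkpoint(&mam_state, 0, my_reconf_fn, &reconf_data);
*   }
*/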
/*
* TODO Marks the user redistribution as completed so that the next MAM_Checkpoint call resumes the reconfiguration.
*/
void MAM_Resume_redistribution(int *mam_state) {
state = MAM_I_USER_COMPLETED;
if(mam_state != NULL) *mam_state = MAM_PENDING;
}
/*
* TODO Commits the reconfiguration: broadcasts times, frees temporary communicators, handles zombies and resets the module state.
*/
void MAM_Commit(int *mam_state) {
int request_abort;
#if MAM_DEBUG
if(mall->myId == mall->root){ DEBUG_FUNC("Trying to commit", mall->myId, mall->numP); } fflush(stdout);
#endif
// Get times before committing
if(mall_conf->spawn_method == MAM_SPAWN_BASELINE) {
// This communication is only needed when the root process will become a zombie
malleability_times_broadcast(mall->root_collectives);
}
// Free unneeded communicators
if(mall->tmp_comm != MPI_COMM_WORLD && mall->tmp_comm != MPI_COMM_NULL) MPI_Comm_disconnect(&(mall->tmp_comm));
if(*(mall->user_comm) != MPI_COMM_WORLD && *(mall->user_comm) != MPI_COMM_NULL) MPI_Comm_disconnect(mall->user_comm);
// Zombies Treatment
MAM_Zombies_update();
if(mall->zombie) {
#if MAM_DEBUG >= 1
DEBUG_FUNC("Is terminating as zombie", mall->myId, mall->numP); fflush(stdout);
#endif
request_abort = MAM_Finalize();
if(request_abort) { MPI_Abort(MPI_COMM_WORLD, -101); }
MPI_Finalize();
exit(0);
}
// Reset/Free communicators
if(mall_conf->spawn_method == MAM_SPAWN_MERGE) { MAM_comms_update(mall->intercomm); }
if(mall->intercomm != MPI_COMM_NULL && mall->intercomm != MPI_COMM_WORLD) { MPI_Comm_disconnect(&(mall->intercomm)); } //FIXME Error en OpenMPI + Merge
MPI_Comm_rank(mall->comm, &mall->myId);
MPI_Comm_size(mall->comm, &mall->numP);
mall->root = mall_conf->spawn_method == MAM_SPAWN_BASELINE ? mall->root : mall->root_parents;
mall->root_parents = mall->root;
state = MAM_I_NOT_STARTED;
if(mam_state != NULL) *mam_state = MAM_COMPLETED;
// Set new communicator
MPI_Comm_dup(mall->comm, mall->user_comm);
#if MAM_DEBUG
if(mall->myId == mall->root) DEBUG_FUNC("Reconfiguration has been commited", mall->myId, mall->numP); fflush(stdout);
#endif
#if MAM_USE_BARRIERS
MPI_Barrier(mall->comm);
#endif
mall_conf->times->malleability_end = MPI_Wtime();
}
/*
* This function adds data to a data structure based on whether the operation is synchronous or asynchronous,
* and whether the data is replicated or distributed. It takes the following parameters:
* - data: a pointer to the data to be added
* - index: a pointer to a size_t variable where the index of the added data will be stored
* - total_qty: the number of elements in data
* - type: the MPI datatype of the data
* - is_replicated: a flag indicating whether the data is replicated (MAM_DATA_REPLICATED) or not (MAM_DATA_DISTRIBUTED)
* - is_constant: a flag indicating whether the operation is asynchronous (MAM_DATA_CONSTANT) or synchronous (MAM_DATA_VARIABLE)
* Finally, it updates the index with the index of the last added data if index is not NULL.
*/
void MAM_Data_add(void *data, size_t *index, size_t total_qty, MPI_Datatype type, int is_replicated, int is_constant) {
size_t total_reqs = 0, returned_index;
if(is_constant) { //Async
if(is_replicated) {
total_reqs = 1;
add_data(data, total_qty, type, total_reqs, rep_a_data);
returned_index = rep_a_data->entries-1;
} else {
if(mall_conf->red_method == MAM_RED_BASELINE) {
total_reqs = 1;
} else if(mall_conf->red_method == MAM_RED_POINT || mall_conf->red_method == MAM_RED_RMA_LOCK || mall_conf->red_method == MAM_RED_RMA_LOCKALL) {
total_reqs = mall->numC;
}
add_data(data, total_qty, type, total_reqs, dist_a_data);
returned_index = dist_a_data->entries-1;
}
} else { //Sync
if(is_replicated) {
add_data(data, total_qty, type, total_reqs, rep_s_data);
returned_index = rep_s_data->entries-1;
} else {
add_data(data, total_qty, type, total_reqs, dist_s_data);
returned_index = dist_s_data->entries-1;
}
}
if(index != NULL) *index = returned_index;
}
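/*
* Illustrative usage sketch (editor's addition): registering a distributed
* array for asynchronous (constant) redistribution. "vec", "local_len" and
* "global_len" are hypothetical user variables.
*
*   size_t idx;
*   double *vec = (double *) malloc(local_len * sizeof(double));
*   MAM_Data_add(vec, &idx, global_len, MPI_DOUBLE, MAM_DATA_DISTRIBUTED, MAM_DATA_CONSTANT);
*/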
/*
* This function modifies a data entry in a data structure based on whether the operation is synchronous or asynchronous,
* and whether the data is replicated or distributed. It takes the following parameters:
* - data: a pointer to the new data
* - index: a value indicating which entry will be modified
* - total_qty: the number of elements in data
* - type: the MPI datatype of the data
* - is_replicated: a flag indicating whether the data is replicated (MAM_DATA_REPLICATED) or not (MAM_DATA_DISTRIBUTED)
* - is_constant: a flag indicating whether the operation is asynchronous (MAM_DATA_CONSTANT) or synchronous (MAM_DATA_VARIABLE)
*/
void MAM_Data_modify(void *data, size_t index, size_t total_qty, MPI_Datatype type, int is_replicated, int is_constant) {
size_t total_reqs = 0;
if(is_constant) {
if(is_replicated) {
total_reqs = 1;
modify_data(data, index, total_qty, type, total_reqs, rep_a_data); //FIXME total_reqs==0 ???
} else {
if(mall_conf->red_method == MAM_RED_BASELINE) {
total_reqs = 1;
} else if(mall_conf->red_method == MAM_RED_POINT || mall_conf->red_method == MAM_RED_RMA_LOCK || mall_conf->red_method == MAM_RED_RMA_LOCKALL) {
total_reqs = mall->numC;
}
modify_data(data, index, total_qty, type, total_reqs, dist_a_data);
}
} else {
if(is_replicated) {
modify_data(data, index, total_qty, type, total_reqs, rep_s_data);
} else {
modify_data(data, index, total_qty, type, total_reqs, dist_s_data);
}
}
}
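/*
* Illustrative sketch (editor's addition): updating the entry registered in
* the MAM_Data_add example above after the user reallocates the array,
* reusing the index returned by MAM_Data_add.
*
*   vec = (double *) realloc(vec, new_len * sizeof(double));
*   MAM_Data_modify(vec, idx, new_global_len, MPI_DOUBLE, MAM_DATA_DISTRIBUTED, MAM_DATA_CONSTANT);
*/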
/*
* This function returns how many data entries are available in one of the specific data structures.
* It takes the following parameters:
* - is_replicated: a flag indicating whether the structure is replicated (MAM_DATA_REPLICATED) or not (MAM_DATA_DISTRIBUTED)
* - is_constant: a flag indicating whether the operation is asynchronous (MAM_DATA_CONSTANT) or synchronous (MAM_DATA_VARIABLE)
* - entries: a pointer where the number of entries will be stored
*/
void MAM_Data_get_entries(int is_replicated, int is_constant, size_t *entries){
if(is_constant) {
if(is_replicated) {
*entries = rep_a_data->entries;
} else {
*entries = dist_a_data->entries;
}
} else {
if(is_replicated) {
*entries = rep_s_data->entries;
} else {
*entries = dist_s_data->entries;
}
}
}
/*
* This function retrieves a data entry from a data structure based on whether the operation is synchronous or asynchronous,
* and whether the data is replicated or distributed. It takes the following parameters:
* - data: a pointer where a reference to the data will be stored. The user must free it
* - index: a value indicating which entry will be retrieved
* - total_qty: a pointer where the number of elements in data across all ranks will be stored
* - type: a pointer where the MPI datatype of the data will be stored
* - is_replicated: a flag indicating whether the data is replicated (MAM_DATA_REPLICATED) or not (MAM_DATA_DISTRIBUTED)
* - is_constant: a flag indicating whether the operation is asynchronous (MAM_DATA_CONSTANT) or synchronous (MAM_DATA_VARIABLE)
*/
void MAM_Data_get_pointer(void **data, size_t index, size_t *total_qty, MPI_Datatype *type, int is_replicated, int is_constant) {
malleability_data_t *data_struct;
if(is_constant) {
if(is_replicated) {
data_struct = rep_a_data;
} else {
data_struct = dist_a_data;
}
} else {
if(is_replicated) {
data_struct = rep_s_data;
} else {
data_struct = dist_s_data;
}
}
*data = data_struct->arrays[index];
*total_qty = data_struct->qty[index];
*type = data_struct->types[index];
//get_block_dist(qty, mall->myId, mall->numP, &dist_data); //FIXME Ensure numP is correct
}
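/*
* Illustrative sketch (editor's addition): combining MAM_Data_get_entries and
* MAM_Data_get_pointer to recover every constant distributed entry after a
* reconfiguration.
*
*   size_t i, entries, qty;
*   void *ptr;
*   MPI_Datatype t;
*   MAM_Data_get_entries(MAM_DATA_DISTRIBUTED, MAM_DATA_CONSTANT, &entries);
*   for(i = 0; i < entries; i++) {
*     MAM_Data_get_pointer(&ptr, i, &qty, &t, MAM_DATA_DISTRIBUTED, MAM_DATA_CONSTANT);
*   }
*/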
/*
* @brief Returns a structure to perform data redistribution during a reconfiguration.
*
* This function is intended to be called when the state of MaM is MAM_I_USER_PENDING only.
* It is designed to provide the necessary information for the user to perform data redistribution.
*
* Parameters:
* - mam_user_reconf_t *reconf_info: A pointer to a mam_user_reconf_t structure where the function will store the required information for data redistribution.
*
* Return Value:
* - MAM_OK: If the function successfully retrieves the reconfiguration information.
* - MAM_DENIED: If the function is called when the state of MaM is not MAM_I_USER_PENDING.
*/
int MAM_Get_Reconf_Info(mam_user_reconf_t *reconf_info) {
if(state != MAM_I_USER_PENDING) return MAM_DENIED;
*reconf_info = *user_reconf;
return MAM_OK;
}
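/*
* Illustrative sketch (editor's addition): a minimal user redistribution
* callback. It queries the reconfiguration info, would move the user's own
* data over reconf.comm, and then hands control back to MaM.
*
*   void my_reconf_fn(void *args) {   // hypothetical user function
*     mam_user_reconf_t reconf;
*     if(MAM_Get_Reconf_Info(&reconf) != MAM_OK) return;
*     // ... redistribute user data between reconf.numS sources and reconf.numT targets over reconf.comm ...
*     MAM_Resume_redistribution(NULL);
*   }
*/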
//======================================================||
//================PRIVATE FUNCTIONS=====================||
//================DATA COMMUNICATION====================||
//======================================================||
//======================================================||
/*
* Generalized function to send data to the children.
* The asynchronicity refers to whether the parent and child do it
* in a blocking way or not. The parent may have several threads.
*/
void send_data(int numP_children, malleability_data_t *data_struct, int is_asynchronous) {
size_t i;
void *aux_send, *aux_recv;
if(is_asynchronous) {
for(i=0; i < data_struct->entries; i++) {
aux_send = data_struct->arrays[i];
aux_recv = NULL;
async_communication_start(aux_send, &aux_recv, data_struct->qty[i], data_struct->types[i], mall->numP, numP_children, MAM_SOURCES,
mall->intercomm, &(data_struct->requests[i]), &(data_struct->request_qty[i]), &(data_struct->windows[i]));
if(aux_recv != NULL) data_struct->arrays[i] = aux_recv;
}
} else {
for(i=0; i < data_struct->entries; i++) {
aux_send = data_struct->arrays[i];
aux_recv = NULL;
sync_communication(aux_send, &aux_recv, data_struct->qty[i], data_struct->types[i], mall->numP, numP_children, MAM_SOURCES, mall->intercomm);
if(aux_recv != NULL) data_struct->arrays[i] = aux_recv;
}
}
}
/*
* Generalized function to receive data from the parents.
* The asynchronicity refers to whether the parent and child do it
* in a blocking way or not. The parent may have several threads.
*/
void recv_data(int numP_parents, malleability_data_t *data_struct, int is_asynchronous) {
size_t i;
void *aux, *aux_s = NULL;
if(is_asynchronous) {
for(i=0; i < data_struct->entries; i++) {
aux = data_struct->arrays[i];
async_communication_start(aux_s, &aux, data_struct->qty[i], data_struct->types[i], mall->numP, numP_parents, MAM_TARGETS,
mall->intercomm, &(data_struct->requests[i]), &(data_struct->request_qty[i]), &(data_struct->windows[i]));
data_struct->arrays[i] = aux;
}
} else {
for(i=0; i < data_struct->entries; i++) {
aux = data_struct->arrays[i];
sync_communication(aux_s, &aux, data_struct->qty[i], data_struct->types[i], mall->numP, numP_parents, MAM_TARGETS, mall->intercomm);
data_struct->arrays[i] = aux;
}
}
}
//======================================================||
//================PRIVATE FUNCTIONS=====================||
//====================MAM STAGES========================||
//======================================================||
//======================================================||
//======================================================||
//======================================================||
//======================================================||
//======================================================||
int MAM_St_rms(int *mam_state) {
reset_malleability_times();
#if MAM_USE_BARRIERS
MPI_Barrier(mall->comm);
#endif
mall_conf->times->malleability_start = MPI_Wtime();
MAM_Check_configuration();
*mam_state = MAM_NOT_STARTED;
state = MAM_I_RMS_COMPLETED;
mall->wait_targets_posted = 0;
//if(CHECK_RMS()) {return MAM_DENIED;}
return 1;
}
int MAM_St_spawn_start() {
mall->num_parents = mall->numP;
state = spawn_step();
//FIXME This is necessary but ugly
if(mall_conf->spawn_method == MAM_SPAWN_MERGE && mall->myId >= mall->numC){ mall->zombie = 1; }
else if(mall_conf->spawn_method == MAM_SPAWN_BASELINE){ mall->zombie = 1; }
if (state == MAM_I_SPAWN_COMPLETED || state == MAM_I_SPAWN_ADAPT_POSTPONE){
return 1;
}
return 0;
}
int MAM_St_spawn_pending(int wait_completed) {
state = check_spawn_state(&(mall->intercomm), mall->comm, wait_completed);
if (state == MAM_I_SPAWN_COMPLETED || state == MAM_I_SPAWN_ADAPTED) {
#if MAM_USE_BARRIERS
MPI_Barrier(mall->comm);
#endif
mall_conf->times->spawn_time = MPI_Wtime() - mall_conf->times->malleability_start;
return 1;
}
return 0;
}
int MAM_St_red_start() {
if(MAM_Contains_strat(MAM_SPAWN_STRATEGIES, MAM_STRAT_SPAWN_INTERCOMM, NULL)) {
mall->root_collectives = mall->myId == mall->root ? MPI_ROOT : MPI_PROC_NULL;
} else {
mall->root_collectives = mall->root;
}
state = start_redistribution();
return 1;
}
int MAM_St_red_pending(int wait_completed) {
if(MAM_Contains_strat(MAM_RED_STRATEGIES, MAM_STRAT_RED_PTHREAD, NULL)) {
state = thread_check(wait_completed);
} else {
state = check_redistribution(wait_completed);
}
if(state != MAM_I_DIST_PENDING) {
state = MAM_I_USER_START;
return 1;
}
return 0;
}
int MAM_St_user_start(int *mam_state) {
#if MAM_USE_BARRIERS
MPI_Barrier(mall->intercomm);
#endif
mall_conf->times->user_start = MPI_Wtime(); // Timestamp for the start of the user redistribution
if(MAM_Contains_strat(MAM_SPAWN_STRATEGIES, MAM_STRAT_SPAWN_INTERCOMM, NULL)) {
MPI_Intercomm_merge(mall->intercomm, MAM_SOURCES, &mall->tmp_comm); // The group passing 0 goes first
} else {
MPI_Comm_dup(mall->intercomm, &mall->tmp_comm);
}
MPI_Comm_set_name(mall->tmp_comm, "MAM_USER_TMP");
state = MAM_I_USER_PENDING;
*mam_state = MAM_USER_PENDING;
return 1;
}
int MAM_St_user_pending(int *mam_state, int wait_completed, void (*user_function)(void *), void *user_args) {
#if MAM_DEBUG
if(mall->myId == mall->root) DEBUG_FUNC("Starting USER redistribution", mall->myId, mall->numP); fflush(stdout);
#endif
if(user_function != NULL) {
MAM_I_create_user_struct(MAM_SOURCES);
do {
user_function(user_args);
} while(wait_completed && state == MAM_I_USER_PENDING);
} else {
MAM_Resume_redistribution(mam_state);
}
if(state != MAM_I_USER_PENDING) {
#if MAM_USE_BARRIERS
MPI_Barrier(mall->intercomm);
#endif
if(mall_conf->spawn_method == MAM_SPAWN_MERGE) mall_conf->times->user_end = MPI_Wtime(); // Timestamp for the end of the user redistribution
#if MAM_DEBUG
if(mall->myId == mall->root) DEBUG_FUNC("Ended USER redistribution", mall->myId, mall->numP); fflush(stdout);
#endif
return 1;
}
return 0;
}
int MAM_St_user_completed() {
state = end_redistribution();
return 1;
}
int MAM_St_spawn_adapt_pending(int wait_completed) {
wait_completed = MAM_WAIT_COMPLETION;
#if MAM_USE_BARRIERS
MPI_Barrier(mall->comm);
#endif
mall_conf->times->spawn_start = MPI_Wtime();
unset_spawn_postpone_flag(state);
state = check_spawn_state(&(mall->intercomm), mall->comm, wait_completed);
/* TODO Describe the problem; basically state that it is not possible the current way.
* Moreover, it is only for an operation we have observed to be "extremely" fast.
* It is NOT possible because it can only be done after sending the variable data,
* and those data would therefore lose their validity.
if(!MAM_Contains_strat(MAM_SPAWN_STRATEGIES, MAM_STRAT_SPAWN_PTHREAD, NULL)) {
#if MAM_USE_BARRIERS
MPI_Barrier(mall->comm);
#endif
mall_conf->times->spawn_time = MPI_Wtime() - mall_conf->times->spawn_start;
return 1;
}
return 0;
*/
#if MAM_USE_BARRIERS
MPI_Barrier(mall->comm);
#endif
mall_conf->times->spawn_time = MPI_Wtime() - mall_conf->times->spawn_start;
return 1;
}
int MAM_St_completed(int *mam_state) {
MAM_Commit(mam_state);
return 0;
}
//======================================================||
//================PRIVATE FUNCTIONS=====================||
//=====================CHILDREN=========================||
//======================================================||
//======================================================||
//======================================================||
//======================================================||
//======================================================||
//======================================================||
/*
* Initialization of the children's data.
* Here the data is received from the parents: the configuration
* of the execution to perform, and the data to receive from the
* parents, whether synchronously, asynchronously, or both.
*/
void Children_init(void (*user_function)(void *), void *user_args) {
size_t i;
#if MAM_DEBUG
DEBUG_FUNC("MaM will now initialize spawned processes", mall->myId, mall->numP); fflush(stdout); MPI_Barrier(MPI_COMM_WORLD);
#endif
malleability_connect_children(&(mall->intercomm));
if(mall_conf->spawn_method == MAM_SPAWN_MERGE) { // For Merge Method, these processes will be added
MPI_Comm_rank(mall->intercomm, &mall->myId);
MPI_Comm_size(mall->intercomm, &mall->numP);
}
mall->root_collectives = mall->root_parents;
if(MAM_Contains_strat(MAM_SPAWN_STRATEGIES, MAM_STRAT_SPAWN_MULTIPLE, NULL)) {
mall->internode_group = 0;
} else {
mall->internode_group = MAM_Is_internode_group();
}
#if MAM_DEBUG
DEBUG_FUNC("Spawned have completed spawn step", mall->myId, mall->numP); fflush(stdout); MPI_Barrier(MPI_COMM_WORLD);
#endif
comm_data_info(rep_a_data, dist_a_data, MAM_TARGETS);
if(dist_a_data->entries || rep_a_data->entries) { // Receive asynchronous data
#if MAM_DEBUG >= 2
DEBUG_FUNC("Spawned start asynchronous redistribution", mall->myId, mall->numP); fflush(stdout); MPI_Barrier(MPI_COMM_WORLD);
#endif
#if MAM_USE_BARRIERS
MPI_Barrier(mall->intercomm);
#endif
if(MAM_Contains_strat(MAM_RED_STRATEGIES, MAM_STRAT_RED_PTHREAD, NULL)) {
recv_data(mall->num_parents, dist_a_data, MAM_USE_SYNCHRONOUS);
for(i=0; i<rep_a_data->entries; i++) {
MPI_Bcast(rep_a_data->arrays[i], rep_a_data->qty[i], rep_a_data->types[i], mall->root_collectives, mall->intercomm);
}
} else {
recv_data(mall->num_parents, dist_a_data, MAM_USE_ASYNCHRONOUS);
for(i=0; i<rep_a_data->entries; i++) {
MPI_Ibcast(rep_a_data->arrays[i], rep_a_data->qty[i], rep_a_data->types[i], mall->root_collectives, mall->intercomm, &(rep_a_data->requests[i][0]));
}
#if MAM_DEBUG >= 2
DEBUG_FUNC("Spawned started asynchronous redistribution", mall->myId, mall->numP); fflush(stdout); MPI_Barrier(MPI_COMM_WORLD);
#endif
for(i=0; i<rep_a_data->entries; i++) {
async_communication_wait(rep_a_data->requests[i], rep_a_data->request_qty[i]);
}
for(i=0; i<dist_a_data->entries; i++) {
async_communication_wait(dist_a_data->requests[i], dist_a_data->request_qty[i]);
}
if(MAM_Contains_strat(MAM_RED_STRATEGIES, MAM_STRAT_RED_WAIT_TARGETS, NULL)) {
MPI_Ibarrier(mall->intercomm, &mall->wait_targets);
mall->wait_targets_posted = 1;
MPI_Wait(&mall->wait_targets, MPI_STATUS_IGNORE);
}
#if MAM_DEBUG >= 2
DEBUG_FUNC("Spawned waited for all asynchronous redistributions", mall->myId, mall->numP); fflush(stdout); MPI_Barrier(MPI_COMM_WORLD);
#endif
for(i=0; i<dist_a_data->entries; i++) {
async_communication_end(dist_a_data->requests[i], dist_a_data->request_qty[i], &(dist_a_data->windows[i]));
}
for(i=0; i<rep_a_data->entries; i++) {
async_communication_end(rep_a_data->requests[i], rep_a_data->request_qty[i], &(rep_a_data->windows[i]));
}
}
#if MAM_USE_BARRIERS
MPI_Barrier(mall->intercomm);
#endif
mall_conf->times->async_end = MPI_Wtime(); // Timestamp for the end of the asynchronous communication
}
#if MAM_DEBUG
DEBUG_FUNC("Spawned have completed asynchronous data redistribution step", mall->myId, mall->numP); fflush(stdout); MPI_Barrier(MPI_COMM_WORLD);
#endif
#if MAM_USE_BARRIERS
MPI_Barrier(mall->intercomm);
#endif
if(MAM_Contains_strat(MAM_SPAWN_STRATEGIES, MAM_STRAT_SPAWN_INTERCOMM, NULL)) {
MPI_Intercomm_merge(mall->intercomm, MAM_TARGETS, &mall->tmp_comm); // The group passing 0 goes first
} else {
MPI_Comm_dup(mall->intercomm, &mall->tmp_comm);
}
MPI_Comm_set_name(mall->tmp_comm, "MAM_USER_TMP");
if(user_function != NULL) {
state = MAM_I_USER_PENDING;
MAM_I_create_user_struct(MAM_TARGETS);
user_function(user_args);
}
#if MAM_USE_BARRIERS
MPI_Barrier(mall->intercomm);
#endif
mall_conf->times->user_end = MPI_Wtime(); // Timestamp for the end of the user redistribution
comm_data_info(rep_s_data, dist_s_data, MAM_TARGETS);
if(dist_s_data->entries || rep_s_data->entries) { // Receive synchronous data
#if MAM_USE_BARRIERS
MPI_Barrier(mall->intercomm);
#endif
recv_data(mall->num_parents, dist_s_data, MAM_USE_SYNCHRONOUS);
for(i=0; i<rep_s_data->entries; i++) {
MPI_Bcast(rep_s_data->arrays[i], rep_s_data->qty[i], rep_s_data->types[i], mall->root_collectives, mall->intercomm);
}
#if MAM_USE_BARRIERS
MPI_Barrier(mall->intercomm);
#endif
mall_conf->times->sync_end = MPI_Wtime(); // Timestamp for the end of the synchronous communication
}
#if MAM_DEBUG
DEBUG_FUNC("Targets have completed synchronous data redistribution step", mall->myId, mall->numP); fflush(stdout); MPI_Barrier(MPI_COMM_WORLD);
#endif
MAM_Commit(NULL);
#if MAM_DEBUG
DEBUG_FUNC("MaM has been initialized correctly for new ranks", mall->myId, mall->numP); fflush(stdout); MPI_Barrier(MPI_COMM_WORLD);
#endif
}
//======================================================||
//================PRIVATE FUNCTIONS=====================||
//=====================PARENTS==========================||
//======================================================||
//======================================================||
//======================================================||
//======================================================||
/*
* Takes care of spawning the child processes.
* If it is requested in the background, the current state is returned.
*/
int spawn_step(){
#if MAM_USE_BARRIERS
MPI_Barrier(mall->comm);
#endif
mall_conf->times->spawn_start = MPI_Wtime();
state = init_spawn(mall->thread_comm, &(mall->intercomm));
if(!MAM_Contains_strat(MAM_SPAWN_STRATEGIES, MAM_STRAT_SPAWN_PTHREAD, NULL)) {
#if MAM_USE_BARRIERS
MPI_Barrier(mall->comm);
#endif
mall_conf->times->spawn_time = MPI_Wtime() - mall_conf->times->malleability_start;
}
return state;
}
/*
* Starts the data redistribution with the new group of processes.
*
* First, the configuration to use is sent to the new group of processes,
* and then the asynchronous and/or synchronous sends are performed, if any.
*
* If there is asynchronous communication, it is started and the function
* returns indicating that an asynchronous send has begun.
*
* If there is no asynchronous communication, the synchronous one is
* performed, if there is any.
*
* Finally, data about the results is sent to the children and both
* groups of processes are disconnected.
*/
int start_redistribution() {
size_t i;
if(mall->intercomm == MPI_COMM_NULL) {
// If no communicator has been created, it is because the Spawn
// was postponed and this is the Merge Shrink spawn
MPI_Comm_dup(mall->comm, &(mall->intercomm));
}
comm_data_info(rep_a_data, dist_a_data, MAM_SOURCES);
if(dist_a_data->entries || rep_a_data->entries) { // Send asynchronous data
#if MAM_USE_BARRIERS
MPI_Barrier(mall->intercomm);
#endif
mall_conf->times->async_start = MPI_Wtime();
if(MAM_Contains_strat(MAM_RED_STRATEGIES, MAM_STRAT_RED_PTHREAD, NULL)) {
return thread_creation();
} else {
send_data(mall->numC, dist_a_data, MAM_USE_ASYNCHRONOUS);
for(i=0; i<rep_a_data->entries; i++) {
MPI_Ibcast(rep_a_data->arrays[i], rep_a_data->qty[i], rep_a_data->types[i], mall->root_collectives, mall->intercomm, &(rep_a_data->requests[i][0]));
}
if(mall->zombie && MAM_Contains_strat(MAM_RED_STRATEGIES, MAM_STRAT_RED_WAIT_TARGETS, NULL)) {
MPI_Ibarrier(mall->intercomm, &mall->wait_targets);
mall->wait_targets_posted = 1;
}
return MAM_I_DIST_PENDING;
}
}
return MAM_I_USER_START;
}
/*
* Checks whether the asynchronous redistribution has finished.
* If it has not finished, the function returns indicating so; otherwise,
* it continues with the synchronous communication, the sending of results,
* and the disconnection of the process groups.
*
* This function allows two modes of operation when checking whether the
* asynchronous communication has finished.
* If the "MAL_USE_NORMAL" or "MAL_USE_POINT" mode is used, it is considered
* finished when the parents finish sending.
* If the "MAL_USE_IBARRIER" mode is used, it is considered finished when
* the children have finished receiving.
* //FIXME Modify so that rep_a_data is taken into account
*/
int check_redistribution(int wait_completed) {
int completed, local_completed, all_completed;
size_t i, req_qty;
MPI_Request *req_completed;
MPI_Win window;
local_completed = 1;
#if MAM_DEBUG >= 2
DEBUG_FUNC("Sources are testing for all asynchronous redistributions", mall->myId, mall->numP); fflush(stdout); MPI_Barrier(MPI_COMM_WORLD);
#endif
if(wait_completed) {
if(MAM_Contains_strat(MAM_RED_STRATEGIES, MAM_STRAT_RED_WAIT_TARGETS, NULL) && !mall->wait_targets_posted) {
MPI_Ibarrier(mall->intercomm, &mall->wait_targets);
mall->wait_targets_posted = 1;
}
for(i=0; i<dist_a_data->entries; i++) {
req_completed = dist_a_data->requests[i];
req_qty = dist_a_data->request_qty[i];
async_communication_wait(req_completed, req_qty);
}
for(i=0; i<rep_a_data->entries; i++) {
req_completed = rep_a_data->requests[i];
req_qty = rep_a_data->request_qty[i];
async_communication_wait(req_completed, req_qty);
}
if(MAM_Contains_strat(MAM_RED_STRATEGIES, MAM_STRAT_RED_WAIT_TARGETS, NULL)) { MPI_Wait(&mall->wait_targets, MPI_STATUS_IGNORE); }
} else {
if(mall->wait_targets_posted) {
MPI_Test(&mall->wait_targets, &local_completed, MPI_STATUS_IGNORE);
} else {
for(i=0; i<dist_a_data->entries; i++) {
req_completed = dist_a_data->requests[i];
req_qty = dist_a_data->request_qty[i];
completed = async_communication_check(MAM_SOURCES, req_completed, req_qty);
local_completed = local_completed && completed;
}
for(i=0; i<rep_a_data->entries; i++) {
req_completed = rep_a_data->requests[i];
req_qty = rep_a_data->request_qty[i];
completed = async_communication_check(MAM_SOURCES, req_completed, req_qty);
local_completed = local_completed && completed;
}
if(local_completed && MAM_Contains_strat(MAM_RED_STRATEGIES, MAM_STRAT_RED_WAIT_TARGETS, NULL)) {
MPI_Ibarrier(mall->intercomm, &mall->wait_targets);
mall->wait_targets_posted = 1;
MPI_Test(&mall->wait_targets, &local_completed, MPI_STATUS_IGNORE); //TODO - Figure out if last process takes profit from calling here
}
}
#if MAM_DEBUG >= 2
DEBUG_FUNC("Sources will now check a global decision", mall->myId, mall->numP); fflush(stdout); MPI_Barrier(MPI_COMM_WORLD);
#endif
MPI_Allreduce(&local_completed, &all_completed, 1, MPI_INT, MPI_MIN, mall->comm);
if(!all_completed) return MAM_I_DIST_PENDING; // Continue only if asynchronous send has ended
}
#if MAM_DEBUG >= 2
DEBUG_FUNC("Sources sent asynchronous redistributions", mall->myId, mall->numP); fflush(stdout); MPI_Barrier(MPI_COMM_WORLD);
#endif
for(i=0; i<dist_a_data->entries; i++) {
req_completed = dist_a_data->requests[i];
req_qty = dist_a_data->request_qty[i];
window = dist_a_data->windows[i];
async_communication_end(req_completed, req_qty, &window);
}
for(i=0; i<rep_a_data->entries; i++) {
req_completed = rep_a_data->requests[i];
req_qty = rep_a_data->request_qty[i];
window = rep_a_data->windows[i];
async_communication_end(req_completed, req_qty, &window);
}
#if MAM_USE_BARRIERS
MPI_Barrier(mall->intercomm);
#endif
if(mall_conf->spawn_method == MAM_SPAWN_MERGE) mall_conf->times->async_end = MPI_Wtime(); // Merge method only
return MAM_I_USER_START;
}
/*
* Ends the data redistribution with the children, checking
* whether iterations with background communications were performed
* and sending the children how many iterations were carried out.
*
* The synchronous communications are also performed, if there are any.
* It finishes by sending the temporary data to the children.
*/
int end_redistribution() {
size_t i;
int local_state;
comm_data_info(rep_s_data, dist_s_data, MAM_SOURCES);
if(dist_s_data->entries || rep_s_data->entries) { // Send synchronous data
#if MAM_USE_BARRIERS
MPI_Barrier(mall->intercomm);
#endif
mall_conf->times->sync_start = MPI_Wtime();
send_data(mall->numC, dist_s_data, MAM_USE_SYNCHRONOUS);
for(i=0; i<rep_s_data->entries; i++) {
MPI_Bcast(rep_s_data->arrays[i], rep_s_data->qty[i], rep_s_data->types[i], mall->root_collectives, mall->intercomm);
}
#if MAM_USE_BARRIERS
MPI_Barrier(mall->intercomm);
#endif
if(mall_conf->spawn_method == MAM_SPAWN_MERGE) mall_conf->times->sync_end = MPI_Wtime(); // Merge method only
}
#if MAM_DEBUG
DEBUG_FUNC("Sources have completed synchronous data redistribution step", mall->myId, mall->numP); fflush(stdout); MPI_Barrier(mall->comm);
#endif
local_state = MAM_I_DIST_COMPLETED;
if(mall_conf->spawn_method == MAM_SPAWN_MERGE && mall->numP > mall->numC) { // Merge Shrink
local_state = MAM_I_SPAWN_ADAPT_PENDING;
}
return local_state;
}
// TODO Move somewhere else?
//======================================================||
//================PRIVATE FUNCTIONS=====================||
//===============COMM PARENTS THREADS===================||
//======================================================||
//======================================================||
int comm_state; //FIXME Use a handler
/*
* Creates a thread to run a communication in the background.
*/
int thread_creation() {
comm_state = MAM_I_DIST_PENDING;
if(pthread_create(&(mall->async_thread), NULL, thread_async_work, NULL)) {
printf("Error al crear el hilo\n");
MPI_Abort(MPI_COMM_WORLD, -1);
return -1;
}
return comm_state;
}
/*
* Check performed by a master thread to find out
* whether a slave thread has finished its background communication.
*
* The state of the communication is returned when the function ends.
*/
int thread_check(int wait_completed) {
int all_completed = 0;
if(wait_completed && comm_state == MAM_I_DIST_PENDING) {
if(pthread_join(mall->async_thread, NULL)) {
printf("Error al esperar al hilo\n");
MPI_Abort(MPI_COMM_WORLD, -1);
return -2;
}
}
// Checks that all threads have finished the distribution (same value in commAsync)
MPI_Allreduce(&comm_state, &all_completed, 1, MPI_INT, MPI_MAX, mall->comm);
if(all_completed != MAM_I_DIST_COMPLETED) return MAM_I_DIST_PENDING; // Continue only if asynchronous send has ended
if(pthread_join(mall->async_thread, NULL)) {
printf("Error al esperar al hilo\n");
MPI_Abort(MPI_COMM_WORLD, -1);
return -2;
}
#if MAM_USE_BARRIERS
MPI_Barrier(mall->intercomm);
#endif
if(mall_conf->spawn_method == MAM_SPAWN_MERGE) mall_conf->times->async_end = MPI_Wtime(); // Merge method only
return MAM_I_USER_START;
}
/*
* Function executed by a thread.
* It runs a synchronous communication with the children which,
* from the user's point of view, can be considered background work.
*
* When the communication ends, the master thread can check it
* through the "commAsync" value.
*/
void* thread_async_work() {
size_t i;
send_data(mall->numC, dist_a_data, MAM_USE_SYNCHRONOUS);
for(i=0; i<rep_a_data->entries; i++) {
MPI_Bcast(rep_a_data->arrays[i], rep_a_data->qty[i], rep_a_data->types[i], mall->root_collectives, mall->intercomm);
}
comm_state = MAM_I_DIST_COMPLETED;
pthread_exit(NULL);
}
//==============================================================================
/*
* TODO To be documented
*/
void MAM_I_create_user_struct(int is_children_group) {
user_reconf->comm = mall->tmp_comm;
if(is_children_group) {
user_reconf->rank_state = MAM_PROC_NEW_RANK;
user_reconf->numS = mall->num_parents;
user_reconf->numT = mall->numP;
} else {
user_reconf->numS = mall->numP;
user_reconf->numT = mall->numC;
if(mall->zombie) user_reconf->rank_state = MAM_PROC_ZOMBIE;
else user_reconf->rank_state = MAM_PROC_CONTINUE;
}
}
#ifndef MAM_MANAGER_H
#define MAM_MANAGER_H
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <fcntl.h>
#include <sys/stat.h>
typedef struct {
int numS, numT;
int rank_state;
MPI_Comm comm;
} mam_user_reconf_t;
int MAM_Init(int root, MPI_Comm *comm, char *name_exec, void (*user_function)(void *), void *user_args);
int MAM_Finalize();
int MAM_Checkpoint(int *mam_state, int wait_completed, void (*user_function)(void *), void *user_args);
void MAM_Resume_redistribution(int *mam_state);
int MAM_Get_Reconf_Info(mam_user_reconf_t *reconf_info);
void MAM_Data_add(void *data, size_t *index, size_t total_qty, MPI_Datatype type, int is_replicated, int is_constant);
void MAM_Data_modify(void *data, size_t index, size_t total_qty, MPI_Datatype type, int is_replicated, int is_constant);
void MAM_Data_get_entries(int is_replicated, int is_constant, size_t *entries);
void MAM_Data_get_pointer(void **data, size_t index, size_t *total_qty, MPI_Datatype *type, int is_replicated, int is_constant);
#endif
#define _GNU_SOURCE
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sched.h>
#include <mpi.h>
#include "MAM_RMS.h"
#include "MAM_DataStructures.h"
#if MAM_USE_SLURM
#include <slurm/slurm.h>
int MAM_I_slurm_getenv_hosts_info();
int MAM_I_slurm_getjob_hosts_info();
#endif
int MAM_I_get_hosts_info();
int GetCPUCount();
void MAM_check_hosts() {
int not_filled = 1;
#if MAM_USE_SLURM
not_filled = MAM_I_slurm_getjob_hosts_info();
if(not_filled) {
if(mall->nodelist != NULL) {
free(mall->nodelist);
mall->nodelist = NULL;
}
not_filled = MAM_I_slurm_getenv_hosts_info();
}
#endif
if(not_filled) {
if(mall->nodelist != NULL) {
free(mall->nodelist);
mall->nodelist = NULL;
}
not_filled = MAM_I_get_hosts_info();
}
if(not_filled) {
if(mall->myId == mall->root) printf("MAM FATAL ERROR: It has not been possible to obtain the nodelist\n");
fflush(stdout);
MPI_Abort(mall->comm, -50);
}
#if MAM_DEBUG >= 2
if(mall->myId == mall->root) {
DEBUG_FUNC("Obtained Nodelist", mall->myId, mall->numP);
printf("NODELIST: %s\nNODE_COUNT: %d NUM_CPUS_PER_NODE: %d\n", mall->nodelist, mall->num_nodes, mall->num_cpus);
fflush(stdout);
}
#endif
}
/*
* @brief Get whether a group of processes uses an internode communicator
*
* This function checks the physical distribution of all ranks in the
* original communicator passed to MaM. If all of them reside in the
* same host, false is returned. True is returned otherwise.
*
* @return Integer indicating if more than one node is used by the
* original communicator (>0) or only one (0).
*/
int MAM_Is_internode_group() {
int i, name_len, max_name_len, unique_count;
int myId, numP;
char *my_host, *all_hosts, *tested_host;
MPI_Comm_rank(mall->original_comm, &myId);
MPI_Comm_size(mall->original_comm, &numP);
unique_count = 0; //First node is not counted
if(numP == 1) return unique_count;
all_hosts = NULL;
my_host = (char *) malloc(MPI_MAX_PROCESSOR_NAME * sizeof(char));
MPI_Get_processor_name(my_host, &name_len);
MPI_Allreduce(&name_len, &max_name_len, 1, MPI_INT, MPI_MAX, mall->original_comm);
my_host[max_name_len] = '\0';
max_name_len++; // Len does not consider terminating character
if(myId == MAM_ROOT) {
all_hosts = (char *) malloc(numP * max_name_len * sizeof(char));
}
//FIXME Should be a Gatherv, as each host could have uninitialised chars between name_len and max_name_len
MPI_Gather(my_host, max_name_len, MPI_CHAR, all_hosts, max_name_len, MPI_CHAR, MAM_ROOT, mall->original_comm);
if(myId == MAM_ROOT) {
for (i = 1; i < numP; i++) {
tested_host = all_hosts + (i * max_name_len);
if (strcmp(my_host, tested_host) != 0) {
unique_count++;
break;
}
}
free(all_hosts);
}
MPI_Bcast(&unique_count, 1, MPI_INT, MAM_ROOT, mall->original_comm);
free(my_host);
return unique_count;
}
/*
* TODO
* FIXME Does not consider heterogeneous machines for num_cpus
* FIXME Always returns 0... -- Perform error checking?
*/
int MAM_I_get_hosts_info() {
int i, j, name_len, max_name_len, unique_count, *unique_hosts;
char *my_host, *all_hosts, *confirmed_host, *tested_host;
all_hosts = NULL;
my_host = (char *) malloc(MPI_MAX_PROCESSOR_NAME * sizeof(char));
MPI_Get_processor_name(my_host, &name_len);
MPI_Allreduce(&name_len, &max_name_len, 1, MPI_INT, MPI_MAX, mall->comm);
my_host[max_name_len] = '\0';
max_name_len++; // Len does not consider terminating character
if(mall->myId == mall->root) {
all_hosts = (char *) malloc(mall->numP * max_name_len * sizeof(char));
unique_hosts = (int *) malloc(mall->numP * sizeof(int));
unique_hosts[0] = 0; //First host will always be unique
unique_count = 1;
}
//FIXME Should be a Gatherv, as each host could have uninitialised chars between name_len and max_name_len
MPI_Gather(my_host, max_name_len, MPI_CHAR, all_hosts, max_name_len, MPI_CHAR, mall->root, mall->comm);
if(mall->myId == mall->root) {
for (i = 1; i < mall->numP; i++) {
for (j = 0; j < unique_count; j++) {
tested_host = all_hosts + (i * max_name_len);
confirmed_host = all_hosts + (unique_hosts[j] * max_name_len);
if (strcmp(tested_host, confirmed_host) != 0) {
unique_hosts[unique_count] = i;
unique_count++;
break;
}
}
}
mall->num_nodes = unique_count;
mall->num_cpus = GetCPUCount();
mall->nodelist_len = unique_count*max_name_len;
mall->nodelist = (char *) malloc(mall->nodelist_len * sizeof(char));
strcpy(mall->nodelist, ""); //FIXME Strcat can be very inneficient...
for (i = 0; i < unique_count; i++) {
confirmed_host = all_hosts + (unique_hosts[i] * max_name_len);
strcat(mall->nodelist, confirmed_host);
if (i < unique_count - 1) {
strcat(mall->nodelist, ",");
}
}
free(all_hosts);
free(unique_hosts);
}
free(my_host);
return 0;
}
/*
* @brief Get the total number of CPUs available to the process.
*
* This function uses sched_getaffinity to obtain the CPU affinity of the current process
* and counts the number of CPUs in the affinity set. It adjusts the loop based on the
* maximum number of CPUs allowed on the system.
*
* @return The total number of CPUs available to the process.
*
* Code obtained from: https://stackoverflow.com/questions/4586405/how-to-get-the-number-of-cpus-in-linux-using-c
* The code has been slightly modified.
*/
int GetCPUCount() {
cpu_set_t cs;
CPU_ZERO(&cs);
sched_getaffinity(0, sizeof(cs), &cs);
int count = 0;
int max_cpus = sysconf(_SC_NPROCESSORS_ONLN);
for (int i = 0; i < max_cpus; i++) {
if (CPU_ISSET(i, &cs)) {
count++;
} else {
break;
}
}
return count;
}
#if MAM_USE_SLURM
/*
* TODO
*/
int MAM_I_slurm_getenv_hosts_info() {
char *tmp = NULL, *tmp_copy, *token;
int cpus, count;
//int i, *cpus_counts, *nodes_counts, *aux;
tmp = getenv("SLURM_JOB_NUM_NODES");
if(tmp == NULL) return 1;
mall->num_nodes = atoi(tmp);
tmp = NULL;
tmp = getenv("SLURM_JOB_NODELIST");
if(tmp == NULL) return 1;
mall->nodelist_len = strlen(tmp)+1;
mall->nodelist = (char *) malloc(mall->nodelist_len * sizeof(char));
strcpy(mall->nodelist, tmp);
tmp = NULL;
//EXAMPLE - SLURM_JOB_CPUS_PER_NODE='72(x2),36'
//It indicates that two nodes have 72 CPUs each and the third has 36 CPUs
tmp = getenv("SLURM_JOB_CPUS_PER_NODE");
if(tmp == NULL) return 1;
tmp_copy = (char *) malloc((strlen(tmp)+1) * sizeof(char));
strcpy(tmp_copy, tmp);
token = strtok(tmp_copy, ",");
//TODO When MaM considers heterogeneous allocations, these will be needed instead of num_cpus.
//cpus_counts = (int *) malloc(mall->num_nodes * sizeof(int));
//nodes_counts = (int *) malloc(mall->num_nodes * sizeof(int));
//i = 0;
mall->num_cpus = 0;
while (token != NULL) {
// If the current token contains only one node, the second portion
// does not appear and sscanf does not modify "count"
// First portion --> "%d"
// Second portion -> "(x%d)"
count = 1;
if (sscanf(token, "%d(x%d)", &cpus, &count) >= 1) {
mall->num_cpus = cpus; // num_cpus stores the number of cores per node
//cpus_per_node[i] = cpus;
//nodes_count[i] = count;
//i++;
}
token = strtok(NULL, ",");
}
/*
if(i < mall->num_nodes) {
aux = (int *) realloc(cpus_per_node, i * sizeof(int));
if(cpus_per_node != aux && cpus_per_node != NULL) free(cpus_per_node);
cpus_per_node = aux;
aux = (int *) realloc(nodes_counts, i * sizeof(int));
if(nodes_count != aux && nodes_count != NULL) free(nodes_count);
nodes_count = aux;
}
*/
free(tmp_copy);
return 0;
}
/*
* TODO
* FIXME Does not consider heterogeneous machines
*/
int MAM_I_slurm_getjob_hosts_info() {
int jobId, err;
char *tmp = NULL;
job_info_msg_t *j_info;
slurm_job_info_t last_record;
tmp = getenv("SLURM_JOB_ID");
if(tmp == NULL) return 1;
jobId = atoi(tmp);
err = slurm_load_job(&j_info, jobId, 1);
if(err) return err;
last_record = j_info->job_array[j_info->record_count - 1];
mall->num_nodes = last_record.num_nodes;
mall->num_cpus = last_record.num_cpus / last_record.num_nodes;
mall->nodelist_len = strlen(last_record.nodes)+1;
mall->nodelist = (char *) malloc(mall->nodelist_len * sizeof(char));
strcpy(mall->nodelist, last_record.nodes);
slurm_free_job_info_msg(j_info);
return 0;
}
#endif
//TODO Refactor for when it communicates with the RMS
// Get Slurm job info
//int jobId;
//char *tmp;
//job_info_msg_t *j_info;
//slurm_job_info_t last_record;
//tmp = getenv("SLURM_JOB_ID");
//jobId = atoi(tmp);
//slurm_load_job(&j_info, jobId, 1);
//last_record = j_info->job_array[j_info->record_count - 1];
// Free JOB INFO
//slurm_free_job_info_msg(j_info);
#ifndef MAM_RMS_H
#define MAM_RMS_H
void MAM_check_hosts();
int MAM_Is_internode_group();
#endif
#include "MAM_Times.h"
#include "MAM_DataStructures.h"
void def_malleability_times(MPI_Datatype *new_type);
void init_malleability_times() {
#if MAM_DEBUG
DEBUG_FUNC("Initializing recording structure", mall->myId, mall->numP); fflush(stdout); MPI_Barrier(mall->comm);
#endif
mall_conf->times = (malleability_times_t *) malloc(sizeof(malleability_times_t));
if(mall_conf->times == NULL) {
perror("Error al crear la estructura de tiempos interna para maleabilidad\n");
MPI_Abort(MPI_COMM_WORLD, -5);
}
reset_malleability_times();
def_malleability_times(&mall_conf->times->times_type);
#if MAM_DEBUG
DEBUG_FUNC("Initialized recording structure", mall->myId, mall->numP); fflush(stdout); MPI_Barrier(mall->comm);
#endif
}
void reset_malleability_times() {
malleability_times_t *times = mall_conf->times;
times->spawn_start = 0; times->sync_start = 0; times->async_start = 0; times->user_start = 0; times->malleability_start = 0;
times->sync_end = 0; times->async_end = 0; times->user_end = 0; times->malleability_end = 0;
times->spawn_time = 0;
}
void free_malleability_times() {
#if MAM_DEBUG
DEBUG_FUNC("Freeing recording structure", mall->myId, mall->numP); fflush(stdout);
#endif
if(mall_conf->times != NULL) {
if(mall_conf->times->times_type != MPI_DATATYPE_NULL) {
MPI_Type_free(&mall_conf->times->times_type);
mall_conf->times->times_type = MPI_DATATYPE_NULL;
}
free(mall_conf->times);
}
#if MAM_DEBUG
DEBUG_FUNC("Freed recording structure", mall->myId, mall->numP); fflush(stdout);
#endif
}
/*
* @brief Returns the times used for the different steps of the last reconfiguration.
*
* This function is intended to be called when a reconfiguration has ended.
* It provides the timing information of that reconfiguration to the user.
*
* Parameters:
* - double *sp_time: A pointer where the spawn time will be saved.
* - double *sy_time: A pointer where the synchronous data redistribution time will be saved.
* - double *asy_time: A pointer where the asynchronous data redistribution time will be saved.
* - double *user_time: A pointer where the user data redistribution time will be saved.
* - double *mall_time: A pointer where the malleability time will be saved.
*/
void MAM_Retrieve_times(double *sp_time, double *sy_time, double *asy_time, double *user_time, double *mall_time) {
malleability_times_t *times = mall_conf->times;
*sp_time = times->spawn_time;
*sy_time = times->sync_end - times->sync_start;
*asy_time = times->async_end - times->async_start;
*user_time = times->user_end - times->user_start;
*mall_time = times->malleability_end - times->malleability_start;
}
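/*
* Illustrative sketch (editor's addition): reading the timings once a
* reconfiguration has committed. "rank" is a hypothetical user variable.
*
*   double sp, sy, asy, us, tot;
*   MAM_Retrieve_times(&sp, &sy, &asy, &us, &tot);
*   if(rank == 0) printf("spawn=%lf sync=%lf async=%lf user=%lf total=%lf\n", sp, sy, asy, us, tot);
*/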
void malleability_times_broadcast(int root) {
MPI_Bcast(mall_conf->times, 1, mall_conf->times->times_type, root, mall->intercomm);
}
void def_malleability_times(MPI_Datatype *new_type) {
int i, counts = 5;
int blocklengths[counts];
MPI_Aint displs[counts], dir;
MPI_Datatype types[counts];
blocklengths[0] = blocklengths[1] = blocklengths[2] = blocklengths[3] = blocklengths[4] = 1;
types[0] = types[1] = types[2] = types[3] = types[4] = MPI_DOUBLE;
// The vector is passed through the address of "mall_conf"
// Fill in the displs vector
MPI_Get_address(mall_conf->times, &dir);
// Get the base address
MPI_Get_address(&(mall_conf->times->spawn_time), &displs[0]);
MPI_Get_address(&(mall_conf->times->sync_start), &displs[1]);
MPI_Get_address(&(mall_conf->times->async_start), &displs[2]);
MPI_Get_address(&(mall_conf->times->user_start), &displs[3]);
MPI_Get_address(&(mall_conf->times->malleability_start), &displs[4]);
for(i=0;i<counts;i++) displs[i] -= dir;
MPI_Type_create_struct(counts, blocklengths, displs, types, new_type);
MPI_Type_commit(new_type);
}
#ifndef MAM_TIMES_H
#define MAM_TIMES_H
#include <mpi.h>
void init_malleability_times();
void reset_malleability_times();
void free_malleability_times();
void malleability_times_broadcast(int root);
void MAM_I_retrieve_times(double *sp_time, double *sy_time, double *asy_time, double *user_time, double *mall_time);
#endif
#ifndef MAM_TIMES_RETRIEVE_H
#define MAM_TIMES_RETRIEVE_H
void MAM_Retrieve_times(double *sp_time, double *sy_time, double *asy_time, double *user_time, double *mall_time);
#endif
#include "malleabilityTypes.h"
#include "MAM_Types.h"
#include "MAM_DataStructures.h"
#include "MAM_Configuration.h"
void init_malleability_data_struct(malleability_data_t *data_struct, size_t size);
......@@ -20,13 +22,13 @@ void def_malleability_qty_type(malleability_data_t *data_struct_rep, malleabilit
* todos los padres. La nueva serie "data" solo representa los datos
* que tiene este padre.
*/
void add_data(void *data, size_t total_qty, int type, size_t request_qty, malleability_data_t *data_struct) {
void add_data(void *data, size_t total_qty, MPI_Datatype type, size_t request_qty, malleability_data_t *data_struct) {
size_t i;
if(data_struct->entries == 0) {
init_malleability_data_struct(data_struct, MALLEABILITY_INIT_DATA_QTY);
init_malleability_data_struct(data_struct, MAM_TYPES_INIT_DATA_QTY);
} else if(data_struct->entries == data_struct->max_entries) {
realloc_malleability_data_struct(data_struct, MALLEABILITY_INIT_DATA_QTY);
realloc_malleability_data_struct(data_struct, MAM_TYPES_INIT_DATA_QTY);
}
data_struct->qty[data_struct->entries] = total_qty;
......@@ -49,7 +51,7 @@ void add_data(void *data, size_t total_qty, int type, size_t request_qty, mallea
* todos los padres. La nueva serie "data" solo representa los datos
* que tiene este padre.
*/
void modify_data(void *data, size_t index, size_t total_qty, int type, size_t request_qty, malleability_data_t *data_struct) {
void modify_data(void *data, size_t index, size_t total_qty, MPI_Datatype type, size_t request_qty, malleability_data_t *data_struct) {
size_t i;
if(data_struct->entries < index) { // Index does not exist
......@@ -80,41 +82,34 @@ void modify_data(void *data, size_t index, size_t total_qty, int type, size_t re
* En el argumento "root" todos tienen que indicar quien es el proceso raiz de los padres
* unicamente.
*/
void comm_data_info(malleability_data_t *data_struct_rep, malleability_data_t *data_struct_dist, int is_children_group, int myId, int root, MPI_Comm intercomm) {
int is_intercomm, rootBcast = MPI_PROC_NULL;
void comm_data_info(malleability_data_t *data_struct_rep, malleability_data_t *data_struct_dist, int is_children_group) {
int type_size;
size_t i, j;
MPI_Datatype entries_type, struct_type;
MPI_Comm_test_inter(intercomm, &is_intercomm);
if(is_intercomm && !is_children_group) {
rootBcast = myId == root ? MPI_ROOT : MPI_PROC_NULL;
} else {
rootBcast = root;
}
// Send the number of entries first
def_malleability_entries(data_struct_dist, data_struct_rep, &entries_type);
MPI_Bcast(MPI_BOTTOM, 1, entries_type, rootBcast, intercomm);
MPI_Bcast(MPI_BOTTOM, 1, entries_type, mall->root_collectives, mall->intercomm);
if(is_children_group && ( data_struct_rep->entries != 0 || data_struct_dist->entries != 0 )) { //FIXME What happens if both values are 0?
if(is_children_group && ( data_struct_rep->entries != 0 || data_struct_dist->entries != 0 )) {
init_malleability_data_struct(data_struct_rep, data_struct_rep->entries);
init_malleability_data_struct(data_struct_dist, data_struct_dist->entries);
}
def_malleability_qty_type(data_struct_dist, data_struct_rep, &struct_type);
MPI_Bcast(MPI_BOTTOM, 1, struct_type, rootBcast, intercomm);
MPI_Bcast(MPI_BOTTOM, 1, struct_type, mall->root_collectives, mall->intercomm);
if(is_children_group) {
for(i=0; i < data_struct_rep->entries; i++) {
data_struct_rep->arrays[i] = (void *) malloc(data_struct_rep->qty[i] * sizeof(int)); //TODO Take into account that it is not always int
MPI_Type_size(data_struct_rep->types[i], &type_size);
data_struct_rep->arrays[i] = (void *) malloc(data_struct_rep->qty[i] * type_size); //FIXME This memory is not freed -- How should it be done?
data_struct_rep->requests[i] = (MPI_Request *) malloc(data_struct_rep->request_qty[i] * sizeof(MPI_Request));
for(j=0; j < data_struct_rep->request_qty[i]; j++) {
data_struct_rep->requests[i][j] = MPI_REQUEST_NULL;
}
}
for(i=0; i < data_struct_dist->entries; i++) {
data_struct_dist->arrays[i] = (void *) NULL;
data_struct_dist->arrays[i] = (void *) NULL; // TODO Could it be initialized here?
data_struct_dist->requests[i] = (MPI_Request *) malloc(data_struct_dist->request_qty[i] * sizeof(MPI_Request));
for(j=0; j < data_struct_dist->request_qty[i]; j++) {
data_struct_dist->requests[i][j] = MPI_REQUEST_NULL;
@@ -142,7 +137,7 @@ void init_malleability_data_struct(malleability_data_t *data_struct, size_t size
data_struct->max_entries = size;
data_struct->qty = (size_t *) malloc(size * sizeof(size_t));
data_struct->types = (int *) malloc(size * sizeof(int));
data_struct->types = (MPI_Datatype *) malloc(size * sizeof(MPI_Datatype));
data_struct->request_qty = (size_t *) malloc(size * sizeof(size_t));
data_struct->requests = (MPI_Request **) malloc(size * sizeof(MPI_Request *));
data_struct->windows = (MPI_Win *) malloc(size * sizeof(MPI_Win));
@@ -150,6 +145,7 @@ void init_malleability_data_struct(malleability_data_t *data_struct, size_t size
for(i=0; i<size; i++) { //calloc and memset do not ensure a NULL value
data_struct->requests[i] = NULL;
data_struct->windows[i] = MPI_WIN_NULL;
data_struct->arrays[i] = NULL;
}
}
@@ -161,14 +157,14 @@ void init_malleability_data_struct(malleability_data_t *data_struct, size_t size
*/
void realloc_malleability_data_struct(malleability_data_t *data_struct, size_t qty_to_add) {
size_t i, needed, *qty_aux, *request_qty_aux;
int *types_aux;
MPI_Datatype *types_aux;
MPI_Win *windows_aux;
MPI_Request **requests_aux;
void **arrays_aux;
needed = data_struct->max_entries + qty_to_add;
qty_aux = (size_t *) realloc(data_struct->qty, needed * sizeof(size_t));
types_aux = (int *) realloc(data_struct->types, needed * sizeof(int));
types_aux = (MPI_Datatype *) realloc(data_struct->types, needed * sizeof(MPI_Datatype));
request_qty_aux = (size_t *) realloc(data_struct->request_qty, needed * sizeof(size_t));
requests_aux = (MPI_Request **) realloc(data_struct->requests, needed * sizeof(MPI_Request *));
windows_aux = (MPI_Win *) realloc(data_struct->windows, needed * sizeof(MPI_Win));
@@ -181,9 +177,18 @@ void realloc_malleability_data_struct(malleability_data_t *data_struct, size_t q
for(i=data_struct->max_entries; i<needed; i++) { //realloc does not ensure a NULL value
requests_aux[i] = NULL;
windows_aux[i] = MPI_WIN_NULL;
arrays_aux[i] = NULL;
}
// Do not free the old blocks here: when realloc moves the data it has
// already released the previous allocation, so freeing those pointers
// again would be a double free.
data_struct->qty = qty_aux;
data_struct->types = types_aux;
data_struct->request_qty = request_qty_aux;
@@ -198,10 +203,6 @@ void free_malleability_data_struct(malleability_data_t *data_struct) {
max = data_struct->entries;
if(max != 0) {
for(i=0; i<max; i++) {
//free(data_struct->arrays[i]); //FIXME Values allocated with a single element are not freed?
}
if(data_struct->qty != NULL) {
free(data_struct->qty);
}
@@ -248,10 +249,11 @@ void def_malleability_entries(malleability_data_t *data_struct_rep, malleability
int counts = 2;
int blocklengths[counts];
MPI_Aint displs[counts];
MPI_Datatype types[counts];
MPI_Datatype types[counts], type_size_t;
MPI_Type_match_size(MPI_TYPECLASS_INTEGER, sizeof(size_t), &type_size_t);
blocklengths[0] = blocklengths[1] = 1;
types[0] = types[1] = MPI_UNSIGNED_LONG;
types[0] = types[1] = type_size_t;
// Get the base address
MPI_Get_address(&(data_struct_rep->entries), &displs[0]);
@@ -272,20 +274,24 @@ void def_malleability_qty_type(malleability_data_t *data_struct_rep, malleabilit
int counts = 6;
int blocklengths[counts];
MPI_Aint displs[counts];
MPI_Datatype types[counts];
MPI_Datatype types[counts], type_size_t;
MPI_Type_match_size(MPI_TYPECLASS_INTEGER, sizeof(size_t), &type_size_t);
types[0] = types[1] = types[3] = types[4] = MPI_UNSIGNED_LONG;
types[0] = types[1] = types[3] = types[4] = type_size_t;
types[2] = types[5] = MPI_INT;
blocklengths[0] = blocklengths[1] = blocklengths[2] = data_struct_rep->entries;
blocklengths[3] = blocklengths[4] = blocklengths[5] = data_struct_dist->entries;
MPI_Get_address((data_struct_rep->qty), &displs[0]);
MPI_Get_address((data_struct_rep->request_qty), &displs[1]);
MPI_Get_address((data_struct_rep->types), &displs[2]);
MPI_Get_address((data_struct_rep->types), &displs[2]); // MPI_Datatype is assumed to be a typedef of int, so it can be sent as MPI_INT
MPI_Get_address((data_struct_dist->qty), &displs[3]);
MPI_Get_address((data_struct_dist->request_qty), &displs[4]);
MPI_Get_address((data_struct_dist->types), &displs[5]);
MPI_Get_address((data_struct_dist->types), &displs[5]); // MPI_Datatype is assumed to be a typedef of int, so it can be sent as MPI_INT
MPI_Type_create_struct(counts, blocklengths, displs, types, new_type);
MPI_Type_commit(new_type);
}
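/*
 * Because the displacements gathered above are absolute addresses (no base
 * is subtracted), the committed type is meant to be used with MPI_BOTTOM as
 * the buffer, as comm_data_info does. A minimal sketch:
 */
// MPI_Bcast(MPI_BOTTOM, 1, struct_type, mall->root_collectives, mall->intercomm);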
#ifndef MALLEABILITY_TYPES_H
#define MALLEABILITY_TYPES_H
#ifndef MAM_TYPES_H
#define MAM_TYPES_H
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <fcntl.h>
#include <sys/stat.h>
#include "malleabilityStates.h"
#include "MAM_Constants.h"
#define MALLEABILITY_INIT_DATA_QTY 100
#define MAM_TYPES_INIT_DATA_QTY 100
typedef struct {
size_t entries; // Indicates the number of arrays to communicate (replicated data)
size_t max_entries;
size_t *qty; // Indicates the number of elements in each subarray of sync_array
int *types;
MPI_Datatype *types;
// Array of arrays of requests. Each top-level element indicates the requests that must be checked to consider
// the communication of that piece of data finished
@@ -25,9 +25,9 @@ typedef struct {
} malleability_data_t;
void add_data(void *data, size_t total_qty, int type, size_t request_qty, malleability_data_t *data_struct);
void modify_data(void *data, size_t index, size_t total_qty, int type, size_t request_qty, malleability_data_t *data_struct);
void comm_data_info(malleability_data_t *data_struct_rep, malleability_data_t *data_struct_dist, int is_children_group, int myId, int root, MPI_Comm intercomm);
void add_data(void *data, size_t total_qty, MPI_Datatype type, size_t request_qty, malleability_data_t *data_struct);
void modify_data(void *data, size_t index, size_t total_qty, MPI_Datatype type, size_t request_qty, malleability_data_t *data_struct);
void comm_data_info(malleability_data_t *data_struct_rep, malleability_data_t *data_struct_dist, int is_children_group);
void free_malleability_data_struct(malleability_data_t *data_struct);
#endif
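/*
 * Hedged usage sketch for this header: a source rank registers an array and
 * the metadata is later exchanged with the target group. The array, its
 * size and the zero-initialised structures are assumptions of the example.
 */
// malleability_data_t rep_data = {0}, dist_data = {0};
// int my_array[100];
// add_data(my_array, 100, MPI_INT, 0, &dist_data); /* register distributed data */
// comm_data_info(&rep_data, &dist_data, 0);        /* 0: this rank is a source  */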
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <mpi.h>
#include <signal.h>
#include "MAM_Zombies.h"
#include "MAM_DataStructures.h"
#define PIDS_QTY 320
//TODO Add option to allow the usage of signal USR2 or not.
//This code assumes the ROOT of each group will be the last to be zombified
//
void MAM_I_zombies_collect(int new_zombies);
void MAM_I_zombies_split();
void MAM_I_zombies_suspend();
int MAM_I_zombies_awake();
void zombies_handler_usr2(int sig) { (void) sig; }
int *pids = NULL;
int zombies_qty = 0;
void MAM_Zombies_service_init() {
zombies_qty = 0;
pids = malloc(PIDS_QTY * sizeof(int));
for(int i=0; i<PIDS_QTY; i++) {
pids[i] = 0;
}
}
int MAM_Zombies_service_free() {
int request_abort = MAM_I_zombies_awake();
free(pids);
return request_abort;
}
void MAM_Zombies_update() {
int myId, numP, new_zombies;
MPI_Comm_rank(mall->original_comm, &myId);
MPI_Comm_size(mall->original_comm, &numP);
MPI_Allreduce(&mall->zombie, &new_zombies, 1, MPI_INT, MPI_SUM, mall->original_comm);
if(new_zombies && new_zombies < numP) {
MAM_I_zombies_collect(new_zombies);
MAM_I_zombies_split();
MAM_I_zombies_suspend();
if(myId == MAM_ROOT) zombies_qty += new_zombies;
}
}
void MAM_I_zombies_collect(int new_zombies) {
int pid = getpid();
int *pids_counts, *pids_displs;
int i, count, active;
int myId, numP;
MPI_Comm_rank(mall->original_comm, &myId);
MPI_Comm_size(mall->original_comm, &numP);
pids_counts = (int *) malloc(numP * sizeof(int));
pids_displs = (int *) malloc(numP * sizeof(int));
#if MAM_DEBUG > 2
if(myId == MAM_ROOT){ DEBUG_FUNC("Collecting zombies", mall->myId, mall->numP); } fflush(stdout);
#endif
count = mall->zombie;
if(myId == MAM_ROOT) {
active = numP - new_zombies;
for(i=0; i < active; i++) {
pids_counts[i] = 0;
}
pids_displs[i-1] = -1;
for(; i< active+new_zombies; i++) {
pids_counts[i] = 1;
pids_displs[i] = (pids_displs[i-1] + 1) + zombies_qty;
}
}
MPI_Gatherv(&pid, count, MPI_INT, pids, pids_counts, pids_displs, MPI_INT, MAM_ROOT, mall->original_comm);
free(pids_counts);
free(pids_displs);
}
void MAM_I_zombies_split() {
int myId, color;
MPI_Comm new_original_comm;
MPI_Comm_rank(mall->original_comm, &myId);
color = mall->zombie ? MPI_UNDEFINED : 1;
MPI_Comm_split(mall->original_comm, color, myId, &new_original_comm);
if(mall->original_comm != MPI_COMM_WORLD) MPI_Comm_free(&mall->original_comm);
if(new_original_comm != MPI_COMM_NULL) MPI_Comm_set_name(new_original_comm, "MAM_ORIGINAL");
mall->original_comm = new_original_comm;
}
void MAM_I_zombies_suspend() {
struct sigaction act;
if(!mall->zombie) return;
sigemptyset(&act.sa_mask);
act.sa_flags=0;
act.sa_handler=zombies_handler_usr2;
sigaction(SIGUSR2, &act, NULL);
sigset_t set;
sigprocmask(SIG_SETMASK,NULL,&set);
sigsuspend(&set);
}
int MAM_I_zombies_awake() {
if(mall->internode_group && zombies_qty) return 1; //Request Abort
for(int i=0; i < zombies_qty; i++) { // Wake up the zombies
kill(pids[i], SIGUSR2);
}
zombies_qty = 0;
return 0; //Normal termination
}
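/*
 * Self-contained sketch of the suspend/awake protocol used above, reduced
 * to plain POSIX signals. The fork(), the sleep() and the standalone main()
 * are assumptions of this demo, not part of MaM.
 */
#if 0
#include <signal.h>
#include <unistd.h>
#include <sys/wait.h>

static void demo_handler(int sig) { (void) sig; }

int main(void) {
  pid_t child = fork();
  if(child == 0) { /* Plays the zombie role: block until SIGUSR2 arrives */
    struct sigaction act;
    sigemptyset(&act.sa_mask);
    act.sa_flags = 0;
    act.sa_handler = demo_handler;
    sigaction(SIGUSR2, &act, NULL);
    sigset_t set;
    sigprocmask(SIG_SETMASK, NULL, &set);
    sigsuspend(&set); /* Equivalent to MAM_I_zombies_suspend */
    _exit(0);
  }
  sleep(1);             /* Crude wait until the child reaches sigsuspend */
  kill(child, SIGUSR2); /* Equivalent to MAM_I_zombies_awake */
  waitpid(child, NULL, 0);
  return 0;
}
#endif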
#ifndef MAM_ZOMBIES_H
#define MAM_ZOMBIES_H
void MAM_Zombies_service_init();
int MAM_Zombies_service_free();
void MAM_Zombies_update();
#endif
@@ -2,24 +2,24 @@
#include <stdlib.h>
#include <mpi.h>
#include <string.h>
#include "distribution_methods/block_distribution.h"
#include "CommDist.h"
#include "block_distribution.h"
#include "Distributed_CommDist.h"
#include "../MAM_Constants.h"
#include "../MAM_Configuration.h"
#include "../MAM_DataStructures.h"
void prepare_redistribution(int qty, int myId, int numP, int numO, int is_children_group, int is_intercomm, char **recv, struct Counts *s_counts, struct Counts *r_counts);
void check_requests(struct Counts s_counts, struct Counts r_counts, int red_strategies, MPI_Request **requests, size_t *request_qty);
void prepare_redistribution(int qty, MPI_Datatype datatype, int numP, int numO, int is_children_group, void **recv, struct Counts *s_counts, struct Counts *r_counts);
void check_requests(struct Counts s_counts, struct Counts r_counts, MPI_Request **requests, size_t *request_qty);
void sync_point2point(char *send, char *recv, int is_intercomm, int myId, struct Counts s_counts, struct Counts r_counts, MPI_Comm comm);
void sync_rma(char *send, char *recv, struct Counts r_counts, int tamBl, MPI_Comm comm, int red_method);
void sync_rma_lock(char *recv, struct Counts r_counts, MPI_Win win);
void sync_rma_lockall(char *recv, struct Counts r_counts, MPI_Win win);
void sync_point2point(void *send, void *recv, MPI_Datatype datatype, struct Counts s_counts, struct Counts r_counts, MPI_Comm comm);
void sync_rma(void *send, void *recv, MPI_Datatype datatype, struct Counts r_counts, int tamBl, MPI_Comm comm);
void sync_rma_lock(void *recv, MPI_Datatype datatype, struct Counts r_counts, MPI_Win win);
void sync_rma_lockall(void *recv, MPI_Datatype datatype, struct Counts r_counts, MPI_Win win);
void async_point2point(char *send, char *recv, struct Counts s_counts, struct Counts r_counts, MPI_Comm comm, MPI_Request *requests);
void async_rma(char *send, char *recv, struct Counts r_counts, int tamBl, MPI_Comm comm, int red_method, MPI_Request *requests, MPI_Win *win);
void async_rma_lock(char *recv, struct Counts r_counts, MPI_Win win, MPI_Request *requests);
void async_rma_lockall(char *recv, struct Counts r_counts, MPI_Win win, MPI_Request *requests);
void perform_manual_communication(char *send, char *recv, int myId, struct Counts s_counts, struct Counts r_counts);
void async_point2point(void *send, void *recv, MPI_Datatype datatype, struct Counts s_counts, struct Counts r_counts, MPI_Comm comm, MPI_Request *requests);
void async_rma(void *send, void *recv, MPI_Datatype datatype, struct Counts r_counts, int tamBl, MPI_Comm comm, MPI_Request *requests, MPI_Win *win);
void async_rma_lock(void *recv, MPI_Datatype datatype, struct Counts r_counts, MPI_Win win, MPI_Request *requests);
void async_rma_lockall(void *recv, MPI_Datatype datatype, struct Counts r_counts, MPI_Win win, MPI_Request *requests);
/*
* Reserves memory for an array of up to "qty" elements.
@@ -60,7 +60,6 @@ void malloc_comm_array(char **array, int qty, int myId, int numP) {
* - recv (OUT): Array where data will be written. A NULL value is allowed if the process is not going to receive data.
* If the process receives data and it is NULL, the behaviour is undefined.
* - qty (IN): Sum of elements shared by all processes that will send data.
* - myId (IN): Rank of the MPI process in the local communicator. For the parents is not the rank obtained from "comm".
* - numP (IN): Size of the local group. If it is a children group, this parameter must match the value obtained
*              with "MPI_Comm_size(comm)". For the parents it is not always the size obtained from "comm".
* - numO (IN): Amount of processes in the remote group. For the parents is the target quantity of processes after the
@@ -68,50 +67,37 @@ void malloc_comm_array(char **array, int qty, int myId, int numP) {
* - is_children_group (IN): Indicates whether this MPI rank is a child (TRUE) or a parent (FALSE).
* - comm (IN): Communicator to use to perform the redistribution.
*
* returns: An integer indicating if the operation has been completed(TRUE) or not(FALSE). //FIXME In this case is always true...
*/
int sync_communication(char *send, char **recv, int qty, int myId, int numP, int numO, int is_children_group, int red_method, MPI_Comm comm) {
int is_intercomm, aux_comm_used = 0;
void sync_communication(void *send, void **recv, int qty, MPI_Datatype datatype, int numP, int numO, int is_children_group, MPI_Comm comm) {
struct Counts s_counts, r_counts;
struct Dist_data dist_data;
MPI_Comm aux_comm = MPI_COMM_NULL;
/* PREPARE COMMUNICATION */
MPI_Comm_test_inter(comm, &is_intercomm);
prepare_redistribution(qty, myId, numP, numO, is_children_group, is_intercomm, recv, &s_counts, &r_counts);
prepare_redistribution(qty, datatype, numP, numO, is_children_group, recv, &s_counts, &r_counts);
/* PERFORM COMMUNICATION */
switch(red_method) {
case MALL_RED_RMA_LOCKALL:
case MALL_RED_RMA_LOCK:
switch(mall_conf->red_method) {
case MAM_RED_RMA_LOCKALL:
case MAM_RED_RMA_LOCK:
if(is_children_group) {
dist_data.tamBl = 0;
} else {
get_block_dist(qty, myId, numO, &dist_data);
get_block_dist(qty, mall->myId, numO, &dist_data);
}
if(is_intercomm) {
MPI_Intercomm_merge(comm, is_children_group, &aux_comm);
aux_comm_used = 1;
} else { aux_comm = comm; }
sync_rma(send, *recv, r_counts, dist_data.tamBl, aux_comm, red_method);
sync_rma(send, *recv, datatype, r_counts, dist_data.tamBl, comm);
break;
case MALL_RED_POINT:
sync_point2point(send, *recv, is_intercomm, myId, s_counts, r_counts, comm);
case MAM_RED_POINT:
sync_point2point(send, *recv, datatype, s_counts, r_counts, comm);
break;
case MALL_RED_BASELINE:
case MAM_RED_BASELINE:
default:
MPI_Alltoallv(send, s_counts.counts, s_counts.displs, MPI_CHAR, *recv, r_counts.counts, r_counts.displs, MPI_CHAR, comm);
MPI_Alltoallv(send, s_counts.counts, s_counts.displs, datatype, *recv, r_counts.counts, r_counts.displs, datatype, comm);
break;
}
if(aux_comm_used) {
MPI_Comm_free(&aux_comm);
}
freeCounts(&s_counts);
freeCounts(&r_counts);
return 1; //FIXME In this case is always true...
}
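/*
 * Hedged usage sketch of sync_communication on a source rank: "qty",
 * "numO", the send buffer and the communicator are assumptions of the
 * example; for the targets, "recv" is allocated inside the call.
 */
// void *recv = NULL;
// /* send holds this rank's block of the qty global elements */
// sync_communication(send, &recv, qty, MPI_INT, mall->numP, numO, 0, mall->intercomm);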
/*
@@ -121,9 +107,6 @@ int sync_communication(char *send, char **recv, int qty, int myId, int numP, int
* - send (IN): Array with the data to send. This value can not be NULL for parents.
* - recv (OUT): Array where data will be written. A NULL value is allowed if the process is not going to
* receive data. If the process receives data and is NULL, the behaviour is undefined.
* - is_intercomm (IN): Indicates wether the communicator is an intercommunicator (TRUE) or an
* intracommunicator (FALSE).
* - myId (IN): Rank of the MPI process in the local communicator. For the parents is not the rank obtained from "comm".
* - s_counts (IN): Struct describing how many elements this process will send to each child and
*                  the displacements.
* - r_counts (IN): Struct describing how many elements this process will receive from each parent
@@ -131,16 +114,20 @@ int sync_communication(char *send, char **recv, int qty, int myId, int numP, int
* - comm (IN): Communicator to use to perform the redistribution.
*
*/
void sync_point2point(char *send, char *recv, int is_intercomm, int myId, struct Counts s_counts, struct Counts r_counts, MPI_Comm comm) {
int i, j, init, end, total_sends;
void sync_point2point(void *send, void *recv, MPI_Datatype datatype, struct Counts s_counts, struct Counts r_counts, MPI_Comm comm) {
int i, j, init, end, total_sends, datasize;
size_t offset, offset2;
MPI_Request *sends;
MPI_Type_size(datatype, &datasize);
init = s_counts.idI;
end = s_counts.idE;
if(!is_intercomm && (s_counts.idI == myId || s_counts.idE == myId + 1)) {
perform_manual_communication(send, recv, myId, s_counts, r_counts);
if(s_counts.idI == myId) init = s_counts.idI+1;
if(mall_conf->spawn_method == MAM_SPAWN_MERGE && (s_counts.idI == mall->myId || s_counts.idE == mall->myId + 1)) {
offset = s_counts.displs[mall->myId] * datasize;
offset2 = r_counts.displs[mall->myId] * datasize;
memcpy(recv+offset2, send+offset, s_counts.counts[mall->myId] * datasize);
if(s_counts.idI == mall->myId) init = s_counts.idI+1;
else end = s_counts.idE-1;
}
@@ -151,23 +138,26 @@ void sync_point2point(char *send, char *recv, int is_intercomm, int myId, struct
}
for(i=init; i<end; i++) {
sends[j] = MPI_REQUEST_NULL;
MPI_Isend(send+s_counts.displs[i], s_counts.counts[i], MPI_CHAR, i, 99, comm, &(sends[j]));
offset = s_counts.displs[i] * datasize;
MPI_Isend(send+offset, s_counts.counts[i], datatype, i, 99, comm, &(sends[j]));
j++;
}
init = r_counts.idI;
end = r_counts.idE;
if(!is_intercomm) {
if(r_counts.idI == myId) init = r_counts.idI+1;
else if(r_counts.idE == myId + 1) end = r_counts.idE-1;
if(mall_conf->spawn_method == MAM_SPAWN_MERGE) {
if(r_counts.idI == mall->myId) init = r_counts.idI+1;
else if(r_counts.idE == mall->myId + 1) end = r_counts.idE-1;
}
for(i=init; i<end; i++) {
MPI_Recv(recv+r_counts.displs[i], r_counts.counts[i], MPI_CHAR, i, 99, comm, MPI_STATUS_IGNORE);
offset = r_counts.displs[i] * datasize;
MPI_Recv(recv+offset, r_counts.counts[i], datatype, i, 99, comm, MPI_STATUS_IGNORE);
}
if(total_sends > 0) {
MPI_Waitall(total_sends, sends, MPI_STATUSES_IGNORE);
free(sends);
}
}
@@ -182,22 +172,34 @@ void sync_point2point(char *send, char *recv, int is_intercomm, int myId, struct
* displacements.
* - tamBl (IN): How many elements are stored in the parameter "send".
* - comm (IN): Communicator to use to perform the redistribution. Must be an intracommunicator as MPI-RMA requirements.
* - red_method (IN): Type of data redistribution to use. In this case indicates the RMA operation(Lock or LockAll).
*
* FIXME: In libfabric one of these macros defines the maximum amount of BYTES that can be communicated in a SINGLE MPI_Get
* A window can have more bytes than the amount shown in those macros; therefore, if you want to read more than that amount
* you need to perform multiple Gets.
* prov/psm3/psm3/psm_config.h:179:#define MQ_SHM_THRESH_RNDV 16000
* prov/psm3/psm3/ptl_am/am_config.h:62:#define PSMI_MQ_RV_THRESH_CMA 16000
* prov/psm3/psm3/ptl_am/am_config.h:65:#define PSMI_MQ_RV_THRESH_NO_KASSIST 16000
*/
void sync_rma(char *send, char *recv, struct Counts r_counts, int tamBl, MPI_Comm comm, int red_method) {
void sync_rma(void *send, void *recv, MPI_Datatype datatype, struct Counts r_counts, int tamBl, MPI_Comm comm) {
int datasize;
MPI_Win win;
MPI_Win_create(send, (MPI_Aint)tamBl, sizeof(char), MPI_INFO_NULL, comm, &win);
switch(red_method) {
case MALL_RED_RMA_LOCKALL:
sync_rma_lockall(recv, r_counts, win);
MPI_Type_size(datatype, &datasize);
MPI_Win_create(send, (MPI_Aint)tamBl * datasize, datasize, MPI_INFO_NULL, comm, &win);
#if MAM_DEBUG >= 3
DEBUG_FUNC("Created Window for synchronous RMA communication", mall->myId, mall->numP); fflush(stdout); MPI_Barrier(comm);
#endif
switch(mall_conf->red_method) {
case MAM_RED_RMA_LOCKALL:
sync_rma_lockall(recv, datatype, r_counts, win);
break;
case MALL_RED_RMA_LOCK:
sync_rma_lock(recv, r_counts, win);
case MAM_RED_RMA_LOCK:
sync_rma_lock(recv, datatype, r_counts, win);
break;
}
#if MAM_DEBUG >= 3
DEBUG_FUNC("Completed synchronous RMA communication", mall->myId, mall->numP); fflush(stdout); MPI_Barrier(comm);
#endif
MPI_Win_free(&win);
}
@@ -212,19 +214,22 @@ void sync_rma(char *send, char *recv, struct Counts r_counts, int tamBl, MPI_Com
* - win (IN): Window to use to perform the redistribution.
*
*/
void sync_rma_lock(char *recv, struct Counts r_counts, MPI_Win win) {
int i, target_displs;
void sync_rma_lock(void *recv, MPI_Datatype datatype, struct Counts r_counts, MPI_Win win) {
int i, target_displs, datasize;
size_t offset;
MPI_Type_size(datatype, &datasize);
target_displs = r_counts.first_target_displs; //TODO Check that datasize is not needed
target_displs = r_counts.first_target_displs;
for(i=r_counts.idI; i<r_counts.idE; i++) {
offset = r_counts.displs[i] * datasize;
MPI_Win_lock(MPI_LOCK_SHARED, i, MPI_MODE_NOCHECK, win);
MPI_Get(recv+r_counts.displs[i], r_counts.counts[i], MPI_CHAR, i, target_displs, r_counts.counts[i], MPI_CHAR, win);
MPI_Get(recv+offset, r_counts.counts[i], datatype, i, target_displs, r_counts.counts[i], datatype, win);
MPI_Win_unlock(i, win);
target_displs=0;
}
}
/*
* Performs a passive MPI-RMA data redistribution for a single array using the passive epochs Lockall/Unlockall.
* - recv (OUT): Array where data will be written. A NULL value is allowed if the process is not going to receive data.
@@ -234,13 +239,17 @@ void sync_rma_lock(char *recv, struct Counts r_counts, MPI_Win win) {
* - win (IN): Window to use to perform the redistribution.
*
*/
void sync_rma_lockall(char *recv, struct Counts r_counts, MPI_Win win) {
int i, target_displs;
void sync_rma_lockall(void *recv, MPI_Datatype datatype, struct Counts r_counts, MPI_Win win) {
int i, target_displs, datasize;
size_t offset;
MPI_Type_size(datatype, &datasize);
target_displs = r_counts.first_target_displs; //TODO Check that datasize is not needed
target_displs = r_counts.first_target_displs;
MPI_Win_lock_all(MPI_MODE_NOCHECK, win);
for(i=r_counts.idI; i<r_counts.idE; i++) {
MPI_Get(recv+r_counts.displs[i], r_counts.counts[i], MPI_CHAR, i, target_displs, r_counts.counts[i], MPI_CHAR, win);
offset = r_counts.displs[i] * datasize;
MPI_Get(recv+offset, r_counts.counts[i], datatype, i, target_displs, r_counts.counts[i], datatype, win);
target_displs=0;
}
MPI_Win_unlock_all(win);
@@ -261,7 +270,6 @@ void sync_rma_lockall(char *recv, struct Counts r_counts, MPI_Win win) {
* - recv (OUT): Array where data will be written. A NULL value is allowed if the process is not going to receive data.
* If the process receives data and is NULL, the behaviour is undefined.
* - qty (IN): Sum of elements shared by all processes that will send data.
* - myId (IN): Rank of the MPI process in the local communicator. For the parents is not the rank obtained from "comm".
* - numP (IN): Size of the local group. If it is a children group, this parameter must correspond to using
* "MPI_Comm_size(comm)". For the parents is not always the size obtained from "comm".
* - numO (IN): Amount of processes in the remote group. For the parents is the target quantity of processes after the
@@ -273,106 +281,65 @@ void sync_rma_lockall(char *recv, struct Counts r_counts, MPI_Win win) {
* - request_qty (OUT): Quantity of requests to be used. If a process sends and receives data, this value will be
* modified to the expected value.
*
* returns: An integer indicating if the operation has been completed(TRUE) or not(FALSE). //FIXME In this case is always false...
*/
int async_communication_start(char *send, char **recv, int qty, int myId, int numP, int numO, int is_children_group, int red_method, int red_strategies, MPI_Comm comm, MPI_Request **requests, size_t *request_qty, MPI_Win *win) {
int is_intercomm, aux_comm_used = 0;
void async_communication_start(void *send, void **recv, int qty, MPI_Datatype datatype, int numP, int numO, int is_children_group, MPI_Comm comm, MPI_Request **requests, size_t *request_qty, MPI_Win *win) {
struct Counts s_counts, r_counts;
struct Dist_data dist_data;
MPI_Comm aux_comm = MPI_COMM_NULL;
/* PREPARE COMMUNICATION */
MPI_Comm_test_inter(comm, &is_intercomm);
prepare_redistribution(qty, myId, numP, numO, is_children_group, is_intercomm, recv, &s_counts, &r_counts);
check_requests(s_counts, r_counts, red_strategies, requests, request_qty);
prepare_redistribution(qty, datatype, numP, numO, is_children_group, recv, &s_counts, &r_counts);
check_requests(s_counts, r_counts, requests, request_qty); //FIXME Error related to second reconf if Merge Shrink + P2P -->Invalid requests
/* PERFORM COMMUNICATION */
switch(red_method) {
switch(mall_conf->red_method) {
case MALL_RED_RMA_LOCKALL:
case MALL_RED_RMA_LOCK:
case MAM_RED_RMA_LOCKALL:
case MAM_RED_RMA_LOCK:
if(is_children_group) {
dist_data.tamBl = 0;
} else {
get_block_dist(qty, myId, numO, &dist_data);
get_block_dist(qty, mall->myId, numO, &dist_data);
}
if(is_intercomm) {
MPI_Intercomm_merge(comm, is_children_group, &aux_comm);
aux_comm_used = 1;
} else { aux_comm = comm; }
async_rma(send, *recv, r_counts, dist_data.tamBl, aux_comm, red_method, *requests, win);
async_rma(send, *recv, datatype, r_counts, dist_data.tamBl, comm, *requests, win);
break;
case MALL_RED_POINT:
async_point2point(send, *recv, s_counts, r_counts, comm, *requests);
case MAM_RED_POINT:
async_point2point(send, *recv, datatype, s_counts, r_counts, comm, *requests);
break;
case MALL_RED_BASELINE:
case MAM_RED_BASELINE:
default:
MPI_Ialltoallv(send, s_counts.counts, s_counts.displs, MPI_CHAR, *recv, r_counts.counts, r_counts.displs, MPI_CHAR, comm, &((*requests)[0]));
MPI_Ialltoallv(send, s_counts.counts, s_counts.displs, datatype, *recv, r_counts.counts, r_counts.displs, datatype, comm, &((*requests)[0]));
break;
}
/* POST REQUESTS CHECKS */
if(malleability_red_contains_strat(red_strategies, MALL_RED_IBARRIER, NULL)) {
if(!is_children_group && (is_intercomm || myId >= numO)) {
MPI_Ibarrier(comm, &((*requests)[*request_qty-1]) ); //FIXME Not easy to read...
}
}
if(aux_comm_used) {
MPI_Comm_free(&aux_comm);
}
freeCounts(&s_counts);
freeCounts(&r_counts);
return 0; //FIXME In this case is always false...
}
/*
* Checks if a set of requests have been completed (1) or not (0).
*
* - myId (IN): Rank of the MPI process in the local communicator. For the parents is not the rank obtained from "comm".
* - is_children_group (IN): Indicates whether this MPI rank is a child (TRUE) or a parent (FALSE).
* - red_strategies (IN):
* - requests (IN): Pointer to array of requests to be used to determine if the communication has ended.
* - request_qty (IN): Quantity of requests in "requests".
*
* returns: An integer indicating if the operation has been completed(TRUE) or not(FALSE).
*/
int async_communication_check(int myId, int is_children_group, int red_strategies, MPI_Comm comm, MPI_Request *requests, size_t request_qty) {
int completed, req_completed, all_req_null, test_err, aux_condition;
int async_communication_check(int is_children_group, MPI_Request *requests, size_t request_qty) {
int completed, req_completed, test_err;
size_t i;
completed = 1;
all_req_null = 1;
test_err = MPI_SUCCESS;
if (is_children_group) return 1;
if(malleability_red_contains_strat(red_strategies, MALL_RED_IBARRIER, NULL)) {
// The Ibarrier should only be posted at this point if the process
// has other requests which has not confirmed as completed yet,
// but are confirmed now.
if (requests[request_qty-1] == MPI_REQUEST_NULL) {
for(i=0; i<request_qty; i++) {
aux_condition = requests[i] == MPI_REQUEST_NULL;
all_req_null = all_req_null && aux_condition;
test_err = MPI_Test(&(requests[i]), &req_completed, MPI_STATUS_IGNORE);
completed = completed && req_completed;
}
if(completed && !all_req_null) { MPI_Ibarrier(comm, &(requests[request_qty-1])); }
}
test_err = MPI_Test(&(requests[request_qty-1]), &completed, MPI_STATUS_IGNORE);
if (is_children_group) return 1; //FIXME It should return a negative number
} else {
for(i=0; i<request_qty; i++) {
test_err = MPI_Test(&(requests[i]), &req_completed, MPI_STATUS_IGNORE);
completed = completed && req_completed;
}
// test_err = MPI_Testall(request_qty, requests, &completed, MPI_STATUSES_IGNORE); //FIXME Some kind of bug with Mpich.
for(i=0; i<request_qty; i++) {
test_err = MPI_Test(&(requests[i]), &req_completed, MPI_STATUS_IGNORE);
completed = completed && req_completed;
}
//test_err = MPI_Testall(request_qty, requests, &completed, MPI_STATUSES_IGNORE); //FIXME Some kind of bug with Mpich.
if (test_err != MPI_SUCCESS && test_err != MPI_ERR_PENDING) {
printf("P%d aborting -- Test Async\n", myId);
printf("P%d aborting -- Test Async\n", mall->myId);
MPI_Abort(MPI_COMM_WORLD, test_err);
}
@@ -384,36 +351,33 @@ int async_communication_check(int myId, int is_children_group, int red_strategie
* Waits until the completion of a set of requests. If the Ibarrier strategy
* is being used, the corresponding ibarrier is posted.
*
* - red_strategies (IN):
* - comm (IN): Communicator to use to confirm finalizations of redistribution
* - requests (IN): Pointer to array of requests to be used to determine if the communication has ended.
* - request_qty (IN): Quantity of requests in "requests".
*/
void async_communication_wait(int red_strategies, MPI_Comm comm, MPI_Request *requests, size_t request_qty) {
void async_communication_wait(MPI_Request *requests, size_t request_qty) {
MPI_Waitall(request_qty, requests, MPI_STATUSES_IGNORE);
if(malleability_red_contains_strat(red_strategies, MALL_RED_IBARRIER, NULL)) {
MPI_Ibarrier(comm, &(requests[request_qty-1]) );
MPI_Wait(&(requests[request_qty-1]), MPI_STATUS_IGNORE); //TODO Is it really needed? It will be ensured later
}
#if MAM_DEBUG >= 3
DEBUG_FUNC("Processes Waitall completed", mall->myId, mall->numP); fflush(stdout); MPI_Barrier(MPI_COMM_WORLD);
#endif
}
/*
* Frees Requests/Windows associated to a particular redistribution.
* Should be called for each output result of calling "async_communication_start".
*
* - red_method (IN):
* - red_strategies (IN):
* - requests (IN): Pointer to array of requests to be used to determine if the communication has ended.
* - request_qty (IN): Quantity of requests in "requests".
* - win (IN): Window to free.
*/
void async_communication_end(int red_method, int red_strategies, MPI_Request *requests, size_t request_qty, MPI_Win *win) {
void async_communication_end(MPI_Request *requests, size_t request_qty, MPI_Win *win) {
//To disconnect both process groups it is necessary to tell MPI that this communication
//has finished, even though this point can only be reached once it has already finished
if(malleability_red_contains_strat(red_strategies, MALL_RED_IBARRIER, NULL)) { MPI_Waitall(request_qty, requests, MPI_STATUSES_IGNORE); }
if(MAM_Contains_strat(MAM_RED_STRATEGIES, MAM_STRAT_RED_WAIT_TARGETS, NULL)) { MPI_Waitall(request_qty, requests, MPI_STATUSES_IGNORE); }
if(red_method == MALL_RED_RMA_LOCKALL || red_method == MALL_RED_RMA_LOCK) { MPI_Win_free(win); }
if((mall_conf->red_method == MAM_RED_RMA_LOCKALL || mall_conf->red_method == MAM_RED_RMA_LOCK)
&& *win != MPI_WIN_NULL) { MPI_Win_free(win); }
}
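/*
 * Hedged sketch of the full asynchronous lifecycle defined in this file
 * (start -> check -> wait -> end), on a source rank; the surrounding
 * variables are assumptions of the example.
 */
// MPI_Request *reqs = NULL; size_t req_qty = 0;
// MPI_Win win = MPI_WIN_NULL;
// void *recv = NULL;
// async_communication_start(send, &recv, qty, MPI_INT, numP, numO, 0, comm, &reqs, &req_qty, &win);
// while(!async_communication_check(0, reqs, req_qty)) { /* overlap user computation */ }
// async_communication_wait(reqs, req_qty);
// async_communication_end(reqs, req_qty, &win);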
/*
@@ -431,16 +395,20 @@ void async_communication_end(int red_method, int red_strategies, MPI_Request *re
* - requests (OUT): Pointer to array of requests to be used to determine if the communication has ended.
*
*/
void async_point2point(char *send, char *recv, struct Counts s_counts, struct Counts r_counts, MPI_Comm comm, MPI_Request *requests) {
int i, j = 0;
void async_point2point(void *send, void *recv, MPI_Datatype datatype, struct Counts s_counts, struct Counts r_counts, MPI_Comm comm, MPI_Request *requests) {
int i, j = 0, datasize;
size_t offset;
MPI_Type_size(datatype, &datasize);
for(i=s_counts.idI; i<s_counts.idE; i++) {
MPI_Isend(send+s_counts.displs[i], s_counts.counts[i], MPI_CHAR, i, 99, comm, &(requests[j]));
offset = s_counts.displs[i] * datasize;
MPI_Isend(send+offset, s_counts.counts[i], datatype, i, 99, comm, &(requests[j]));
j++;
}
for(i=r_counts.idI; i<r_counts.idE; i++) {
MPI_Irecv(recv+r_counts.displs[i], r_counts.counts[i], MPI_CHAR, i, 99, comm, &(requests[j]));
offset = r_counts.displs[i] * datasize;
MPI_Irecv(recv+offset, r_counts.counts[i], datatype, i, 99, comm, &(requests[j]));
j++;
}
}
@@ -456,20 +424,21 @@ void async_point2point(char *send, char *recv, struct Counts s_counts, struct Co
* displacements.
* - tamBl (IN): How many elements are stored in the parameter "send".
* - comm (IN): Communicator to use to perform the redistribution. Must be an intracommunicator as MPI-RMA requirements.
* - red_method (IN): Type of data redistribution to use. In this case indicates the RMA operation(Lock or LockAll).
* - window (OUT): Pointer to a window object used for the RMA operations.
* - requests (OUT): Pointer to array of requests to be used to determine if the communication has ended.
*
*/
void async_rma(char *send, char *recv, struct Counts r_counts, int tamBl, MPI_Comm comm, int red_method, MPI_Request *requests, MPI_Win *win) {
MPI_Win_create(send, (MPI_Aint)tamBl, sizeof(char), MPI_INFO_NULL, comm, win);
switch(red_method) {
case MALL_RED_RMA_LOCKALL:
async_rma_lockall(recv, r_counts, *win, requests);
void async_rma(void *send, void *recv, MPI_Datatype datatype, struct Counts r_counts, int tamBl, MPI_Comm comm, MPI_Request *requests, MPI_Win *win) {
int datasize;
MPI_Type_size(datatype, &datasize);
MPI_Win_create(send, (MPI_Aint)tamBl * datasize, datasize, MPI_INFO_NULL, comm, win);
switch(mall_conf->red_method) {
case MAM_RED_RMA_LOCKALL:
async_rma_lockall(recv, datatype, r_counts, *win, requests);
break;
case MALL_RED_RMA_LOCK:
async_rma_lock(recv, r_counts, *win, requests);
case MAM_RED_RMA_LOCK:
async_rma_lock(recv, datatype, r_counts, *win, requests);
break;
}
}
@@ -484,13 +453,17 @@ void async_rma(char *send, char *recv, struct Counts r_counts, int tamBl, MPI_Co
* - requests (OUT): Pointer to array of requests to be used to determine if the communication has ended.
*
*/
void async_rma_lock(char *recv, struct Counts r_counts, MPI_Win win, MPI_Request *requests) {
int i, target_displs, j = 0;
void async_rma_lock(void *recv, MPI_Datatype datatype, struct Counts r_counts, MPI_Win win, MPI_Request *requests) {
int i, target_displs, j = 0, datasize;
size_t offset;
MPI_Type_size(datatype, &datasize);
target_displs = r_counts.first_target_displs; //TODO Check that datasize is not needed
target_displs = r_counts.first_target_displs;
for(i=r_counts.idI; i<r_counts.idE; i++) {
offset = r_counts.displs[i] * datasize;
MPI_Win_lock(MPI_LOCK_SHARED, i, MPI_MODE_NOCHECK, win);
MPI_Rget(recv+r_counts.displs[i], r_counts.counts[i], MPI_CHAR, i, target_displs, r_counts.counts[i], MPI_CHAR, win, &(requests[j]));
MPI_Rget(recv+offset, r_counts.counts[i], datatype, i, target_displs, r_counts.counts[i], datatype, win, &(requests[j]));
MPI_Win_unlock(i, win);
target_displs=0;
j++;
@@ -506,13 +479,17 @@ void async_rma_lock(char *recv, struct Counts r_counts, MPI_Win win, MPI_Request
* - requests (OUT): Pointer to array of requests to be used to determine if the communication has ended.
*
*/
void async_rma_lockall(char *recv, struct Counts r_counts, MPI_Win win, MPI_Request *requests) {
int i, target_displs, j = 0;
void async_rma_lockall(void *recv, MPI_Datatype datatype, struct Counts r_counts, MPI_Win win, MPI_Request *requests) {
int i, target_displs, j = 0, datasize;
size_t offset;
MPI_Type_size(datatype, &datasize);
target_displs = r_counts.first_target_displs; //TODO Check that datasize is not needed
target_displs = r_counts.first_target_displs;
MPI_Win_lock_all(MPI_MODE_NOCHECK, win);
for(i=r_counts.idI; i<r_counts.idE; i++) {
MPI_Rget(recv+r_counts.displs[i], r_counts.counts[i], MPI_CHAR, i, target_displs, r_counts.counts[i], MPI_CHAR, win, &(requests[j]));
offset = r_counts.displs[i] * datasize;
MPI_Rget(recv+offset, r_counts.counts[i], datatype, i, target_displs, r_counts.counts[i], datatype, win, &(requests[j]));
target_displs=0;
j++;
}
@@ -532,52 +509,64 @@ void async_rma_lockall(char *recv, struct Counts r_counts, MPI_Win win, MPI_Requ
* how many elements this process sends to or receives from other processes in the new group.
*
* - qty (IN): Sum of elements shared by all processes that will send data.
* - myId (IN): Rank of the MPI process in the local communicator. For the parents is not the rank obtained from "comm".
* - numP (IN): Size of the local group. If it is a children group, this parameter must correspond to using
* "MPI_Comm_size(comm)". For the parents is not always the size obtained from "comm".
* - numO (IN): Amount of processes in the remote group. For the parents is the target quantity of processes after the
* resize, while for the children is the amount of parents.
* - is_children_group (IN): Indicates whether this MPI rank is a child (TRUE) or a parent (FALSE).
* - is_intercomm (IN): Indicates whether the communicator used is an intercommunicator (TRUE) or an intracommunicator (FALSE).
* - recv (OUT): Array where data will be written. A NULL value is allowed if the process is not going to receive data.
* If the process receives data and it is NULL, the behaviour is undefined.
* - s_counts (OUT): Struct indicating how many elements this process sends to processes in the new group.
* - r_counts (OUT): Struct indicating how many elements this process receives from other processes in the previous group.
*
*/
void prepare_redistribution(int qty, int myId, int numP, int numO, int is_children_group, int is_intercomm, char **recv, struct Counts *s_counts, struct Counts *r_counts) {
void prepare_redistribution(int qty, MPI_Datatype datatype, int numP, int numO, int is_children_group, void **recv, struct Counts *s_counts, struct Counts *r_counts) {
int array_size = numO;
int offset_ids = 0;
int datasize;
struct Dist_data dist_data;
if(is_intercomm) {
//offset_ids = !is_children_group ? numP : 0; //FIXME Modify only if active?
if(mall_conf->spawn_method == MAM_SPAWN_BASELINE) {
offset_ids = MAM_Contains_strat(MAM_SPAWN_STRATEGIES, MAM_STRAT_SPAWN_INTERCOMM, NULL) ?
0 : numP;
} else {
array_size = numP > numO ? numP : numO;
}
mallocCounts(s_counts, array_size+offset_ids);
mallocCounts(r_counts, array_size+offset_ids);
MPI_Type_size(datatype, &datasize); //FIXME Right now derived datatypes are not ensured to work
if(is_children_group) {
prepare_comm_alltoall(myId, numP, numO, qty, offset_ids, r_counts);
offset_ids = 0;
prepare_comm_alltoall(mall->myId, numP, numO, qty, offset_ids, r_counts);
// Get the distribution for this child
get_block_dist(qty, myId, numP, &dist_data);
*recv = malloc(dist_data.tamBl * sizeof(char));
//get_block_dist(qty, myId, numP, &dist_data);
//print_counts(dist_data, r_counts->counts, r_counts->displs, numO+offset_ids, 0, "Children C ");
get_block_dist(qty, mall->myId, numP, &dist_data);
*recv = malloc(dist_data.tamBl * datasize);
#if MAM_DEBUG >= 4
get_block_dist(qty, mall->myId, numP, &dist_data);
print_counts(dist_data, r_counts->counts, r_counts->displs, numO+offset_ids, 0, "Targets Recv");
#endif
} else {
//get_block_dist(qty, myId, numP, &dist_data);
prepare_comm_alltoall(myId, numP, numO, qty, offset_ids, s_counts);
if(!is_intercomm && myId < numO) {
prepare_comm_alltoall(myId, numO, numP, qty, offset_ids, r_counts);
// Get the distribution for this child and allocate the receive buffer
get_block_dist(qty, myId, numO, &dist_data);
*recv = malloc(dist_data.tamBl * sizeof(char));
//print_counts(dist_data, r_counts->counts, r_counts->displs, array_size, 0, "Children P ");
#if MAM_DEBUG >= 4
get_block_dist(qty, mall->myId, numP, &dist_data);
#endif
prepare_comm_alltoall(mall->myId, numP, numO, qty, offset_ids, s_counts);
if(mall_conf->spawn_method == MAM_SPAWN_MERGE && mall->myId < numO) {
prepare_comm_alltoall(mall->myId, numO, numP, qty, offset_ids, r_counts);
// Get the distribution for this child and allocate the receive buffer
get_block_dist(qty, mall->myId, numO, &dist_data);
*recv = malloc(dist_data.tamBl * datasize);
#if MAM_DEBUG >= 4
print_counts(dist_data, r_counts->counts, r_counts->displs, array_size, 0, "Sources&Targets Recv");
#endif
}
//print_counts(dist_data, s_counts->counts, s_counts->displs, numO+offset_ids, 0, "Parents ");
#if MAM_DEBUG >= 4
print_counts(dist_data, s_counts->counts, s_counts->displs, numO+offset_ids, 0, "Sources Send");
#endif
}
}
@@ -593,14 +582,19 @@ void prepare_redistribution(int qty, int myId, int numP, int numO, int is_childr
* - request_qty (IN/OUT): Quantity of requests to be used. If the value is smaller than the amount of communication
* functions to perform, it is modified to the minimum value.
*/
void check_requests(struct Counts s_counts, struct Counts r_counts, int red_strategies, MPI_Request **requests, size_t *request_qty) {
void check_requests(struct Counts s_counts, struct Counts r_counts, MPI_Request **requests, size_t *request_qty) {
size_t i, sum;
MPI_Request *aux;
sum = (size_t) s_counts.idE - s_counts.idI;
sum += (size_t) r_counts.idE - r_counts.idI;
if(malleability_red_contains_strat(red_strategies, MALL_RED_IBARRIER, NULL)) {
sum++;
switch(mall_conf->red_method) {
case MAM_RED_BASELINE:
sum = 1;
break;
case MAM_RED_POINT:
default:
sum = (size_t) s_counts.idE - s_counts.idI;
sum += (size_t) r_counts.idE - r_counts.idI;
break;
}
if (*requests != NULL && sum <= *request_qty) return; // Expected amount of requests
@@ -622,47 +616,3 @@
}
*request_qty = sum;
}
/*
* Special case to perform a manual copy of data when a process has to send data to itself. Only used
* when the MPI communication is not able to handle this situation. An example is when using point to point
* communications and the process has to perform a Send and Recv to itself
* - send (IN): Array with the data to send. This value can not be NULL.
* - recv (OUT): Array where data will be written. This value can not be NULL.
* - myId (IN): Rank of the MPI process in the local communicator. For the parents is not the rank obtained from "comm".
* - s_counts (IN): Struct indicating how many elements this process sends to processes in the new group.
* - r_counts (IN): Struct indicating how many elements this process receives from other processes in the previous group.
*/
void perform_manual_communication(char *send, char *recv, int myId, struct Counts s_counts, struct Counts r_counts) {
int i;
for(i=0; i<s_counts.counts[myId];i++) {
recv[i+r_counts.displs[myId]] = send[i+s_counts.displs[myId]];
}
}
/*
* Function to find out whether the strategy passed as the second
* argument is used among the chosen strategies.
*
* Returns in "result" 1 (True) if the strategy is used, 0 (False)
* otherwise.
*/
int malleability_red_contains_strat(int comm_strategies, int strategy, int *result) {
int value = comm_strategies % strategy ? 0 : 1;
if(result != NULL) *result = value;
return value;
}
/*
* Function to add a strategy to a set.
*
* Returns in "result" 1 (True) if it has been added, 0 (False)
* otherwise.
*/
int malleability_red_add_strat(int *comm_strategies, int strategy) {
if(malleability_red_contains_strat(*comm_strategies, strategy, NULL)) return 1;
*comm_strategies = *comm_strategies * strategy;
return 1;
}
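/*
 * The strategy set is encoded multiplicatively: assuming each strategy
 * constant is a distinct prime, a set is the product of its members and
 * membership reduces to a divisibility test. Worked example (the literal
 * primes are an assumption of this sketch):
 */
// int set = 1;
// malleability_red_add_strat(&set, 2);           /* set == 2         */
// malleability_red_add_strat(&set, 3);           /* set == 6         */
// malleability_red_contains_strat(set, 3, NULL); /* 6 % 3 == 0 -> 1  */
// malleability_red_contains_strat(set, 5, NULL); /* 6 % 5 != 0 -> 0  */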
#ifndef MAM_DISTRIBUTED_COMMDIST_H
#define MAM_DISTRIBUTED_COMMDIST_H
#include <mpi.h>
void sync_communication(void *send, void **recv, int qty, MPI_Datatype datatype, int numP, int numO, int is_children_group, MPI_Comm comm);
void async_communication_start(void *send, void **recv, int qty, MPI_Datatype datatype, int numP, int numO, int is_children_group, MPI_Comm comm, MPI_Request **requests, size_t *request_qty, MPI_Win *win);
int async_communication_check(int is_children_group, MPI_Request *requests, size_t request_qty);
void async_communication_wait(MPI_Request *requests, size_t request_qty);
void async_communication_end(MPI_Request *requests, size_t request_qty, MPI_Win *win);
void malloc_comm_array(char **array, int qty, int myId, int numP);
#endif
#ifndef mall_block_distribution
#define mall_block_distribution
#ifndef MAM_BLOCK_DISTRIBUTION_H
#define MAM_BLOCK_DISTRIBUTION_H
#include <stdio.h>
#include <stdlib.h>
......
#ifndef MALLEABILITY_DATA_STRUCTURES_H
#define MALLEABILITY_DATA_STRUCTURES_H
/*
* Shows the available data structures for internal usage.
*/
#include <mpi.h>
/* --- SPAWN STRUCTURES --- */
struct physical_dist {
int num_cpus, num_nodes;
char *nodelist;
int target_qty, already_created;
int dist_type, info_type;
};
typedef struct {
int myId, root, root_parents;
int spawn_qty, initial_qty, target_qty;
int already_created;
int spawn_method, spawn_is_single, spawn_is_async;
char *cmd; //Executable name
MPI_Info mapping;
MPI_Datatype dtype;
struct physical_dist dist; // Used to create mapping var
MPI_Comm comm, returned_comm;
} Spawn_data;
#endif
#include <pthread.h>
#include <string.h>
#include "malleabilityManager.h"
#include "malleabilityStates.h"
#include "malleabilityDataStructures.h"
#include "malleabilityTypes.h"
#include "malleabilityZombies.h"
#include "spawn_methods/GenericSpawn.h"
#include "CommDist.h"
#define MALLEABILITY_USE_SYNCHRONOUS 0
#define MALLEABILITY_USE_ASYNCHRONOUS 1
void send_data(int numP_children, malleability_data_t *data_struct, int is_asynchronous);
void recv_data(int numP_parents, malleability_data_t *data_struct, int is_asynchronous);
void Children_init();
int spawn_step();
int start_redistribution();
int check_redistribution();
int end_redistribution();
int shrink_redistribution();
void comm_node_data(int rootBcast, int is_child_group);
void def_nodeinfo_type(MPI_Datatype *node_type);
int thread_creation();
int thread_check();
void* thread_async_work();
void print_comms_state();
void malleability_comms_update(MPI_Comm comm);
typedef struct {
int spawn_method;
int spawn_dist;
int spawn_strategies;
int red_method;
int red_strategies;
int grp;
configuration *config_file;
results_data *results;
} malleability_config_t;
typedef struct { //FIXME numC_spawned is not being used
int myId, numP, numC, numC_spawned, root, root_parents;
pthread_t async_thread;
MPI_Comm comm, thread_comm;
MPI_Comm intercomm;
MPI_Comm user_comm;
int dup_user_comm;
char *name_exec, *nodelist;
int num_cpus, num_nodes, nodelist_len;
} malleability_t;
int state = MALL_UNRESERVED; //FIXME Move this elsewhere
malleability_config_t *mall_conf;
malleability_t *mall;
malleability_data_t *rep_s_data;
malleability_data_t *dist_s_data;
malleability_data_t *rep_a_data;
malleability_data_t *dist_a_data;
/*
* Initializes the memory reservation for the malleability module,
* creating all the needed structures and communicator copies
* so as not to interfere with the application.
*
* If it is called by a group of dynamically created processes, they
* initialize the communication with their parents. In that case, once the
* communication ends the child processes are ready to run the
* application.
*/
int init_malleability(int myId, int numP, int root, MPI_Comm comm, char *name_exec, char *nodelist, int num_cpus, int num_nodes) {
MPI_Comm dup_comm, thread_comm;
mall_conf = (malleability_config_t *) malloc(sizeof(malleability_config_t));
mall = (malleability_t *) malloc(sizeof(malleability_t));
rep_s_data = (malleability_data_t *) malloc(sizeof(malleability_data_t));
dist_s_data = (malleability_data_t *) malloc(sizeof(malleability_data_t));
rep_a_data = (malleability_data_t *) malloc(sizeof(malleability_data_t));
dist_a_data = (malleability_data_t *) malloc(sizeof(malleability_data_t));
mall->dup_user_comm = 0;
MPI_Comm_dup(comm, &dup_comm);
MPI_Comm_dup(comm, &thread_comm);
MPI_Comm_set_name(dup_comm, "MPI_COMM_MALL");
MPI_Comm_set_name(thread_comm, "MPI_COMM_MALL_THREAD");
mall->myId = myId;
mall->numP = numP;
mall->root = root;
mall->comm = dup_comm;
mall->thread_comm = thread_comm;
mall->user_comm = comm;
mall->name_exec = name_exec;
mall->nodelist = nodelist;
mall->num_cpus = num_cpus;
mall->num_nodes = num_nodes;
rep_s_data->entries = 0;
rep_a_data->entries = 0;
dist_s_data->entries = 0;
dist_a_data->entries = 0;
state = MALL_NOT_STARTED;
zombies_service_init();
// If the processes were created dynamically, they obtain the data from their parents
MPI_Comm_get_parent(&(mall->intercomm));
if(mall->intercomm != MPI_COMM_NULL ) {
Children_init();
return MALLEABILITY_CHILDREN;
}
if(nodelist != NULL) { //TODO To be deprecated in favour of Slurm or the else branch below
mall->nodelist_len = strlen(nodelist);
} else { // If no nodelist is detected, get it from the actual run
mall->nodelist = malloc(MPI_MAX_PROCESSOR_NAME * sizeof(char));
MPI_Get_processor_name(mall->nodelist, &mall->nodelist_len);
//TODO Get name of each process and create real nodelist
}
return MALLEABILITY_NOT_CHILDREN;
}
/*
* Frees all the memory reserved by the malleability
* module and ensures that any zombies, if present,
* are awakened.
*/
void free_malleability() {
free_malleability_data_struct(rep_s_data);
free_malleability_data_struct(rep_a_data);
free_malleability_data_struct(dist_s_data);
free_malleability_data_struct(dist_a_data);
free(rep_s_data);
free(rep_a_data);
free(dist_s_data);
free(dist_a_data);
if(mall->comm != MPI_COMM_WORLD) MPI_Comm_free(&(mall->comm));
if(mall->thread_comm != MPI_COMM_WORLD) MPI_Comm_free(&(mall->thread_comm));
free(mall);
free(mall_conf);
zombies_awake();
zombies_service_free();
state = MALL_UNRESERVED;
}
/*
* TODO Rewrite
* The process resizing is performed by the parents.
*
* The new processes are created with the chosen physical distribution and
* the information is then transmitted to them.
*
* If there is asynchronous data to transmit, its transmission is started
* first and the function returns. It has to be verified, by calling the
* function again, that the sending has completed.
*
* If there is also synchronous data to send, it is not sent yet.
*
* If there is only synchronous data, it is sent after the creation of the
* processes and finally the two process groups disconnect.
*/
int malleability_checkpoint() {
double end_real_time;
switch(state) {
case MALL_UNRESERVED:
break;
case MALL_NOT_STARTED:
// Check whether a resize has to be performed
mall_conf->results->malleability_time[mall_conf->grp] = MPI_Wtime();
//if(CHECK_RMS()) {return MALL_DENIED;}
state = spawn_step();
if (state == MALL_SPAWN_COMPLETED || state == MALL_SPAWN_ADAPT_POSTPONE){
malleability_checkpoint();
}
break;
case MALL_SPAWN_PENDING: // Checks whether the spawn has finished and starts the redistribution
case MALL_SPAWN_SINGLE_PENDING:
state = check_spawn_state(&(mall->intercomm), mall->comm, &end_real_time);
if (state == MALL_SPAWN_COMPLETED || state == MALL_SPAWN_ADAPTED) {
mall_conf->results->spawn_time[mall_conf->grp] = MPI_Wtime() - mall_conf->results->spawn_start;
mall_conf->results->spawn_real_time[mall_conf->grp] = end_real_time - mall_conf->results->spawn_start;
malleability_checkpoint();
}
break;
case MALL_SPAWN_ADAPT_POSTPONE:
case MALL_SPAWN_COMPLETED:
state = start_redistribution();
malleability_checkpoint();
break;
case MALL_DIST_PENDING:
if(malleability_red_contains_strat(mall_conf->red_strategies, MALL_RED_THREAD, NULL)) {
state = thread_check();
} else {
state = check_redistribution();
}
if(state != MALL_DIST_PENDING) {
malleability_checkpoint();
}
break;
case MALL_SPAWN_ADAPT_PENDING:
mall_conf->results->spawn_start = MPI_Wtime();
unset_spawn_postpone_flag(state);
state = check_spawn_state(&(mall->intercomm), mall->comm, &end_real_time);
if(!malleability_spawn_contains_strat(mall_conf->spawn_strategies, MALL_SPAWN_PTHREAD, NULL)) {
mall_conf->results->spawn_time[mall_conf->grp] = MPI_Wtime() - mall_conf->results->spawn_start;
malleability_checkpoint();
}
break;
case MALL_SPAWN_ADAPTED:
state = shrink_redistribution();
malleability_checkpoint();
break;
case MALL_DIST_COMPLETED: //TODO Isn't this rather ugly?
mall_conf->results->malleability_end = MPI_Wtime();
state = MALL_COMPLETED;
break;
}
return state;
}
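/*
 * Hedged sketch of how a caller is expected to drive this state machine:
 * keep invoking the checkpoint between iterations of its own computation
 * until the reconfiguration reports completion. The loop body is an
 * assumption of the example.
 */
// int mall_state = malleability_checkpoint();
// while(mall_state != MALL_COMPLETED && mall_state != MALL_DENIED) {
//   /* ... one iteration of user computation ... */
//   mall_state = malleability_checkpoint();
// }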
// Functions only needed by the benchmark
//-------------------------------------------------------------------------------------------------------------
void set_benchmark_grp(int grp) {
mall_conf->grp = grp;
}
void set_benchmark_configuration(configuration *config_file) {
mall_conf->config_file = config_file;
}
void get_benchmark_configuration(configuration **config_file) {
*config_file = mall_conf->config_file;
}
void set_benchmark_results(results_data *results) {
mall_conf->results = results;
}
void get_benchmark_results(results_data **results) {
*results = mall_conf->results;
}
//-------------------------------------------------------------------------------------------------------------
void set_malleability_configuration(int spawn_method, int spawn_strategies, int spawn_dist, int red_method, int red_strategies) {
mall_conf->spawn_method = spawn_method;
mall_conf->spawn_strategies = spawn_strategies;
mall_conf->spawn_dist = spawn_dist;
mall_conf->red_method = red_method;
mall_conf->red_strategies = red_strategies;
if(!malleability_red_contains_strat(mall_conf->red_strategies, MALL_RED_IBARRIER, NULL) &&
(mall_conf->red_method == MALL_RED_RMA_LOCK || mall_conf->red_method == MALL_RED_RMA_LOCKALL)) {
malleability_red_add_strat(&(mall_conf->red_strategies), MALL_RED_IBARRIER);
}
}
/*
* To be deprecated
* It has to be called after setting the configuration
*/
void set_children_number(int numC){
if((mall_conf->spawn_method == MALL_SPAWN_MERGE) && (numC >= mall->numP)) {
mall->numC = numC;
mall->numC_spawned = numC - mall->numP;
if(numC == mall->numP) { // Migrate
mall->numC_spawned = numC;
mall_conf->spawn_method = MALL_SPAWN_BASELINE;
}
} else {
mall->numC = numC;
mall->numC_spawned = numC;
}
}
/*
* TODO
*/
void get_malleability_user_comm(MPI_Comm *comm) {
if(mall->dup_user_comm) {
if(mall->user_comm != MPI_COMM_WORLD) MPI_Comm_free(&(mall->user_comm));
MPI_Comm_dup(mall->comm, &(mall->user_comm));
MPI_Comm_set_name(mall->user_comm, "MPI_COMM_MALL_USER");
mall->dup_user_comm = 0;
}
*comm = mall->user_comm;
}
/*
* Adds to the chosen concrete data structure the new data set "data",
* with a total of "total_qty" elements.
*
* Variable data has to be added right before it is to be sent, not earlier.
*
* More information in the "add_data" function.
*
* //FIXME If it is constant it should go to asynchronous, not synchronous
*/
void malleability_add_data(void *data, size_t total_qty, int type, int is_replicated, int is_constant) {
size_t total_reqs = 0;
if(is_constant) {
if(is_replicated) {
add_data(data, total_qty, type, total_reqs, rep_s_data);
} else {
add_data(data, total_qty, type, total_reqs, dist_s_data);
}
} else {
if(is_replicated) {
add_data(data, total_qty, type, total_reqs, rep_a_data); //FIXME total_reqs==0 ???
} else {
if(mall_conf->red_method == MALL_RED_BASELINE) {
total_reqs = 1;
} else if(mall_conf->red_method == MALL_RED_POINT || mall_conf->red_method == MALL_RED_RMA_LOCK || mall_conf->red_method == MALL_RED_RMA_LOCKALL) {
total_reqs = mall->numC;
}
if(malleability_red_contains_strat(mall_conf->red_strategies, MALL_RED_IBARRIER, NULL)) {
total_reqs++;
}
add_data(data, total_qty, type, total_reqs, dist_a_data);
}
}
}
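/*
 * Example (illustrative): registering a distributed constant array of
 * "n" integers and a replicated variable buffer of "m" chars.
 *
 *   malleability_add_data(vector, n, MAL_INT, 0, 1);  // Distributed, constant
 *   malleability_add_data(buffer, m, MAL_CHAR, 1, 0); // Replicated, variable
 */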
/*
 * Modifies, at index "index" of the chosen data description structure,
 * the data set "data" with a total of "total_qty" elements.
 *
 * Variable data must be modified right when it is meant to be sent, not earlier.
 *
 * More information in the function "modify_data".
 * //FIXME If it is constant it should go to asynchronous, not synchronous
 */
void malleability_modify_data(void *data, size_t index, size_t total_qty, int type, int is_replicated, int is_constant) {
size_t total_reqs = 0;
if(is_constant) {
if(is_replicated) {
modify_data(data, index, total_qty, type, total_reqs, rep_s_data);
} else {
modify_data(data, index, total_qty, type, total_reqs, dist_s_data);
}
} else {
if(is_replicated) {
modify_data(data, index, total_qty, type, total_reqs, rep_a_data); //FIXME total_reqs==0 ???
} else {
if(mall_conf->red_method == MALL_RED_BASELINE) {
total_reqs = 1;
} else if(mall_conf->red_method == MALL_RED_POINT || mall_conf->red_method == MALL_RED_RMA_LOCK || mall_conf->red_method == MALL_RED_RMA_LOCKALL) {
total_reqs = mall->numC;
}
if(malleability_red_contains_strat(mall_conf->red_strategies, MALL_RED_IBARRIER, NULL)) {
total_reqs++;
}
modify_data(data, index, total_qty, type, total_reqs, dist_a_data);
}
}
}
/*
 * Returns the number of entries of the chosen data
 * description structure.
 * //FIXME If it is constant it should go to asynchronous, not synchronous
 */
void malleability_get_entries(size_t *entries, int is_replicated, int is_constant){
if(is_constant) {
if(is_replicated) {
*entries = rep_s_data->entries;
} else {
*entries = dist_s_data->entries;
}
} else {
if(is_replicated) {
*entries = rep_a_data->entries;
} else {
*entries = dist_a_data->entries;
}
}
}
/*
 * Returns the element at position "index" of the list to the user.
 * Elements are returned in the same order in which the parents
 * inserted them with the function "malleability_add_data()".
 * It is the user's responsibility to know the type of those data.
 * TODO Refactor so this is automatic
 * //FIXME If it is constant it should go to asynchronous, not synchronous
 */
void malleability_get_data(void **data, size_t index, int is_replicated, int is_constant) {
malleability_data_t *data_struct;
if(is_constant) {
if(is_replicated) {
data_struct = rep_s_data;
} else {
data_struct = dist_s_data;
}
} else {
if(is_replicated) {
data_struct = rep_a_data;
} else {
data_struct = dist_a_data;
}
}
*data = data_struct->arrays[index];
}
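/*
 * Example (illustrative): a child process recovering every distributed,
 * constant entry in the same order the parents added them.
 *
 *   size_t entries, i;
 *   void *entry;
 *   malleability_get_entries(&entries, 0, 1);
 *   for(i = 0; i < entries; i++) {
 *     malleability_get_data(&entry, i, 0, 1);
 *     // The user is responsible for knowing the type of each entry
 *   }
 */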
//======================================================||
//================PRIVATE FUNCTIONS=====================||
//================DATA COMMUNICATION====================||
//======================================================||
//======================================================||
/*
 * Generalized function to send data to the children.
 * The asynchrony refers to whether the parent and child processes
 * perform the exchange in a blocking way or not. The parent may
 * use several threads.
 */
void send_data(int numP_children, malleability_data_t *data_struct, int is_asynchronous) {
size_t i;
char *aux_send, *aux_recv;
if(is_asynchronous) {
for(i=0; i < data_struct->entries; i++) {
aux_send = (char *) data_struct->arrays[i]; //TODO Check that this really is a char
aux_recv = NULL;
async_communication_start(aux_send, &aux_recv, data_struct->qty[i], mall->myId, mall->numP, numP_children, MALLEABILITY_NOT_CHILDREN, mall_conf->red_method, mall_conf->red_strategies,
mall->intercomm, &(data_struct->requests[i]), &(data_struct->request_qty[i]), &(data_struct->windows[i]));
if(aux_recv != NULL) data_struct->arrays[i] = (void *) aux_recv;
}
} else {
for(i=0; i < data_struct->entries; i++) {
aux_send = (char *) data_struct->arrays[i]; //TODO Check that this really is a char
aux_recv = NULL;
sync_communication(aux_send, &aux_recv, data_struct->qty[i], mall->myId, mall->numP, numP_children, MALLEABILITY_NOT_CHILDREN, mall_conf->red_method, mall->intercomm);
if(aux_recv != NULL) data_struct->arrays[i] = (void *) aux_recv;
}
}
}
/*
 * Generalized function to receive data from the parents.
 * The asynchrony refers to whether the parent and child processes
 * perform the exchange in a blocking way or not. The parent may
 * use several threads.
 */
void recv_data(int numP_parents, malleability_data_t *data_struct, int is_asynchronous) {
size_t i;
char *aux, aux_s;
if(is_asynchronous) {
for(i=0; i < data_struct->entries; i++) {
aux = (char *) data_struct->arrays[i]; //TODO Check that this really is a char
async_communication_start(&aux_s, &aux, data_struct->qty[i], mall->myId, mall->numP, numP_parents, MALLEABILITY_CHILDREN, mall_conf->red_method, mall_conf->red_strategies,
mall->intercomm, &(data_struct->requests[i]), &(data_struct->request_qty[i]), &(data_struct->windows[i]));
data_struct->arrays[i] = (void *) aux;
}
} else {
for(i=0; i < data_struct->entries; i++) {
aux = (char *) data_struct->arrays[i]; //TODO Check that this really is a char
sync_communication(&aux_s, &aux, data_struct->qty[i], mall->myId, mall->numP, numP_parents, MALLEABILITY_CHILDREN, mall_conf->red_method, mall->intercomm);
data_struct->arrays[i] = (void *) aux;
}
}
}
//======================================================||
//================PRIVATE FUNCTIONS=====================||
//=====================CHILDREN=========================||
//======================================================||
//======================================================||
/*
 * Initialization of the children's data.
 * Here data is received from the parents: the configuration
 * of the execution to perform, and the data to receive from
 * the parents, whether synchronously, asynchronously or both.
 */
void Children_init() {
size_t i;
int numP_parents, root_parents;
int is_intercomm;
malleability_connect_children(mall->myId, mall->numP, mall->root, mall->comm, &numP_parents, &root_parents, &(mall->intercomm));
MPI_Comm_test_inter(mall->intercomm, &is_intercomm);
if(!is_intercomm) { // For intracommunicators, these processes will be added
MPI_Comm_rank(mall->intercomm, &(mall->myId));
MPI_Comm_size(mall->intercomm, &(mall->numP));
}
recv_config_file(mall->root, mall->intercomm, &(mall_conf->config_file));
comm_node_data(root_parents, MALLEABILITY_CHILDREN);
MPI_Bcast(&(mall_conf->red_method), 1, MPI_INT, root_parents, mall->intercomm);
MPI_Bcast(&(mall_conf->red_strategies), 1, MPI_INT, root_parents, mall->intercomm);
mall_conf->results = (results_data *) malloc(sizeof(results_data));
init_results_data(mall_conf->results, mall_conf->config_file->n_resizes, mall_conf->config_file->n_stages, RESULTS_INIT_DATA_QTY);
comm_data_info(rep_a_data, dist_a_data, MALLEABILITY_CHILDREN, mall->myId, root_parents, mall->intercomm);
if(dist_a_data->entries || rep_a_data->entries) { // Receive asynchronous data
if(malleability_red_contains_strat(mall_conf->red_strategies, MALL_RED_THREAD, NULL)) {
recv_data(numP_parents, dist_a_data, MALLEABILITY_USE_SYNCHRONOUS);
} else {
recv_data(numP_parents, dist_a_data, MALLEABILITY_USE_ASYNCHRONOUS);
for(i=0; i<dist_a_data->entries; i++) {
async_communication_wait(mall_conf->red_strategies, mall->intercomm, dist_a_data->requests[i], dist_a_data->request_qty[i]);
}
for(i=0; i<dist_a_data->entries; i++) {
async_communication_end(mall_conf->red_method, mall_conf->red_strategies, dist_a_data->requests[i], dist_a_data->request_qty[i], &(dist_a_data->windows[i]));
}
}
mall_conf->results->async_end = MPI_Wtime(); // Timestamp for when the asynchronous communication ends
}
comm_data_info(rep_s_data, dist_s_data, MALLEABILITY_CHILDREN, mall->myId, root_parents, mall->intercomm);
if(dist_s_data->entries || rep_s_data->entries) { // Receive synchronous data
recv_data(numP_parents, dist_s_data, MALLEABILITY_USE_SYNCHRONOUS);
mall_conf->results->sync_end = MPI_Wtime(); // Timestamp for when the synchronous communication ends
// TODO Create a specific function and add it for the asynchronous case
// TODO Take the type and qty into account
for(i=0; i<rep_s_data->entries; i++) {
MPI_Datatype datatype;
if(rep_s_data->types[i] == MAL_INT) {
datatype = MPI_INT;
} else {
datatype = MPI_CHAR;
}
MPI_Bcast(rep_s_data->arrays[i], rep_s_data->qty[i], datatype, root_parents, mall->intercomm);
}
}
mall_conf->results->malleability_end = MPI_Wtime(); // Timestamp for when malleability ends
// Store the results of this transmission
comm_results(mall_conf->results, mall->root, mall_conf->config_file->n_resizes, mall->intercomm);
if(!is_intercomm) {
malleability_comms_update(mall->intercomm);
}
MPI_Comm_disconnect(&(mall->intercomm)); //FIXME Error in OpenMPI + Merge
}
//======================================================||
//================PRIVATE FUNCTIONS=====================||
//=====================PARENTS==========================||
//======================================================||
//======================================================||
/*
 * Performs the creation of the child processes.
 * If requested in the background, it returns the current state.
 */
int spawn_step(){
mall_conf->results->spawn_start = MPI_Wtime();
state = init_spawn(mall->name_exec, mall->num_cpus, mall->num_nodes, mall->nodelist, mall->myId, mall->numP, mall->numC, mall->root, mall_conf->spawn_dist, mall_conf->spawn_method, mall_conf->spawn_strategies, mall->thread_comm, &(mall->intercomm));
if(!malleability_spawn_contains_strat(mall_conf->spawn_strategies, MALL_SPAWN_PTHREAD, NULL)) {
mall_conf->results->spawn_time[mall_conf->grp] = MPI_Wtime() - mall_conf->results->spawn_start;
}
return state;
}
/*
 * Starts the redistribution of the data with the new group of processes.
 *
 * First the configuration to use is sent to the new group of processes,
 * and afterwards the asynchronous and/or synchronous sends, if any, are
 * performed.
 *
 * If there is asynchronous communication, it is started and the function
 * returns indicating that an asynchronous send has begun.
 *
 * If there is no asynchronous communication, the synchronous one is
 * carried out, if there is any.
 *
 * Finally, data about the results is sent to the children and both
 * groups of processes disconnect.
 */
int start_redistribution() {
int rootBcast, is_intercomm;
is_intercomm = 0;
if(mall->intercomm != MPI_COMM_NULL) {
MPI_Comm_test_inter(mall->intercomm, &is_intercomm);
} else {
// If no communicator has been created, it is because the Spawn was
// postponed and this is the Merge Shrink spawn
MPI_Comm_dup(mall->comm, &(mall->intercomm));
}
if(is_intercomm) {
rootBcast = mall->myId == mall->root ? MPI_ROOT : MPI_PROC_NULL;
} else {
rootBcast = mall->root;
}
send_config_file(mall_conf->config_file, rootBcast, mall->intercomm);
comm_node_data(rootBcast, MALLEABILITY_NOT_CHILDREN);
MPI_Bcast(&(mall_conf->red_method), 1, MPI_INT, rootBcast, mall->intercomm);
MPI_Bcast(&(mall_conf->red_strategies), 1, MPI_INT, rootBcast, mall->intercomm);
comm_data_info(rep_a_data, dist_a_data, MALLEABILITY_NOT_CHILDREN, mall->myId, mall->root, mall->intercomm);
if(dist_a_data->entries || rep_a_data->entries) { // Send asynchronous data
//FIXME Replicated data (rep_a_data) is not sent
mall_conf->results->async_time[mall_conf->grp] = MPI_Wtime();
if(malleability_red_contains_strat(mall_conf->red_strategies, MALL_RED_THREAD, NULL)) {
return thread_creation();
} else {
send_data(mall->numC, dist_a_data, MALLEABILITY_USE_ASYNCHRONOUS);
return MALL_DIST_PENDING;
}
}
return end_redistribution();
}
/*
 * Checks whether the asynchronous redistribution has finished.
 * If it has not finished, the function returns indicating so; otherwise
 * the synchronous communication is carried out, the results are sent
 * and the process groups disconnect.
 *
 * This function supports two modes of operation when checking whether
 * the asynchronous communication has finished.
 * With the "MALL_RED_BASELINE" or "MALL_RED_POINT" methods, it is
 * considered finished when the parents finish sending.
 * With the "MALL_RED_IBARRIER" strategy, it is considered finished when
 * the children have finished receiving.
 * //FIXME Modify so that rep_a_data is taken into account
 */
int check_redistribution() {
int is_intercomm, completed, local_completed, all_completed;
size_t i, req_qty;
MPI_Request *req_completed;
MPI_Win window;
local_completed = 1;
for(i=0; i<dist_a_data->entries; i++) {
req_completed = dist_a_data->requests[i];
req_qty = dist_a_data->request_qty[i];
completed = async_communication_check(mall->myId, MALLEABILITY_NOT_CHILDREN, mall_conf->red_strategies, mall->intercomm, req_completed, req_qty);
local_completed = local_completed && completed;
}
MPI_Allreduce(&local_completed, &all_completed, 1, MPI_INT, MPI_MIN, mall->comm);
if(!all_completed) return MALL_DIST_PENDING; // Continue only if asynchronous send has ended
for(i=0; i<dist_a_data->entries; i++) {
req_completed = dist_a_data->requests[i];
req_qty = dist_a_data->request_qty[i];
window = dist_a_data->windows[i];
async_communication_end(mall_conf->red_method, mall_conf->red_strategies, req_completed, req_qty, &window);
}
MPI_Comm_test_inter(mall->intercomm, &is_intercomm);
if(!is_intercomm) mall_conf->results->async_end = MPI_Wtime(); // Merge method only
return end_redistribution();
}
/*
 * Finishes the redistribution of the data with the children, checking
 * whether iterations with background communications have been performed
 * and sending the children how many iterations were carried out.
 *
 * In addition, the synchronous communications are performed, if there are any.
 * Finally it ends by sending the temporary data to the children.
 */
int end_redistribution() {
size_t i;
int is_intercomm, rootBcast, local_state;
MPI_Comm_test_inter(mall->intercomm, &is_intercomm);
if(is_intercomm) {
rootBcast = mall->myId == mall->root ? MPI_ROOT : MPI_PROC_NULL;
} else {
rootBcast = mall->root;
}
comm_data_info(rep_s_data, dist_s_data, MALLEABILITY_NOT_CHILDREN, mall->myId, mall->root, mall->intercomm);
if(dist_s_data->entries || rep_s_data->entries) { // Send synchronous data
mall_conf->results->sync_time[mall_conf->grp] = MPI_Wtime();
send_data(mall->numC, dist_s_data, MALLEABILITY_USE_SYNCHRONOUS);
if(!is_intercomm) mall_conf->results->sync_end = MPI_Wtime(); // Merge method only
// TODO Create a specific function and add it for the asynchronous case
// TODO Take the type into account
for(i=0; i<rep_s_data->entries; i++) {
MPI_Datatype datatype;
if(rep_s_data->types[i] == MAL_INT) {
datatype = MPI_INT;
} else {
datatype = MPI_CHAR;
}
MPI_Bcast(rep_s_data->arrays[i], rep_s_data->qty[i], datatype, rootBcast, mall->intercomm);
}
}
comm_results(mall_conf->results, rootBcast, mall_conf->config_file->n_resizes, mall->intercomm);
local_state = MALL_DIST_COMPLETED;
if(!is_intercomm) { // Merge Spawn
if(mall->numP < mall->numC) { // Expand
malleability_comms_update(mall->intercomm);
} else { // Shrink || Merge Shrink requires more tasks
local_state = MALL_SPAWN_ADAPT_PENDING;
}
}
if(mall->intercomm != MPI_COMM_NULL && mall->intercomm != MPI_COMM_WORLD) {
MPI_Comm_disconnect(&(mall->intercomm)); //FIXME Error in OpenMPI + Merge
}
return local_state;
}
///=============================================
///=============================================
///=============================================
/*
 * Completes a Merge Shrink: suspended zombie processes are collected and
 * the surviving processes rebuild their main and thread communicators
 * from the shrunk group, accounting the extra time as spawn time.
 * Survivors return MALL_DIST_COMPLETED; the remaining processes return
 * MALL_ZOMBIE.
 */
int shrink_redistribution() {
double time_extra = MPI_Wtime();
//TODO Create new state before collecting zombies. Processes can perform tasks before that. Then call again Malleability to commit the change
zombies_collect_suspended(mall->user_comm, mall->myId, mall->numP, mall->numC, mall->root, (void *) mall_conf->results, mall_conf->config_file->n_stages);
if(mall->myId < mall->numC) {
if(mall->thread_comm != MPI_COMM_WORLD) MPI_Comm_free(&(mall->thread_comm)); //FIXME Change this so the user requests the change and comms_update is called
if(mall->comm != MPI_COMM_WORLD) MPI_Comm_free(&(mall->comm));
mall->dup_user_comm = 1;
MPI_Comm_dup(mall->intercomm, &(mall->thread_comm));
MPI_Comm_dup(mall->intercomm, &(mall->comm));
MPI_Comm_set_name(mall->thread_comm, "MPI_COMM_MALL_THREAD");
MPI_Comm_set_name(mall->comm, "MPI_COMM_MALL");
MPI_Comm_free(&(mall->intercomm));
mall_conf->results->spawn_time[mall_conf->grp] += MPI_Wtime() - time_extra;
if(malleability_spawn_contains_strat(mall_conf->spawn_strategies,MALL_SPAWN_PTHREAD, NULL)) {
mall_conf->results->spawn_real_time[mall_conf->grp] += MPI_Wtime() - time_extra;
}
return MALL_DIST_COMPLETED;
} else {
return MALL_ZOMBIE;
}
}
//======================================================||
//================PRIVATE FUNCTIONS=====================||
//=================COMM NODE INFO ======================||
//======================================================||
//======================================================||
/*
 * Broadcasts the node information (num_cpus, num_nodes and nodelist_len,
 * followed by the nodelist string itself) from the parents to every
 * process in the intercommunicator. Child processes first allocate the
 * buffer that holds the nodelist.
 */
void comm_node_data(int rootBcast, int is_child_group) {
MPI_Datatype node_type;
def_nodeinfo_type(&node_type);
MPI_Bcast(mall, 1, node_type, rootBcast, mall->intercomm);
if(is_child_group) {
mall->nodelist = malloc((mall->nodelist_len+1) * sizeof(char));
mall->nodelist[mall->nodelist_len] = '\0';
}
MPI_Bcast(mall->nodelist, mall->nodelist_len, MPI_CHAR, rootBcast, mall->intercomm);
MPI_Type_free(&node_type);
}
/*
 * Defines an MPI derived datatype that describes the node information
 * fields of the malleability structure (num_cpus, num_nodes and
 * nodelist_len), so they can be broadcast in a single message.
 */
void def_nodeinfo_type(MPI_Datatype *node_type) {
int i, counts = 3;
int blocklengths[3] = {1, 1, 1};
MPI_Aint displs[counts], dir;
MPI_Datatype types[counts];
// Fill the types vector
types[0] = types[1] = types[2] = MPI_INT;
// Fill the displacements vector
MPI_Get_address(mall, &dir);
MPI_Get_address(&(mall->num_cpus), &displs[0]);
MPI_Get_address(&(mall->num_nodes), &displs[1]);
MPI_Get_address(&(mall->nodelist_len), &displs[2]);
for(i=0;i<counts;i++) displs[i] -= dir;
MPI_Type_create_struct(counts, blocklengths, displs, types, node_type);
MPI_Type_commit(node_type);
}
// TODO MOVE THIS SOMEWHERE ELSE??
//======================================================||
//================PRIVATE FUNCTIONS=====================||
//===============COMM PARENTS THREADS===================||
//======================================================||
//======================================================||
int comm_state; //FIXME Use a handler
/*
 * Creates a thread to run a communication in the background.
 */
int thread_creation() {
comm_state = MALL_DIST_PENDING;
if(pthread_create(&(mall->async_thread), NULL, thread_async_work, NULL)) {
printf("Error al crear el hilo\n");
MPI_Abort(MPI_COMM_WORLD, -1);
return -1;
}
return comm_state;
}
/*
 * Check performed by a master thread to determine whether a worker
 * thread has finished its background communication.
 *
 * The state of the communication is returned when the function finishes.
 */
int thread_check() {
int all_completed = 0, is_intercomm;
// Check that every thread has finished its distribution (same value in comm_state)
MPI_Allreduce(&comm_state, &all_completed, 1, MPI_INT, MPI_MAX, mall->comm);
if(all_completed != MALL_DIST_COMPLETED) return MALL_DIST_PENDING; // Continue only if asynchronous send has ended
//FIXME The MALL_APP_ENDED state is not taken into account
if(pthread_join(mall->async_thread, NULL)) {
printf("Error al esperar al hilo\n");
MPI_Abort(MPI_COMM_WORLD, -1);
return -2;
}
MPI_Comm_test_inter(mall->intercomm, &is_intercomm);
if(!is_intercomm) mall_conf->results->async_end = MPI_Wtime(); // Merge method only
return end_redistribution();
}
/*
 * Function executed by a thread.
 * It performs a synchronous communication with the children which,
 * from the user's point of view, can be considered as taking place
 * in the background.
 *
 * When the communication finishes, the master thread can check it
 * through the value of "comm_state".
 */
void* thread_async_work() {
send_data(mall->numC, dist_a_data, MALLEABILITY_USE_SYNCHRONOUS);
comm_state = MALL_DIST_COMPLETED;
pthread_exit(NULL);
}
//==============================================================================
/*
 * Prints the current state of all communicators.
 */
void print_comms_state() {
int tester;
char *test = malloc(MPI_MAX_OBJECT_NAME * sizeof(char));
MPI_Comm_get_name(mall->comm, test, &tester);
printf("P%d Comm=%d Name=%s\n", mall->myId, mall->comm, test);
MPI_Comm_get_name(mall->user_comm, test, &tester);
printf("P%d Comm=%d Name=%s\n", mall->myId, mall->user_comm, test);
if(mall->intercomm != MPI_COMM_NULL) {
MPI_Comm_get_name(mall->intercomm, test, &tester);
printf("P%d Comm=%d Name=%s\n", mall->myId, mall->intercomm, test);
}
free(test);
}
void malleability_comms_update(MPI_Comm comm) {
if(mall->thread_comm != MPI_COMM_WORLD) MPI_Comm_free(&(mall->thread_comm));
if(mall->comm != MPI_COMM_WORLD) MPI_Comm_free(&(mall->comm));
if(mall->user_comm != MPI_COMM_WORLD) MPI_Comm_free(&(mall->user_comm)); //TODO Isn't this dangerous?
MPI_Comm_dup(comm, &(mall->thread_comm));
MPI_Comm_dup(comm, &(mall->comm));
MPI_Comm_dup(comm, &(mall->user_comm));
MPI_Comm_set_name(mall->thread_comm, "MPI_COMM_MALL_THREAD");
MPI_Comm_set_name(mall->comm, "MPI_COMM_MALL");
MPI_Comm_set_name(mall->user_comm, "MPI_COMM_MALL_USER");
}
#ifndef MALLEABILITY_MANAGER_H
#define MALLEABILITY_MANAGER_H
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <mpi.h>
#include "../IOcodes/results.h"
#include "../Main/configuration.h"
#include "../Main/Main_datatypes.h"
#include "malleabilityStates.h"
int init_malleability(int myId, int numP, int root, MPI_Comm comm, char *name_exec, char *nodelist, int num_cpus, int num_nodes);
void free_malleability();
void indicate_ending_malleability(int new_outside_state);
int malleability_checkpoint();
void set_benchmark_grp(int grp);
void set_malleability_configuration(int spawn_method, int spawn_strategies, int spawn_dist, int red_method, int red_strategies);
void set_children_number(int numC); // TODO TO BE DEPRECATED
void get_malleability_user_comm(MPI_Comm *comm);
void malleability_add_data(void *data, size_t total_qty, int type, int is_replicated, int is_constant);
void malleability_modify_data(void *data, size_t index, size_t total_qty, int type, int is_replicated, int is_constant);
void malleability_get_entries(size_t *entries, int is_replicated, int is_constant);
void malleability_get_data(void **data, size_t index, int is_replicated, int is_constant);
void set_benchmark_configuration(configuration *config_file);
void get_benchmark_configuration(configuration **config_file);
void set_benchmark_results(results_data *results);
void get_benchmark_results(results_data **results);
#endif
#ifndef MALLEABILITY_STATES_H
#define MALLEABILITY_STATES_H
#include <stdio.h>
#include <stdlib.h>
//States
#define MALL_DENIED -1
enum mall_states{MALL_UNRESERVED, MALL_NOT_STARTED, MALL_ZOMBIE, MALL_SPAWN_PENDING, MALL_SPAWN_SINGLE_PENDING,
MALL_SPAWN_SINGLE_COMPLETED, MALL_SPAWN_ADAPT_POSTPONE, MALL_SPAWN_COMPLETED, MALL_DIST_PENDING, MALL_DIST_COMPLETED,
MALL_SPAWN_ADAPT_PENDING, MALL_SPAWN_ADAPTED, MALL_COMPLETED};
enum mall_spawn_methods{MALL_SPAWN_BASELINE, MALL_SPAWN_MERGE};
#define MALL_SPAWN_PTHREAD 2
#define MALL_SPAWN_SINGLE 3
enum mall_redistribution_methods{MALL_RED_BASELINE, MALL_RED_POINT, MALL_RED_RMA_LOCK, MALL_RED_RMA_LOCKALL};
#define MALL_RED_THREAD 2
#define MALL_RED_IBARRIER 3
#define MALLEABILITY_ROOT 0
#define MAL_APP_EXECUTING 0
#define MAL_APP_ENDED 1
//TODO DEPRECATE
#define MAL_INT 0
#define MAL_CHAR 1
////////////////
#define MALLEABILITY_CHILDREN 1
#define MALLEABILITY_NOT_CHILDREN 0
#endif