Commit e83b5922 authored by Iker Martín Álvarez
Browse files

New version of Proteo

parent 26305fac
#include <pthread.h>
#include "malleabilityManager.h"
#include "malleabilityStates.h"
#include "malleabilityTypes.h"
#include "malleabilityZombies.h"
#include "ProcessDist.h"
#include "CommDist.h"
/* Rank used as broadcast root by Children_init() on the parent intercommunicator. */
#define MALLEABILITY_ROOT 0
/* Values for the "is_asynchronous" flag of send_data() / recv_data(). */
#define MALLEABILITY_USE_SYNCHRONOUS 0
#define MALLEABILITY_USE_ASYNCHRONOUS 1
/* Forward declarations of the helpers defined further down in this file. */
void send_data(int numP_children, malleability_data_t *data_struct, int is_asynchronous);
void recv_data(int numP_parents, malleability_data_t *data_struct, int is_asynchronous);
void Children_init();
int spawn_step();
int start_redistribution();
int check_redistribution();
int end_redistribution();
int shrink_redistribution();
int thread_creation();
int thread_check();
void* thread_async_work(void* void_arg);
/* User-selected configuration of the malleability module plus benchmark bookkeeping. */
typedef struct {
int spawn_type; // Compared against the COMM_SPAWN_SERIAL/PTHREAD/MERGE/MERGE_PTHREAD constants
int spawn_dist; // Forwarded to init_slurm_comm() by spawn_step()
int spawn_is_single; // Forwarded to init_slurm_comm() by spawn_step()
int spawn_threaded; // Stored by set_malleability_configuration(); not read in the visible code
int comm_type; // Compared against the MAL_USE_* constants
int comm_threaded; // Stored by set_malleability_configuration(); not read in the visible code
int grp; // Index used into results->spawn_time[] and results->spawn_real_time[]
configuration *config_file; // Benchmark configuration exchanged with the children
results_data *results; // Benchmark timing results
} malleability_config_t;
/* Runtime description of the current process group and its communicators. */
typedef struct { //FIXME numC_spawned is not being used
// myId/numP/root: values given to init_malleability(); numC/numC_spawned: set by set_children_number();
// root_parents is not referenced in the visible code.
int myId, numP, numC, numC_spawned, root, root_parents;
pthread_t async_thread; // Thread started by thread_creation() and joined by thread_check()
MPI_Comm comm, thread_comm; // Private duplicates of the user communicator
MPI_Comm intercomm; // Intercommunicator between parents and children
MPI_Comm user_comm; // Communicator handed back by get_malleability_user_comm()
char *name_exec, *nodelist; // Forwarded to init_slurm_comm(); pointers are stored, not copied
int num_cpus, num_nodes; // Forwarded to init_slurm_comm()
} malleability_t;
/* Current state of the reconfiguration state machine (one of the MAL_* state values). */
int state = MAL_UNRESERVED; //FIXME Move somewhere else
/* Module-wide structures allocated by init_malleability() and released by free_malleability(). */
malleability_config_t *mall_conf;
malleability_t *mall;
/*
 * Data descriptors selected by the (is_replicated, is_constant) flags of
 * malleability_add_data(): rep/dist = replicated/distributed,
 * s = is_constant != 0 (sent synchronously), a = is_constant == 0 (sent asynchronously).
 */
malleability_data_t *rep_s_data;
malleability_data_t *dist_s_data;
malleability_data_t *rep_a_data;
malleability_data_t *dist_a_data;
/*
 * Allocates the structures of the malleability module and duplicates the
 * user communicator twice so the module does not interfere with the
 * application's own communication.
 *
 * When the calling group was spawned dynamically (MPI_Comm_get_parent()
 * returns a valid intercommunicator) the connection with the parents is
 * initialised through Children_init() before returning.
 *
 * The name_exec and nodelist pointers are stored as given, not copied.
 *
 * Returns MALLEABILITY_CHILDREN for a spawned group and
 * MALLEABILITY_NOT_CHILDREN otherwise.
 */
int init_malleability(int myId, int numP, int root, MPI_Comm comm, char *name_exec, char *nodelist, int num_cpus, int num_nodes) {
  MPI_Comm private_comm, private_thread_comm;

  mall_conf = malloc(sizeof(*mall_conf));
  mall = malloc(sizeof(*mall));
  rep_s_data = malloc(sizeof(*rep_s_data));
  dist_s_data = malloc(sizeof(*dist_s_data));
  rep_a_data = malloc(sizeof(*rep_a_data));
  dist_a_data = malloc(sizeof(*dist_a_data));

  // Both duplications are collective; keep them in this order on every rank
  MPI_Comm_dup(comm, &private_comm);
  MPI_Comm_dup(comm, &private_thread_comm);

  // Communicators
  mall->comm = private_comm;
  mall->thread_comm = private_thread_comm; // TODO Refactor -- create only when needed?
  mall->user_comm = comm;

  // Description of the group and of the executable to spawn
  mall->myId = myId;
  mall->numP = numP;
  mall->root = root;
  mall->name_exec = name_exec;
  mall->nodelist = nodelist;
  mall->num_cpus = num_cpus;
  mall->num_nodes = num_nodes;

  // No data registered yet
  rep_s_data->entries = 0;
  dist_s_data->entries = 0;
  rep_a_data->entries = 0;
  dist_a_data->entries = 0;

  state = MAL_NOT_STARTED;

  // A valid parent intercommunicator means this group was spawned by another one
  MPI_Comm_get_parent(&(mall->intercomm));
  if (mall->intercomm == MPI_COMM_NULL) {
    zombies_service_init();
    return MALLEABILITY_NOT_CHILDREN;
  }

  Children_init();
  return MALLEABILITY_CHILDREN;
}
/*
* Elimina toda la memoria reservado por el modulo
* de maleabilidad y asegura que los zombies
* despierten si los hubiese.
*/
void free_malleability() {
free_malleability_data_struct(rep_s_data);
free_malleability_data_struct(rep_a_data);
free_malleability_data_struct(dist_s_data);
free_malleability_data_struct(dist_a_data);
free(rep_s_data);
free(rep_a_data);
free(dist_s_data);
free(dist_a_data);
//MPI_Comm_free(&(mall->comm)); // TODO Revisar si hace falta?
//MPI_Comm_free(&(mall->thread_comm));
free(mall);
free(mall_conf);
zombies_awake();
zombies_service_free();
state = MAL_UNRESERVED;
}
/*
 * Advances the reconfiguration of the process group from the parents' side.
 * It is meant to be called repeatedly; every call inspects the global
 * "state" and performs the next step:
 *
 * - MAL_UNRESERVED: nothing is done.
 * - MAL_NOT_STARTED: the new processes are created through spawn_step() and,
 *   if the spawn is already complete, the redistribution is started.
 * - MAL_SPAWN_PENDING / MAL_SPAWN_SINGLE_PENDING: checks whether the
 *   background spawn (or the threaded shrink) has finished and, if so,
 *   records the spawn times and starts the redistribution.
 * - MAL_DIST_PENDING: checks whether the asynchronous redistribution has
 *   finished, either through thread_check() or check_redistribution().
 *
 * If there is asynchronous data it is sent first and the function returns;
 * synchronous data is only sent once the asynchronous part has completed
 * (see end_redistribution()).
 *
 * Returns the resulting state.
 */
int malleability_checkpoint() {
if(state == MAL_UNRESERVED) return MAL_UNRESERVED;
if(state == MAL_NOT_STARTED) {
// Check whether a reconfiguration has to be performed
//if(CHECK_RMS()) {return MAL_DENIED;}
state = spawn_step();
if (state == MAL_SPAWN_COMPLETED){
state = start_redistribution();
}
} else if(state == MAL_SPAWN_PENDING || state == MAL_SPAWN_SINGLE_PENDING) { // Checks whether the spawn has finished and starts the redistribution
// NOTE(review): only assigned through check_slurm_comm(); presumably it is set whenever MAL_SPAWN_COMPLETED is returned -- confirm
double end_real_time;
if(mall_conf->spawn_type == COMM_SPAWN_MERGE_PTHREAD && mall->numP > mall->numC) {
state = shrink_redistribution(); //TODO REFACTOR
} else {
state = check_slurm_comm(mall->myId, mall->root, mall->numP, &(mall->intercomm), mall->comm, mall->thread_comm, &end_real_time);
if (state == MAL_SPAWN_COMPLETED) {
mall_conf->results->spawn_time[mall_conf->grp] = MPI_Wtime() - mall_conf->results->spawn_start;
if(mall_conf->spawn_type == COMM_SPAWN_PTHREAD || mall_conf->spawn_type == COMM_SPAWN_MERGE_PTHREAD) {
mall_conf->results->spawn_real_time[mall_conf->grp] = end_real_time - mall_conf->results->spawn_start;
}
//TODO If it is MERGE SHRINK, use a different data redistribution method
// The value returned here overwrites the MAL_NOT_STARTED that end_redistribution() stores in "state"
state = start_redistribution();
}
}
} else if(state == MAL_DIST_PENDING) {
if(mall_conf->comm_type == MAL_USE_THREAD) {
state = thread_check();
} else {
state = check_redistribution();
}
}
return state;
}
// Functions only needed by the benchmark
//-------------------------------------------------------------------------------------------------------------
/* Stores the group index used to address the per-group arrays of the results. */
void set_benchmark_grp(int grp) {
mall_conf->grp = grp;
}
/* Stores the pointer to the benchmark configuration (the structure is not copied). */
void set_benchmark_configuration(configuration *config_file) {
mall_conf->config_file = config_file;
}
/* Returns through "config_file" the configuration pointer currently stored. */
void get_benchmark_configuration(configuration **config_file) {
*config_file = mall_conf->config_file;
}
/* Stores the pointer to the benchmark results (the structure is not copied). */
void set_benchmark_results(results_data *results) {
mall_conf->results = results;
}
/* Returns through "results" the results pointer currently stored. */
void get_benchmark_results(results_data **results) {
*results = mall_conf->results;
}
//-------------------------------------------------------------------------------------------------------------
void set_malleability_configuration(int spawn_type, int spawn_is_single, int spawn_dist, int spawn_threaded, int comm_type, int comm_threaded) {
mall_conf->spawn_type = spawn_type;
mall_conf->spawn_is_single = spawn_is_single;
mall_conf->spawn_dist = spawn_dist;
mall_conf->spawn_threaded = spawn_threaded;
mall_conf->comm_type = comm_type;
mall_conf->comm_threaded = comm_threaded;
}
/*
 * To be deprecated.
 * Sets the number of processes of the next group. It has to be called
 * after the configuration has been set, because the result depends on
 * the spawn method and the method itself may be replaced:
 *
 * - Merge methods growing the group: only the difference is spawned.
 * - Merge methods with the same number of processes (migration): all the
 *   processes are spawned and the method falls back to its non-merge
 *   counterpart (MERGE -> SERIAL, MERGE_PTHREAD -> PTHREAD).
 * - Any other case: all "numC" processes are counted as spawned.
 */
void set_children_number(int numC){
  int merge_method = mall_conf->spawn_type == COMM_SPAWN_MERGE
                  || mall_conf->spawn_type == COMM_SPAWN_MERGE_PTHREAD;

  mall->numC = numC;

  if (!merge_method || numC - mall->numP < 0) {
    mall->numC_spawned = numC;
    return;
  }

  if (numC != mall->numP) { // Expansion
    mall->numC_spawned = numC - mall->numP;
    return;
  }

  // Migration
  mall->numC_spawned = numC;
  mall_conf->spawn_type = (mall_conf->spawn_type == COMM_SPAWN_MERGE)
                        ? COMM_SPAWN_SERIAL
                        : COMM_SPAWN_PTHREAD;
}
/*
 * Returns through "comm" the communicator the application should use.
 * It is the communicator given to init_malleability() until a merge or
 * shrink reconfiguration replaces it with a duplicate of the new
 * internal communicator.
 * TODO Complete this documentation
 */
void get_malleability_user_comm(MPI_Comm *comm) {
*comm = mall->user_comm;
}
/*
* Anyade a la estructura concreta de datos elegida
* el nuevo set de datos "data" de un total de "total_qty" elementos.
*
* Los datos variables se tienen que anyadir cuando quieran ser mandados, no antes
*
* Mas informacion en la funcion "add_data".
*/
void malleability_add_data(void *data, int total_qty, int type, int is_replicated, int is_constant) {
if(is_constant) {
if(is_replicated) {
add_data(data, total_qty, type, 0, rep_s_data); //FIXME Numero magico
} else {
add_data(data, total_qty, type, 0, dist_s_data); //FIXME Numero magico
}
} else {
if(is_replicated) {
add_data(data, total_qty, type, 0, rep_a_data); //FIXME Numero magico || Un request?
} else {
int total_reqs = 0;
if(mall_conf->comm_type == MAL_USE_NORMAL) {
total_reqs = 1;
} else if(mall_conf->comm_type == MAL_USE_IBARRIER) {
total_reqs = 2;
} else if(mall_conf->comm_type == MAL_USE_POINT) {
total_reqs = mall->numC;
}
add_data(data, total_qty, type, total_reqs, dist_a_data);
}
}
}
/*
* Devuelve el numero de entradas para la estructura de descripcion de
* datos elegida.
*/
void malleability_get_entries(int *entries, int is_replicated, int is_constant){
if(is_constant) {
if(is_replicated) {
*entries = rep_s_data->entries;
} else {
*entries = dist_s_data->entries;
}
} else {
if(is_replicated) {
*entries = rep_a_data->entries;
} else {
*entries = dist_a_data->entries;
}
}
}
/*
* Devuelve el elemento de la lista "index" al usuario.
* La devolución es en el mismo orden que lo han metido los padres
* con la funcion "malleability_add_data()".
* Es tarea del usuario saber el tipo de esos datos.
* TODO Refactor a que sea automatico
*/
void malleability_get_data(void **data, int index, int is_replicated, int is_constant) {
malleability_data_t *data_struct;
if(is_constant) {
if(is_replicated) {
data_struct = rep_s_data;
} else {
data_struct = dist_s_data;
}
} else {
if(is_replicated) {
data_struct = rep_a_data;
} else {
data_struct = dist_a_data;
}
}
*data = data_struct->arrays[index];
}
//======================================================||
//================PRIVATE FUNCTIONS=====================||
//================DATA COMMUNICATION====================||
//======================================================||
//======================================================||
/*
 * Generic function used by the parents to send every entry of
 * "data_struct" to the "numP_children" processes of the other group.
 * "is_asynchronous" selects between send_async() (non-zero) and
 * send_sync() (zero). Every entry is sent as an array of char.
 * NOTE(review): all asynchronous entries are given the same
 * "data_struct->requests" argument -- confirm this is intended.
 */
void send_data(int numP_children, malleability_data_t *data_struct, int is_asynchronous) {
  int entry;

  for (entry = 0; entry < data_struct->entries; entry++) {
    char *buffer = (char *) data_struct->arrays[entry]; //TODO Check that it really is a char

    if (is_asynchronous) {
      send_async(buffer, data_struct->qty[entry], mall->myId, mall->numP, mall->root, mall->intercomm, numP_children, data_struct->requests, mall_conf->comm_type);
    } else {
      send_sync(buffer, data_struct->qty[entry], mall->myId, mall->numP, mall->root, mall->intercomm, numP_children);
    }
  }
}
/*
 * Generic function used by the children to receive every entry of
 * "data_struct" from the "numP_parents" processes of the other group.
 * "is_asynchronous" selects between recv_async() (non-zero) and
 * recv_sync() (zero). The receive routine gets the address of the array
 * pointer, and whatever pointer it leaves there is stored back in the
 * descriptor.
 */
void recv_data(int numP_parents, malleability_data_t *data_struct, int is_asynchronous) {
  int entry;

  for (entry = 0; entry < data_struct->entries; entry++) {
    char *buffer = (char *) data_struct->arrays[entry]; //TODO Check that it really is a char

    if (is_asynchronous) {
      recv_async(&buffer, data_struct->qty[entry], mall->myId, mall->numP, mall->root, mall->intercomm, numP_parents, mall_conf->comm_type);
    } else {
      recv_sync(&buffer, data_struct->qty[entry], mall->myId, mall->numP, mall->root, mall->intercomm, numP_parents);
    }

    data_struct->arrays[entry] = (void *) buffer;
  }
}
//======================================================||
//================PRIVATE FUNCTIONS=====================||
//=====================CHILDREN=========================||
//======================================================||
//======================================================||
/*
 * Initialisation of the children (spawned) processes.
 * The children receive from the parents: the configuration of the
 * execution, the data sent asynchronously and/or synchronously, and the
 * benchmark results gathered so far. With a merge spawn method both
 * groups are joined into one communicator. The intercommunicator with
 * the parents is disconnected before returning.
 */
void Children_init() {
int numP_parents, root_parents, i;
int spawn_is_single;
MPI_Comm aux;
MPI_Bcast(&spawn_is_single, 1, MPI_INT, MALLEABILITY_ROOT, mall->intercomm);
if(spawn_is_single) {
malleability_establish_connection(mall->myId, MALLEABILITY_ROOT, &(mall->intercomm));
}
// These three broadcasts mirror, in the same order, the ones issued by start_redistribution()
MPI_Bcast(&(mall_conf->spawn_type), 1, MPI_INT, MALLEABILITY_ROOT, mall->intercomm);
MPI_Bcast(&root_parents, 1, MPI_INT, MALLEABILITY_ROOT, mall->intercomm);
MPI_Bcast(&numP_parents, 1, MPI_INT, root_parents, mall->intercomm);
mall_conf->config_file = recv_config_file(mall->root, mall->intercomm);
mall_conf->results = (results_data *) malloc(sizeof(results_data));
init_results_data(mall_conf->results, mall_conf->config_file->resizes, RESULTS_INIT_DATA_QTY);
// NOTE(review): init_malleability() sets both "entries" fields to 0 before calling this function and
// nothing visible changes them before this test, so this branch looks unreachable here -- confirm.
// mall_conf->comm_type is also not assigned anywhere in the visible children path.
if(dist_a_data->entries || rep_a_data->entries) { // Receive asynchronous data
comm_data_info(rep_a_data, dist_a_data, MALLEABILITY_CHILDREN, mall->myId, root_parents, mall->intercomm);
if(mall_conf->comm_type == MAL_USE_NORMAL || mall_conf->comm_type == MAL_USE_IBARRIER || mall_conf->comm_type == MAL_USE_POINT) {
recv_data(numP_parents, dist_a_data, 1);
} else if (mall_conf->comm_type == MAL_USE_THREAD) { //TODO Change the usage so that comm_threaded makes sense
recv_data(numP_parents, dist_a_data, 0);
}
mall_conf->results->async_end= MPI_Wtime(); // Timestamp of the end of the asynchronous communication
}
// NOTE(review): called unconditionally here, while the parents only call it when they have synchronous entries -- confirm both sides match
comm_data_info(rep_s_data, dist_s_data, MALLEABILITY_CHILDREN, mall->myId, root_parents, mall->intercomm);
if(dist_s_data->entries || rep_s_data->entries) { // Receive synchronous data
recv_data(numP_parents, dist_s_data, 0);
mall_conf->results->sync_end = MPI_Wtime(); // Timestamp of the end of the synchronous communication
// TODO Create a specific function and add it for the asynchronous data too
// TODO Take the type and qty into account
for(i=0; i<rep_s_data->entries; i++) {
MPI_Datatype datatype;
// Anything that is not MAL_INT is transferred as MPI_CHAR
if(rep_s_data->types[i] == MAL_INT) {
datatype = MPI_INT;
} else {
datatype = MPI_CHAR;
}
MPI_Bcast(rep_s_data->arrays[i], rep_s_data->qty[i], datatype, root_parents, mall->intercomm);
}
}
if(mall_conf->spawn_type == COMM_SPAWN_MERGE || mall_conf->spawn_type == COMM_SPAWN_MERGE_PTHREAD) {
proc_adapt_expand(&(mall->numP), mall->numP+numP_parents, mall->intercomm, &(mall->comm), MALLEABILITY_CHILDREN);
if(mall->thread_comm != MPI_COMM_WORLD) MPI_Comm_free(&(mall->thread_comm));
// The thread and user communicators become duplicates of the communicator left in mall->comm
MPI_Comm_dup(mall->comm, &aux);
mall->thread_comm = aux;
MPI_Comm_dup(mall->comm, &aux);
mall->user_comm = aux;
}
// Store the results of this transmission
recv_results(mall_conf->results, mall->root, mall_conf->config_file->resizes, mall->intercomm);
MPI_Comm_disconnect(&(mall->intercomm));
}
//======================================================||
//================PRIVATE FUNCTIONS=====================||
//=====================PARENTS==========================||
//======================================================||
//======================================================||
/*
 * Creates the children processes, or shrinks the group when a merge
 * method is used and fewer processes are requested.
 * If the spawn runs in the background the current state is returned.
 * The global "state" is updated with the returned value. The spawn time
 * is only recorded here for the non-threaded methods.
 */
int spawn_step(){
  int merge_method;

  mall_conf->results->spawn_start = MPI_Wtime();

  merge_method = mall_conf->spawn_type == COMM_SPAWN_MERGE
              || mall_conf->spawn_type == COMM_SPAWN_MERGE_PTHREAD;
  if (merge_method && mall->numP > mall->numC) {
    state = shrink_redistribution();
    return state;
  }

  state = init_slurm_comm(mall->name_exec, mall->num_cpus, mall->num_nodes, mall->nodelist, mall->myId, mall->numP, mall->numC, mall->root, mall_conf->spawn_dist, mall_conf->spawn_type, mall_conf->spawn_is_single, mall->thread_comm, &(mall->intercomm));

  switch (mall_conf->spawn_type) {
    case COMM_SPAWN_SERIAL:
    case COMM_SPAWN_MERGE:
      mall_conf->results->spawn_time[mall_conf->grp] = MPI_Wtime() - mall_conf->results->spawn_start;
      break;
    default:
      // Threaded methods: their times are recorded by malleability_checkpoint()
      break;
  }

  return state;
}
/*
 * Starts the redistribution of the data with the new group of processes.
 *
 * The spawn method, the root of the parents, the number of parents and
 * the configuration file are sent first. Afterwards:
 *
 * - If there is asynchronous data, its transfer is started (by a thread
 *   or by non-blocking sends) and the function returns without waiting.
 * - Otherwise end_redistribution() is called straight away, which sends
 *   the synchronous data if any, sends the results and disconnects both
 *   groups.
 */
int start_redistribution() {
  const int bcast_root = (mall->myId == mall->root) ? MPI_ROOT : MPI_PROC_NULL;
  int has_async_data;

  MPI_Bcast(&(mall_conf->spawn_type), 1, MPI_INT, bcast_root, mall->intercomm);
  MPI_Bcast(&(mall->root), 1, MPI_INT, bcast_root, mall->intercomm);
  MPI_Bcast(&(mall->numP), 1, MPI_INT, bcast_root, mall->intercomm);
  send_config_file(mall_conf->config_file, bcast_root, mall->intercomm);

  has_async_data = dist_a_data->entries || rep_a_data->entries;
  if (!has_async_data) {
    return end_redistribution();
  }

  // Send asynchronous data
  mall_conf->results->async_start = MPI_Wtime();
  comm_data_info(rep_a_data, dist_a_data, MALLEABILITY_NOT_CHILDREN, mall->myId, mall->root, mall->intercomm);

  if (mall_conf->comm_type != MAL_USE_THREAD) {
    send_data(mall->numC, dist_a_data, MALLEABILITY_USE_ASYNCHRONOUS);
    return MAL_DIST_PENDING;
  }
  return thread_creation();
}
/*
 * @deprecated
 * Checks whether the asynchronous redistribution has finished.
 * If it has not, the function returns MAL_DIST_PENDING. Otherwise it
 * goes on with the synchronous communication, the sending of the
 * results and the disconnection of both groups (end_redistribution()).
 *
 * The meaning of "finished" depends on the communication method:
 * - "MAL_USE_NORMAL" or "MAL_USE_POINT": the parents have finished sending.
 * - "MAL_USE_IBARRIER": the children have finished receiving.
 * Any other method has no request to test, so the execution is aborted
 * instead of testing an uninitialised request pointer.
 *
 * All the processes of "mall->comm" have to call this function, because
 * the local result is agreed on through a collective reduction.
 */
int check_redistribution() {
  int completed = 0, all_completed = 0, test_err;
  MPI_Request *req_completed = NULL;

  //dist_a_data->requests[0][X] //FIXME Magic number 0 -- Change it into a loop?
  switch(mall_conf->comm_type) {
    case MAL_USE_POINT:
      test_err = MPI_Testall(mall->numC, dist_a_data->requests[0], &completed, MPI_STATUSES_IGNORE);
      break;
    case MAL_USE_NORMAL:
      req_completed = &(dist_a_data->requests[0][0]);
      test_err = MPI_Test(req_completed, &completed, MPI_STATUS_IGNORE);
      break;
    case MAL_USE_IBARRIER:
      req_completed = &(dist_a_data->requests[0][1]);
      test_err = MPI_Test(req_completed, &completed, MPI_STATUS_IGNORE);
      break;
    default:
      printf("P%d aborting -- Test Async with unsupported comm_type %d\n", mall->myId, mall_conf->comm_type);
      MPI_Abort(MPI_COMM_WORLD, -1);
      return -1; // Not reached, MPI_Abort ends the execution
  }

  if (test_err != MPI_SUCCESS && test_err != MPI_ERR_PENDING) {
    printf("P%d aborting -- Test Async\n", mall->myId);
    MPI_Abort(MPI_COMM_WORLD, test_err);
  }

  // The minimum is 0 while any process still has pending communication
  MPI_Allreduce(&completed, &all_completed, 1, MPI_INT, MPI_MIN, mall->comm);
  if(!all_completed) return MAL_DIST_PENDING; // Continue only if asynchronous send has ended

  if(mall_conf->comm_type == MAL_USE_IBARRIER) {
    // Mark the asynchronous send as completed. MPI has to be told that this
    // communication has finished before both groups can be disconnected, even
    // though this point can only be reached once it has already finished.
    MPI_Wait(&(dist_a_data->requests[0][0]), MPI_STATUS_IGNORE);
  }
  return end_redistribution();
}
/*
 * Finishes the redistribution of the data with the children.
 *
 * The synchronous communication is performed if there is any: distributed
 * data through send_data() and replicated data through one broadcast per
 * entry. With a merge spawn method both groups are then joined in one
 * communicator. Finally the benchmark results are sent to the children
 * and the intercommunicator is disconnected.
 *
 * Returns MAL_DIST_COMPLETED.
 */
int end_redistribution() {
int result, i, rootBcast = MPI_PROC_NULL;
MPI_Comm aux;
if(mall->myId == mall->root) rootBcast = MPI_ROOT;
if(dist_s_data->entries || rep_s_data->entries) { // Send synchronous data
comm_data_info(rep_s_data, dist_s_data, MALLEABILITY_NOT_CHILDREN, mall->myId, mall->root, mall->intercomm);
send_data(mall->numC, dist_s_data, MALLEABILITY_USE_SYNCHRONOUS);
// TODO Create a specific function and add it for the asynchronous data too
// TODO Take the type into account
for(i=0; i<rep_s_data->entries; i++) {
MPI_Datatype datatype;
// Anything that is not MAL_INT is transferred as MPI_CHAR
if(rep_s_data->types[i] == MAL_INT) {
datatype = MPI_INT;
} else {
datatype = MPI_CHAR;
}
MPI_Bcast(rep_s_data->arrays[i], rep_s_data->qty[i], datatype, rootBcast, mall->intercomm);
}
}
if(mall_conf->spawn_type == COMM_SPAWN_MERGE || mall_conf->spawn_type == COMM_SPAWN_MERGE_PTHREAD) {
double time_adapt = MPI_Wtime(); // Local variable, hides the file-scope "time_adapt"
proc_adapt_expand(&(mall->numP), mall->numC, mall->intercomm, &(mall->comm), MALLEABILITY_NOT_CHILDREN);
if(mall->thread_comm != MPI_COMM_WORLD) MPI_Comm_free(&(mall->thread_comm));
// The thread and user communicators become duplicates of the communicator left in mall->comm
MPI_Comm_dup(mall->comm, &aux);
mall->thread_comm = aux;
MPI_Comm_dup(mall->comm, &aux);
mall->user_comm = aux;
// The time needed to merge the groups is added to the spawn time
mall_conf->results->spawn_time[mall_conf->grp] += MPI_Wtime() - time_adapt;
// result = MAL_DIST_ADAPTED;
}
send_results(mall_conf->results, rootBcast, mall_conf->config_file->resizes, mall->intercomm);
result = MAL_DIST_COMPLETED;
MPI_Comm_disconnect(&(mall->intercomm));
// NOTE(review): malleability_checkpoint() assigns the value returned below to "state", overwriting this reset -- confirm it is intended
state = MAL_NOT_STARTED;
return result;
}
///=============================================
///=============================================
///=============================================
/* Timestamps of the start of the shrink and of the end of the shrink thread's work. */
double time_adapt, time_adapt_end;
/* Progress of the threaded shrink: 0 = not started, 1 = thread running, 2 = thread finished. */
int state_shrink=0; //TODO Refactor
/* Thread that performs the shrink in the background and the communicator it works on. */
pthread_t thread_shrink;
MPI_Comm comm_shrink;
int thread_shrink_creation();
void *thread_shrink_work();
/*
 * Creates the thread that runs thread_shrink_work() in the background.
 * Returns MAL_SPAWN_PENDING on success; aborts the execution if the
 * thread cannot be created.
 */
int thread_shrink_creation() {
  int rc = pthread_create(&thread_shrink, NULL, thread_shrink_work, NULL);

  if (rc == 0) {
    return MAL_SPAWN_PENDING;
  }

  printf("Error al crear el hilo\n");
  MPI_Abort(MPI_COMM_WORLD, -1);
  return -1;
}
/*
 * Body of the shrink thread: adapts "comm_shrink" to "mall->numC"
 * processes, records when it finished and flags the completion.
 * NOTE(review): pthread start routines are expected to take a "void *"
 * argument; this definition and its declaration use an empty parameter list.
 * NOTE(review): "state_shrink" and "time_adapt_end" are plain globals also
 * read by the main thread -- confirm no extra synchronisation is required.
 */
void* thread_shrink_work() {
proc_adapt_shrink(mall->numC, &comm_shrink, mall->myId);
time_adapt_end = MPI_Wtime();
state_shrink=2; // Polled by shrink_redistribution()
pthread_exit(NULL);
}
///=============================================
///=============================================
///=============================================
/*
 * Reduces the group to "mall->numC" processes when a merge method is used.
 *
 * With COMM_SPAWN_MERGE_PTHREAD the first call duplicates the
 * communicator, starts a thread that shrinks the duplicate and returns
 * MAL_SPAWN_PENDING; later calls return MAL_SPAWN_PENDING until every
 * process reports that its thread has finished. With any other method
 * the shrink is performed directly on "mall->comm".
 *
 * Afterwards the processes that leave the group are suspended inside
 * zombies_collect_suspended(). The remaining processes (myId < numC)
 * rebuild the thread and user communicators, record the elapsed times
 * and return MAL_DIST_COMPLETED; the others return MAL_ZOMBIE once they
 * resume.
 *
 * NOTE(review): "state_shrink" is never set back to 0 in the visible code,
 * so a second threaded shrink would skip the thread creation -- confirm.
 */
int shrink_redistribution() {
int global_state;
double time_aux; // Only assigned and read on the COMM_SPAWN_MERGE_PTHREAD path
MPI_Comm aux_comm;
if(mall_conf->spawn_type == COMM_SPAWN_MERGE_PTHREAD) {
if(state_shrink == 0) {
time_adapt = MPI_Wtime();
state_shrink = 1;
MPI_Comm_dup(mall->comm, &comm_shrink);
thread_shrink_creation();
return MAL_SPAWN_PENDING;
} else if(state_shrink>0) {
// The minimum is below 2 while any process still has its thread running
MPI_Allreduce(&state_shrink, &global_state, 1, MPI_INT, MPI_MIN, mall->comm);
if(global_state < 2) return MAL_SPAWN_PENDING;
time_aux = MPI_Wtime();
if(pthread_join(thread_shrink, NULL)) {
printf("Error al esperar al hilo\n");
MPI_Abort(MPI_COMM_WORLD, -1);
return -10;
}
// aux_comm keeps a copy of the communicator that still contains every process
MPI_Comm_dup(mall->comm, &aux_comm);
// NOTE(review): the previous mall->comm handle is replaced without being freed -- confirm
mall->comm = comm_shrink;
}
} else {
time_adapt = MPI_Wtime();
// aux_comm keeps a copy of the communicator that still contains every process
MPI_Comm_dup(mall->comm, &aux_comm);
proc_adapt_shrink( mall->numC, &(mall->comm), mall->myId);
}
//TODO REFACTOR -- Only the call that collects the iterations should stay outside of the threads
zombies_collect_suspended(aux_comm, mall->myId, mall->numP, mall->numC, mall->root, (void *) mall_conf->results, mall->user_comm);
if(mall->myId < mall->numC) {
MPI_Comm_free(&aux_comm);
// aux_comm is reused as a temporary for the two duplicates below
MPI_Comm_dup(mall->comm, &aux_comm);
mall->thread_comm = aux_comm;
MPI_Comm_dup(mall->comm, &aux_comm);
mall->user_comm = aux_comm;
mall_conf->results->spawn_time[mall_conf->grp] = MPI_Wtime() - time_adapt;
if(mall_conf->spawn_type == COMM_SPAWN_MERGE_PTHREAD) {
// Time worked by the thread plus the time spent since the join started
mall_conf->results->spawn_real_time[mall_conf->grp] = time_adapt_end - time_adapt + MPI_Wtime() - time_aux;
}
return MAL_DIST_COMPLETED; //FIXME Refactor Set to SPAWN_COMPLETED
} else {
return MAL_ZOMBIE;
}
}
// TODO MOVER A OTRO LADO??
//======================================================||
//================PRIVATE FUNCTIONS=====================||
//===============COMM PARENTS THREADS===================||
//======================================================||
//======================================================||
/*
 * Creates the thread that runs thread_async_work(), which performs the
 * data communication in the background.
 * Returns MAL_DIST_PENDING on success; aborts the execution if the
 * thread cannot be created.
 */
int thread_creation() {
  int rc = pthread_create(&(mall->async_thread), NULL, thread_async_work, NULL);

  if (rc == 0) {
    return MAL_DIST_PENDING;
  }

  printf("Error al crear el hilo\n");
  MPI_Abort(MPI_COMM_WORLD, -1);
  return -1;
}
/*
 * Check made by the main thread to find out whether the communication
 * thread has finished its background work. All the processes of
 * "mall->comm" have to call it, since the states are reduced collectively.
 *
 * The reduction uses MPI_MAX over the global "state", so the check
 * succeeds as soon as at least one process reports MAL_DIST_COMPLETED;
 * the remaining processes then wait for their thread in pthread_join().
 *
 * Returns MAL_DIST_PENDING while the communication is in progress and
 * the result of end_redistribution() once it has finished.
 */
int thread_check() {
  int reduced_state = 0;

  MPI_Allreduce(&state, &reduced_state, 1, MPI_INT, MPI_MAX, mall->comm);

  if (reduced_state == MAL_DIST_COMPLETED) {
    //FIXME The state MAL_APP_ENDED is not taken into account
    if (pthread_join(mall->async_thread, NULL) != 0) {
      printf("Error al esperar al hilo\n");
      MPI_Abort(MPI_COMM_WORLD, -1);
      return -2;
    }
    return end_redistribution();
  }

  return MAL_DIST_PENDING; // Continue only if asynchronous send has ended
}
/*
 * Function run by the communication thread.
 * It performs a synchronous communication with the children which, from
 * the user's point of view, happens in the background.
 *
 * When the communication ends the main thread can detect it through the
 * global "state", which is set to MAL_DIST_COMPLETED.
 * "void_arg" is not used.
 */
void* thread_async_work(void* void_arg) {
send_data(mall->numC, dist_a_data, MALLEABILITY_USE_SYNCHRONOUS);
state = MAL_DIST_COMPLETED; // Read by thread_check() from the main thread
pthread_exit(NULL);
}
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <mpi.h>
#include "../IOcodes/read_ini.h"
#include "../IOcodes/results.h"
#include "malleabilityStates.h"
/* Life cycle of the module. */
int init_malleability(int myId, int numP, int root, MPI_Comm comm, char *name_exec, char *nodelist, int num_cpus, int num_nodes);
void free_malleability();
/* NOTE(review): no definition of this function is visible next to the others -- confirm it exists. */
void indicate_ending_malleability(int new_outside_state);
/* Advances the reconfiguration; returns the current MAL_* state. */
int malleability_checkpoint();
void set_benchmark_grp(int grp);
/* Configuration of the spawn and communication methods. */
void set_malleability_configuration(int spawn_type, int spawn_is_single, int spawn_dist, int spawn_threaded, int comm_type, int comm_threaded);
void set_children_number(int numC); // TODO TO BE DEPRECATED
void get_malleability_user_comm(MPI_Comm *comm);
/* Registration and retrieval of the data to redistribute. */
void malleability_add_data(void *data, int total_qty, int type, int is_replicated, int is_constant);
void malleability_get_entries(int *entries, int is_replicated, int is_constant);
void malleability_get_data(void **data, int index, int is_replicated, int is_constant);
/* Accessors only needed by the benchmark. */
void set_benchmark_configuration(configuration *config_file);
void get_benchmark_configuration(configuration **config_file);
void set_benchmark_results(results_data *results);
void get_benchmark_results(results_data **results);
// States of the reconfiguration, as stored in "state" and returned by malleability_checkpoint()
#define MAL_UNRESERVED -1
#define MAL_DENIED -2
#define MAL_ZOMBIE -3
#define MAL_NOT_STARTED 0
#define MAL_SPAWN_PENDING 1
#define MAL_SPAWN_SINGLE_START 2
#define MAL_SPAWN_SINGLE_PENDING 3
#define MAL_SPAWN_COMPLETED 4
#define MAL_DIST_PENDING 5
#define MAL_DIST_COMPLETED 6
#define MAL_DIST_ADAPTED 7
// States of the application; they share their numeric values with MAL_NOT_STARTED and MAL_SPAWN_PENDING
#define MAL_APP_EXECUTING 0
#define MAL_APP_ENDED 1
// TODO Refactor
// Physical distribution of the processes
#define COMM_PHY_NODES 1
#define COMM_PHY_CPU 2
// SPAWN METHODS
#define COMM_SPAWN_SERIAL 0
#define COMM_SPAWN_PTHREAD 1
#define COMM_SPAWN_MERGE 2
#define COMM_SPAWN_MERGE_PTHREAD 3
//#define COMM_SPAWN_BASELINE 0
//#define COMM_SPAWN_MERGE 1
//SPAWN STRATEGIES
#define COMM_SPAWN_MULTIPLE 0
#define COMM_SPAWN_SINGLE 1
//#define COMM_SPAWN_SERIAL 0
//#define COMM_SPAWN_PTHREAD 1
// Communication methods for the asynchronous redistribution
#define MAL_USE_NORMAL 0
#define MAL_USE_IBARRIER 1
#define MAL_USE_POINT 2
#define MAL_USE_THREAD 3
// Types of the registered data
#define MAL_INT 0
#define MAL_CHAR 1
// Role of a process group, as returned by init_malleability()
#define MALLEABILITY_CHILDREN 1
#define MALLEABILITY_NOT_CHILDREN 0
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <mpi.h>
//#include <slurm/slurm.h>
#include <signal.h>
#include "../IOcodes/results.h"
#include "malleabilityZombies.h"
/* Number of pid slots reserved by zombies_service_init(). */
#define PIDS_QTY 320
void zombies_suspend();
/* offset_pids: number of pids stored so far; pids: array with the pids of the suspended processes. */
int offset_pids, *pids = NULL;
/*
 * Handler installed for SIGUSR2 by zombies_suspend(). It does nothing on
 * purpose: delivering the signal is enough to make sigsuspend() return.
 * It takes the signal number so that its type matches "sa_handler".
 */
void gestor_usr2(int signum) {
  (void) signum; // Unused
}
/*
 * Gathers at "root" the pids of the processes that leave the group
 * (ranks numC..numP-1 of "comm") and suspends those processes.
 *
 * The processes that stay (myId < numC) contribute nothing to the
 * gather. The root appends the received pids to the global "pids" array
 * starting at "offset_pids", which is then advanced. The execution is
 * aborted if the array was never reserved or has no room left for the
 * new pids, instead of writing outside of it.
 *
 * The leaving processes first take part in compute_results_iter() on
 * "user_comm" and then block in zombies_suspend() until they are woken.
 *
 * "results_void" has to point to a results_data structure.
 * NOTE(review): the root is assumed to be one of the staying processes.
 */
void zombies_collect_suspended(MPI_Comm comm, int myId, int numP, int numC, int root, void *results_void, MPI_Comm user_comm) {
  int pid = getpid();
  int count = (myId < numC) ? 0 : 1;
  int i;
  // Zero-initialised so that no indeterminate count/displacement reaches MPI_Gatherv
  int *pids_counts = calloc((size_t) numP, sizeof(*pids_counts));
  int *pids_displs = calloc((size_t) numP, sizeof(*pids_displs));

  if(numP > 0 && (pids_counts == NULL || pids_displs == NULL)) {
    fprintf(stderr, "P%d aborting -- zombies_collect_suspended: out of memory\n", myId);
    MPI_Abort(MPI_COMM_WORLD, -1);
  }

  if(myId < numC && myId == root) {
    int new_zombies = numP - numC;

    if(pids == NULL || offset_pids + new_zombies > PIDS_QTY) {
      fprintf(stderr, "P%d aborting -- zombies_collect_suspended: no room for %d more pids (%d of %d in use)\n",
              myId, new_zombies, offset_pids, PIDS_QTY);
      MPI_Abort(MPI_COMM_WORLD, -1);
    }

    // Ranks 0..numC-1 keep a count of 0; the leaving ranks send one pid each
    for(i=numC; i<numP; i++) {
      pids_counts[i] = 1;
      pids_displs[i] = (i + offset_pids) - numC;
    }
    offset_pids += new_zombies;
  }

  MPI_Gatherv(&pid, count, MPI_INT, pids, pids_counts, pids_displs, MPI_INT, root, comm);
  free(pids_counts);
  free(pids_displs);

  if(myId >= numC) {
    // Needed to ensure iteration times are collected before suspending these processes
    results_data *results = (results_data *) results_void;
    compute_results_iter(results, myId, root, user_comm);
    zombies_suspend();
  }
}
void zombies_service_init() {
offset_pids = 0;
pids = malloc(PIDS_QTY * sizeof(int));
for(int i=0; i<PIDS_QTY; i++) {
pids[i] = 0;
}
}
void zombies_service_free() {
free(pids);
}
void zombies_suspend() {
struct sigaction act;
sigemptyset(&act.sa_mask);
act.sa_flags=0;
act.sa_handler=gestor_usr2;
sigaction(SIGUSR2, &act, NULL);
sigset_t set;
sigprocmask(SIG_SETMASK,NULL,&set);
sigsuspend(&set);
}
/*
 * Wakes up the suspended processes by sending SIGUSR2 to every stored pid.
 * Slots that do not hold a valid pid are skipped: kill() treats 0 and
 * negative values as process groups, so signalling them would reach
 * processes that were never suspended.
 */
void zombies_awake() {
  if(pids == NULL) return;

  for(int i=0; i < offset_pids; i++) { // Wake up the zombies
    if(pids[i] > 0) {
      kill(pids[i], SIGUSR2);
    }
  }
}
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <mpi.h>
//#include <slurm/slurm.h>
#include <signal.h>
/* Gathers at "root" the pids of ranks numC..numP-1 of "comm" and suspends those processes. */
void zombies_collect_suspended(MPI_Comm comm, int myId, int numP, int numC, int root, void *results_void, MPI_Comm user_comm);
/* Reserves / releases the array that stores the pids of the suspended processes. */
void zombies_service_init();
void zombies_service_free();
/* Sends SIGUSR2 to every stored pid. */
void zombies_awake();
#!/bin/bash
# Reads the number of processes and the physical distribution of the
# section delimited by the lines matching "[resize0]" in the configuration
# file given as first argument, runs Recordnodelist.o with both values and
# prints the number of processes on stdout.
#
# Usage: <this script> <config_file>

if [ $# -lt 1 ]; then
  echo "Usage: $0 <config_file>" >&2
  exit 1
fi

dir="/home/martini/malleability_benchmark/Codes/auxiliar_codes"

# Line numbers of the matches; two matches are expected (ini and fin)
aux=$(grep "\[resize0\]" -n "$1" | cut -d ":" -f1)
# $aux is left unquoted on purpose so both line numbers end up on one line
read -r ini fin <<<"$(echo $aux)"
diff=$(( fin - ini ))

numP=$(head -n "$fin" "$1" | tail -n "$diff" | cut -d ';' -f1 | grep procs | cut -d '=' -f2)
dist=$(head -n "$fin" "$1" | tail -n "$diff" | cut -d ';' -f1 | grep physical_dist | cut -d '=' -f2)

# The value may carry blanks that preceded the ';' of the config line
dist=${dist//[[:space:]]/}
if [ "$dist" = "node" ]; then
  dist=1
elif [ "$dist" = "cpu" ]; then
  dist=2
fi

# $numP is left unquoted on purpose: word splitting drops its surrounding blanks
"$dir/Recordnodelist.o" $numP "$dist"
echo $numP
......@@ -2,24 +2,32 @@
#SBATCH -p P1
#SBATCH -N 1
#SBATCH --exclude=c01,c00,c02
# NOTE(review): this span looks like a rendered diff hunk that interleaves
# removed and added lines of the same job script (two different mpirun
# invocations, "$dir$codeDir" next to "$PROTEO_HOME$execDir"). Confirm
# against the real file before relying on the order of the lines below.
dir="/home/martini/malleability_benchmark"
codeDir="/Codes"
partition='P1'
source build/config.txt
cores=$(bash $PROTEO_HOME$execDir/BashScripts/getCores.sh $partition)
nodelist=$SLURM_JOB_NODELIST
nodes=$SLURM_JOB_NUM_NODES
configFile=$1
echo "MPICH"
module load mpich-3.4.1-noucx
#export HYDRA_DEBUG=1
#-disable-hostname-propagation -disable-auto-cleanup -pmi-port -hosts n00,n01
numP=$(bash recordMachinefile.sh $1)
# Output index: 0 unless a second argument is given
outIndex=0
if [ $# -ge 2 ]
then
outIndex=$2
fi
mpirun -print-all-exitcodes -f hostfile.o$SLURM_JOB_ID $dir$codeDir/a.out $1 $2 $nodelist $nodes
#mpirun -np $numP $dir$codeDir/a.out $1 $2 $nodelist $nodes
rm hostfile.o$SLURM_JOB_ID
echo "MPICH provider=$FI_PROVIDER"
mpirun --version
numP=$(bash $PROTEO_HOME$execDir/BashScripts/getNumPNeeded.sh $configFile 0)
initial_nodelist=$(bash $PROTEO_HOME$execDir/BashScripts/createInitialNodelist.sh $numP $cores $nodelist)
echo $initial_nodelist
echo "Test PreRUN $numP $nodelist"
mpirun -hosts $initial_nodelist -np $numP $PROTEO_BIN $configFile $outIndex
echo "END RUN"
# Replace the abort messages with code -100 in the Slurm output by "shrink cleaning"
sed -i 's/application called MPI_Abort(MPI_COMM_WORLD, -100) - process/shrink cleaning/g' slurm-$SLURM_JOB_ID.out
sed -i 's/Abort(-100)/shrink cleaning/g' slurm-$SLURM_JOB_ID.out
# Remove the temporary files whose name contains the last three digits of the job id
MAM_ID=$(($SLURM_JOB_ID % 1000))
rm MAM_HF_ID*$MAM_ID*S*.tmp
#!/bin/bash
#SBATCH -p P1
#SBATCH -N 1
#SBATCH --exclude=c01,c00,c02

# Runs the benchmark under ./trace.sh through srun and moves the generated
# traces and results to a directory below $PROTEO_HOME/Results.
#
# Usage: sbatch <this script> <config_file> [out_index]
#   out_index defaults to 0, as in the other run scripts, so that a missing
#   second argument does not shift the arguments given to the program.

scriptDir="$(dirname "$0")"
source "$scriptDir/build/config.txt"
resultsDir="/Results"
nodelist=$SLURM_JOB_NODELIST
nodes=$SLURM_JOB_NUM_NODES
configFile=$1
outIndex=0
if [ $# -ge 2 ]
then
  outIndex=$2
fi

echo "MPICH"
numP=$(bash "$PROTEO_HOME$execDir/BashScripts/getNumPNeeded.sh" "$configFile" 0)
name_res="Extrae_"$nodes"_Test_"$numP
dir_name_res=$PROTEO_HOME$resultsDir"/"$name_res

#mpirun -np $numP $PROTEO_BIN $configFile $outIndex $nodelist $nodes
srun -n$numP --mpi=pmi2 ./trace.sh $PROTEO_BIN "$configFile" "$outIndex" $nodelist $nodes
echo "END RUN"

# Replace the abort messages with code -100 in the Slurm output by "shrink cleaning"
sed -i 's/application called MPI_Abort(MPI_COMM_WORLD, -100) - process/shrink cleaning/g' "slurm-$SLURM_JOB_ID.out"
sed -i 's/Abort(-100)/shrink cleaning/g' "slurm-$SLURM_JOB_ID.out"
# Nothing in this script creates the hostfile; -f keeps a missing file from raising an error
rm -f "hostfile.o$SLURM_JOB_ID"

echo "MOVING DATA"
# -p: do not fail if the directory is left over from a previous run
mkdir -p "$dir_name_res"
mv a.out.* "$dir_name_res"
mv TRACE* "$dir_name_res"
mv set-0/ "$dir_name_res"
mv R"$outIndex"* "$dir_name_res"
echo "JOB ENDED"
#!/bin/bash
#SBATCH -p P1
#SBATCH -N 1
#SBATCH --exclude=c01,c00,c02

# Runs one configuration with every process wrapped by valgrind.
# Parameter 1 - Configuration file of the run
# Parameter 2(Optional) - Output index of the run. Default value is 0.

. build/config.txt
cores=$(bash ${PROTEO_HOME}${execDir}/BashScripts/getCores.sh $partition)

nodelist=$SLURM_JOB_NODELIST
nodes=$SLURM_JOB_NUM_NODES
configFile=$1
# Falls back to 0 only when the second argument was not given at all
outIndex=${2-0}

echo "MPICH provider=$FI_PROVIDER"
mpirun --version

numP=$(bash ${PROTEO_HOME}${execDir}/BashScripts/getNumPNeeded.sh $configFile 0)
initial_nodelist=$(bash ${PROTEO_HOME}${execDir}/BashScripts/createInitialNodelist.sh $numP $cores $nodelist)
echo $initial_nodelist
echo "Test PreRUN $numP $nodelist"

# One valgrind log per process; %p is expanded by valgrind to the PID
valgrind_cmd=(valgrind --leak-check=full --show-leak-kinds=all --track-origins=yes
              --trace-children=yes --log-file=vg.sp.%p.$SLURM_JOB_ID)
mpirun -hosts $initial_nodelist -np $numP "${valgrind_cmd[@]}" $PROTEO_BIN $configFile $outIndex
echo "END RUN"

# Relabel the abort messages in the Slurm output, longest message first
for abort_msg in 'application called MPI_Abort(MPI_COMM_WORLD, -100) - process' 'Abort(-100)'
do
  sed -i "s/$abort_msg/shrink cleaning/g" slurm-$SLURM_JOB_ID.out
done

# Remove the temporary files whose name carries the last digits of the job id
MAM_ID=$(($SLURM_JOB_ID % 1000))
rm MAM_HF_ID0$MAM_ID*.tmp
[general]
resizes=1 ; Numero de redistribuciones
matrix_tam=1000 ; Tamaño en bytes de la matriz de computo
comm_tam=1000 ; Tamaño en bytes de los datos a comunicar en cada iteracion. Una sola vez
SDR=100000000 ; Tamaño en bytes a redistribuir de forma sincrona
ADR=100000000 ; Tamaño en bytes a redistribuir de forma asincrona 1000000000
AIB=3 ; Indica si las redistribuciones asíncronas se consideran terminadas para los padres
; cuando terminan de enviar (0), cuando terminan de recibir los valores (1)
; o usar comunicaciones punto a punto (2), o utilizar hebras(3)
CST=0 ; Indica como realizar el spawn. (0) Para el método baseline, (1) para el método
; baseline con hilos, (2) para el método merge y (3) para el método merge
; con hilos
CSS=0 ; Indica si el spawn se realiza con todos los procesos (0) o solo participa
; el proceso raiz (1)
time=1 ; Tiempo necesario para realizar una iteracion
; end [general]
[resize0] ; Grupo inicial(mpirun)
iters=10 ; Numero de iteraciones a realizar por este grupo
procs=2 ; Cantidad de procesos en el grupo
factor=1 ; Factor de coste
physical_dist=node ; Tipo de redistribución física de los procesos
Total_Resizes=1
Total_Stages=4
Granularity=600
SDR=500000.0
ADR=500000.0
Rigid=0
Capture_Method=0
;end [general]
[stage0]
Stage_Type=1
Stage_Bytes=0
Stage_Time=0.6085314
;end [stage0]
[stage1]
Stage_Type=8
Stage_Bytes=8
Stage_Time=0
;end [stage1]
[stage2]
Stage_Type=8
Stage_Bytes=8
Stage_Time=0
;end [stage2]
[stage3]
Stage_Type=6
Stage_Bytes=33176880
Stage_Time=0
;end [stage3]
[resize0]
Iters=5
Procs=20
FactorS=0.2
Dist=compact
Redistribution_Method=0
Redistribution_Strategy=1
Spawn_Method=0
Spawn_Strategy=1
;end [resize0]
[resize1] ; Grupo de hijos 1
iters=20
procs=4
factor=0.5
physical_dist=node
[resize1]
Iters=50
Procs=2
FactorS=1
Dist=compact
Redistribution_Method=0
Redistribution_Strategy=0
Spawn_Method=0
Spawn_Strategy=0
;end [resize1]
#!/bin/bash
# Wrapper that runs the command given as arguments with the Extrae MPI
# tracing library preloaded.
# Parameters - Command to run followed by its own arguments

# Configure Extrae
export EXTRAE_CONFIG_FILE=./extrae.xml
# Load the tracing library (choose C/Fortran)
export LD_PRELOAD=${EXTRAE_HOME}/lib/libmpitrace.so
# Run the program. "$@" keeps every argument as one word, while $* would
# split again any argument that contains spaces or glob characters
"$@"
#!/bin/bash
# Gets the first nodelist that will be used during the emulation.
# Each selected node is printed once per core, separated by commas.
# Parameter 1 - Number of processes of the first group
# Parameter 2 - Number of cores per node. The machines must be homogeneous
# Parameter 3 - Nodelist in Slurm format, e.g. "n[00-03,05]" or "n01"

#====== Do not modify these values =======
numP=$1
cores=$2
nodelist=$3

# Only whole nodes are handed out and at least one node is always used
initial_node_qty=$(($numP / $cores))
if [ $initial_node_qty -eq 0 ]
then
  initial_node_qty=1
fi

# The common name is what precedes the bracket or, when there is no bracket,
# the characters before the first digit or comma
if [[ $nodelist == *"["* ]]; then
  common_node_name=${nodelist%%\[*}
else
  common_node_name=${nodelist%%[0-9,]*}
fi

# Drop the brackets and split the remaining text into comma separated items,
# each one being a single index or a "first-last" range
node_ranges=${nodelist//\[/}
node_ranges=${node_ranges//\]/}
IFS=',' read -r -a node_array <<< "$node_ranges"

initial_nodelist=""
actual_node_qty=0
for item in "${node_array[@]}"
do
  if [ "$actual_node_qty" -ge "$initial_node_qty" ]
  then
    break
  fi

  # Expand the item into full node names
  short=${item#"$common_node_name"}
  names=()
  if [[ $short =~ ^[0-9]+(-[0-9]+)?$ ]]
  then
    first=${short%%-*}
    last=${short##*-}
    width=${#first} # Keeps the zero padding used by the nodelist
    # 10# forces base 10 so indexes such as 08 are not read as octal
    for ((index=10#$first; index<=10#$last; index++))
    do
      names+=("$common_node_name$(printf "%0${width}d" $index)")
    done
  else
    # Item without a numeric index, the name is used as it is
    names+=("$item")
  fi

  for name in "${names[@]}"
  do
    if [ "$actual_node_qty" -ge "$initial_node_qty" ]
    then
      break
    fi
    #FIXME What if less than $cores have to be spawned?
    for ((core=0; core<cores; core++)) # FIXME What if the user asks for a spread distribution
    do
      initial_nodelist="${initial_nodelist:+$initial_nodelist,}$name"
    done
    actual_node_qty=$(($actual_node_qty + 1))
  done
done

#Print result
echo $initial_nodelist
#!/bin/bash
# Obtains the number of total cores of a node in a homogeneous partition
# Parameter 1 - Partition to use
# Prints the CPUTot value reported by Slurm for the first node of the partition

#====== Do not modify these values =======
partition=$1

# The nodelist is the fifth whitespace separated column of the summary.
# awk is used because splitting on ':' breaks with time limits such as 1:00:00
hostlist=$(sinfo -hs --partition $partition | awk '{print $5}')
# All nodes are assumed to be equal, so the first one is representative
basic_node=$(scontrol show hostname $hostlist | head -n1)
# Select the value by its key instead of by its position in the line
cores=$(scontrol show node $basic_node | grep -o 'CPUTot=[0-9]*' | head -n1 | cut -d '=' -f2)
echo "$cores"
#!/bin/bash
# Obtains for a given configuration file how many nodes will be needed
# Parameter 1 - Configuration file name for the emulation.
# Parameter 2 - Number of cores in the machines. The machines must be homogeneous. Must be a positive number.
# The variable PROTEO_HOME must be defined in the environment.
# Prints the amount of nodes needed by the biggest group of processes.

#====== Do not modify these values =======
execDir="/Exec"

if [ "$#" -lt "2" ]
then
  echo "Not enough arguments"
  echo "Usage -> bash getMaxNodesNeeded.sh Configuration.ini NumCores"
  exit -1
fi

config_file=$1
cores=$2
max_numP=-1

# There is one group more than resizes, as the initial group is not a resize
total_resizes=$(grep Total_Resizes $config_file | cut -d '=' -f2)
total_groups=$(($total_resizes + 1))
for ((j=0; j<total_groups; j++));
do
  numP=$(bash $PROTEO_HOME$execDir/BashScripts/getNumPNeeded.sh $config_file $j)
  # A group whose Procs value could not be read must not break the comparison
  if [[ ! $numP =~ ^[0-9]+$ ]]
  then
    echo "Could not read the number of processes of group $j in $config_file" >&2
    continue
  fi
  if [ "$numP" -gt "$max_numP" ];
  then
    max_numP=$numP
  fi
done

# Round up, a partially used node is still a needed node
node_qty=$(( ($max_numP + $cores - 1) / $cores ))
if [ $node_qty -le 0 ]
then
  node_qty=1
fi
echo $node_qty
#!/bin/bash
# Prints the Procs value of one group of processes of a configuration file
# Parameter 1 - Configuration file name
# Parameter 2 - Index of the group ([resizeX] section) to read

#====== Do not modify these values =======
config_file=$1
group_index=$2

# Line numbers where "[resizeX]" appears: the section header and its ";end" line
section_marks=($(grep "\[resize$group_index\]" -n $config_file | cut -d ":" -f1))
first_line=${section_marks[0]}
last_line=${section_marks[1]:-${section_marks[0]}}
range_lines=$(( last_line - first_line ))

# Keep the lines after the header, drop trailing comments and read the value
numP=$(head -$last_line $config_file | tail -$range_lines | cut -d ';' -f1 | grep Procs | cut -d '=' -f2)
echo $numP
#!/bin/bash
dir="/home/martini/malleability_benchmark/"
codeDir="Codes/"
execDir="Exec/"
ResultsDir="Results/"
ResultsDirName=$1
maxIndex=$2
cantidadGrupos=$3 #Contando a los padres
totalEjGrupo=$4 #Total de ejecuciones por grupo
maxTime=$5 #Maximo tiempo que se considera válido
if [ $# -lt 3 ]
partition="P1"
# Checks if all the runs in the current working directory performed under a
# Slurm manager have been performed correctly and if some runs can be corrected
# they are launched again
# Parameter 1 - Common name of the configuration files
# Parameter 2 - Maximum index of the runs
# Parameter 3 - Amount of repetitions per index/run
# Parameter 4 - Total stages in all runs. #FIXME The amount of stages must be equal across all the runs, must be modified in the future.
# Parameter 5 - Total groups of processes in all runs #FIXME The amount of groups must be equal across all the runs, must be modified in the future.
# Parameter 6 - Maximum valid iteration time across all runs. If an iteration time
# is higher, that particular repetition inside the run is cleaned and
# launched again.
# Parameter 7(Optional) - Maximum amount of time in seconds needed by a single execution. Default value is 0, which indicates infinite time.
# Must be a positive integer.
#====== Do not modify the following values =======
scriptDir="$(dirname "$0")"
source $scriptDir/../Codes/build/config.txt
cores=$(bash $PROTEO_HOME$execDir/BashScripts/getCores.sh $partition)
if [ "$#" -lt "6" ]
then
echo "Faltan argumentos"
echo "Uso -> bash CheckRun NombreDirectorio IndiceMaximo Grupos"
echo "Not enough arguments"
echo "Usage -> bash CheckRun.sh Common_Name maxIndex total_repetitions total_stages total_groups max_iteration_time [limit_time]"
exit -1
fi
cd $dir$ResultsDir
if [ ! -d $ResultsDirName ]
common_name=$1
maxIndex=$2
totalEjGrupo=$3 #Total de ejecuciones por grupo
total_stages=$4
total_groups=$5
maxTime=$6 #Maximo tiempo que se considera válido
limit_time_exec=0
if [ $# -ge 7 ] #Max time per execution in seconds
then
echo "La carpeta de resultados $ResultsDirName no existe. Abortando"
exit -1
limit_time_exec=$7
fi
cd $ResultsDirName
#Comprobar si hay errores
#Si los hay, salir
grep -i -e fatal -e error -e abort -e == */slurm* > errores2.txt
qty=$(wc -l errores2.txt | cut -d ' ' -f1)
limit_time=0
exec_lines_basic=6
iter_lines_basic=3
exec_total_lines=$(($exec_lines_basic+$total_stages+$total_groups))
iter_total_lines=$(($iter_lines_basic+$total_stages*2+1))
exec_remove=$(($exec_lines_basic+$total_stages+$total_groups-1))
iter_remove=$(($iter_lines_basic+$total_stages))
if [ $qty -gt 0 ]
#Check if there are fatal errors during executions
grep -i -e fatal -e error -e abort -e == slurm* > errores2.txt
qty=$(wc -l errores2.txt | cut -d ' ' -f1)
if [ "$qty" -gt "0" ]
then
echo "Se han encontrado errores de ejecución graves. Abortando"
echo "Revisar archivo errores2.txt en el directorio $ResultsDirName"
echo "Found Fatal errors during execution. Aborting"
echo "Read file errors2 to see the errors and in which files"
echo "FAILURE"
exit -2
fi
rm errores2.txt
#Comprobar que el número de archivos es correcto
#Pueden estar todos los archivos pero no estar los archivos
#completos -- Esto se comprueba más tarde
qtyG=$(ls R*/R*_Global.out | wc -l)
qtyG=$(($qtyG * 2))
qtyL=$(ls R*/R*_G?N*.out | wc -l)
if [ $qtyG == $qtyL ]
#Check if the number of output files is correct.
#If the number is not correct is a fatal error and the user
# is informed in which runs the amount does not match, and
# then the scripts exits.
#The user must figure out what to do with those runs.
qtyG=$(ls R*_Global.out | wc -l)
qtyG=$(($qtyG * $total_groups))
qtyL=$(ls R*_G*N*.out | wc -l)
if [ "$qtyG" == "$qtyL" ]
then
echo "El numero de ficheros G($qtyG) y L($qtyL) coincide"
echo "Number of G($qtyG) and L($qtyL) files match"
else
#Si faltan archivos, se indican cuales faltan
echo "Faltan ejecuciones Locales o globales"
for ((i=1; i<$maxIndex; i++))
echo "Lacking Local($qtyL) or global($qtyG) files. Aborting"
echo "Lacking Local($qtyL) or global($qtyG) files. Aborting" > errores2.txt
for ((i=0; i<$maxIndex; i++))
do
qtyEx=$(grep Tex -r Run$i | wc -l)
qtyIt=$(grep Top -r Run$i | wc -l)
qtyEx=$(grep T_total R"$i"_Global.out | wc -l)
qtyIt=$(grep T_iter R"$i"_G*N*.out | wc -l)
qtyEx=$(($qtyEx * 2))
if [ $qtyEx -ne $qtyIt ]
if [ "$qtyEx" -ne "$qtyIt" ]
then
diff=$(($totalEjGrupo-$qtyEx))
echo "Faltan archivos en Run$i"
echo "Files do not match at Run $i -- diff=$diff"
echo "Files do not match at Run $i -- diff=$diff" >> errores2.txt
fi
done
echo "FAILURE"
exit -1
fi
rm errores2.txt
#grep -rn "2.\." R* TODO Testear que el tiempo teorico maximo es valido?
# Check if there is any negative execution time
# Only invalid IDs are stored
rm -f tmp.txt
touch tmp.txt
exec_ids=($(grep -n "T_total" R*_Global.out | grep - | cut -d '_' -f1 | cut -d 'R' -f2))
exec_line=($(grep -n "T_total" R*_Global.out | grep - | cut -d ':' -f2))
for ((i=${#exec_ids[@]}-1; i>=0; i--))
do
first_line=$((${exec_line[$i]}-$exec_remove))
last_line=$(($first_line+$exec_total_lines-1))
echo "${exec_ids[$i]}:$first_line:$last_line" >> tmp.txt
done
# Check if there is any iter time higher than expected
# Only invalid IDs are stored
iter_times=($(grep "T_iter" R*_G*N*.out | cut -d ' ' -f2))
iter_ids=($(grep "T_iter" R*_G*N*.out | cut -d '_' -f1 | cut -d 'R' -f2))
iter_line=($(grep -n "T_iter" R*_G*N*.out | cut -d ':' -f2))
for ((i=${#iter_times[@]}-1; i>=0; i--))
do
is_invalid=$(echo ${iter_times[$i]}'>'$maxTime | bc -l)
if [ $is_invalid -eq 1 ]
then
first_line=$((${iter_line[$i]}-$iter_remove))
# Translate line number to Global file
first_line=$(($first_line/$iter_total_lines))
first_line=$(($first_line*$exec_total_lines+1))
last_line=$(($first_line+$exec_total_lines-1))
echo "${iter_ids[$i]}:$first_line:$last_line" >> tmp.txt
fi
done
#Comprobar si hay runs con tiempo negativos
#Si los hay, reejecutar e informar de cuales son
grep - */R* | grep Tex > errores.txt
qty=$(wc -l errores.txt | cut -d ' ' -f1)
if [ $qty -gt 0 ]
#Clean data from collected erroneous executions
qty=$(wc -l tmp.txt | cut -d ' ' -f1)
if [ "$qty" -gt 0 ];
then
echo "Se han encontrado errores de ejecución leves. Volviendo a ejecutar"
echo "Found minor execution errors. Executing again. Review file errores.txt."
echo "CHECKRUN -- Found errors" >> errores.txt
while IFS="" read -r lineRun || [ -n "$lineRun" ]
do
#Obtener datos de una ejecución erronea
run=$(echo $lineRun | cut -d 'R' -f3 | cut -d '_' -f1)
if [ $run -gt $maxIndex ]
then #Indice de ejecuciones posteriores echas a mano -- FIXME Eliminar?
realRun=$(($run - $maxIndex))
index=$run
else # Indice de las primeras ejecuciones
realRun=$run
index=$(($run + $maxIndex))
fi
echo "Run $run"
cd Run$realRun
#Arreglar ejecuccion
#1 - Borrar lineas erroneas
qty=$(grep -n - R* | grep Tex | wc -l)
for ((i=0; i<qty; i++))
#Obtain data of erroneous execution
run=$(echo $lineRun | cut -d ':' -f1)
echo "Run $run had an erroneous execution, cleaning bad data."
echo "Run$run----------------------------------------------" >> errores.txt
#1 - Delete erroneous lines in Global file
first_line=$(echo $lineRun | cut -d ':' -f2)
last_line=$(echo $lineRun | cut -d ':' -f3)
sed -n ''$first_line','$last_line'p' R${run}_Global.out >> errores.txt
sed -i ''$first_line','$last_line'd' R${run}_Global.out
#2 - Translate line numbers to Local files type
first_line=$(($first_line/$exec_total_lines))
first_line=$(($first_line*$iter_total_lines+1))
last_line=$(($first_line+$iter_total_lines-1))
#3 - Delete erroneous lines in Local files
for ((j=0; j<total_groups; j++));
do
fin=$(grep -n - R* | grep Tex | cut -d ':' -f2 | head -n1)
init=$(($fin - 7))
sed -i ''$init','$fin'd' R${realRun}_Global.out
#Se borran las lineas de los ficheros locales asociados
aux=$(($fin / 8)) #Utilizado para saber de entre las ejecuciones del fichero, cual es la erronea
fin=$(($aux * 5))
init=$(($fin - 4))
for ((j=0; j<cantidadGrupos; j++)); do
sed -i ''$init','$fin'd' R${realRun}_G${j}*
done
sed -n ''$first_line','$last_line'p' R${run}_G${j}* >> errores.txt
sed -i ''$first_line','$last_line'd' R${run}_G${j}*
done
echo "--------------------------------------------------" >> errores.txt
#2 - Reelanzar ejecucion
proc_list=$(grep Procs R${realRun}_Global.out | cut -d '=' -f3 | cut -d ',' -f1)
proc_parents=$(echo $proc_list | cut -d ' ' -f1)
proc_children=$(echo $proc_list | cut -d ' ' -f2)
nodes=8 # Maximo actual
if [ $proc_parents -gt $proc_children ]
then
nodes=$(($proc_parents / 20))
else
nodes=$(($proc_children / 20))
fi
sbatch -N $nodes $dir$execDir./singleRun.sh config$realRun.ini $index
cd $dir$ResultsDir$ResultsDirName
done < errores.txt
exit 0
done < tmp.txt
fi
#Comprobar que todas las ejecuciones tienen todas las ejecucciones que tocan
#Solo es necesario comprobar el global.
#Check if all repetitions for each Run have been executed
#If any run lacks repetitions, the job is automatically launched again
#If a run has even executed a repetition, is not launched as it could be in the waiting queue
qty_missing=0
cd $dir$ResultsDir$ResultsDirName
for ((i=1; i<$maxIndex; i++))
use_extrae=0
for ((run=0; run<$maxIndex; run++))
do
qtyEx=$(grep Tex -r Run$i | wc -l)
if [ $qtyEx -ne $totalEjGrupo ]
diff=0
if [ -f "R${run}_Global.out" ]
then
qtyEx=$(grep T_total R"$run"_Global.out | wc -l)
if [ "$qtyEx" -ne "$totalEjGrupo" ];
then
diff=$(($totalEjGrupo-$qtyEx))
qty_missing=$(($qty_missing+1))
echo "Faltan en $i, $diff ejecuciones"
echo "Run$run lacks $diff repetitions"
fi
else
diff=$(($totalEjGrupo))
echo "Run$run results not found -- Trying to execute"
fi
if [ $diff -ne 0 ] #Execute if needed
then
qty_missing=$(($qty_missing+$diff))
if [ $limit_time_exec -ne 0 ] #Max time per execution in seconds
then
limit_time=$(($limit_time_exec*$diff/60+1))
fi
#2 - Obtain number of nodes needed
config_file="$common_name$run.ini"
node_qty=$(bash $PROTEO_HOME$execDir/BashScripts/getMaxNodesNeeded.sh $config_file $cores)
#3 - Launch execution
sbatch -p $partition -N $node_qty -t $limit_time $PROTEO_HOME$execDir/generalRun.sh $cores $config_file $use_extrae $run $diff
fi
done
if [ $qty_missing -eq 0 ]
if [ "$qty_missing" -eq "0" ];
then
echo "Todos los archivos tienen $totalEjGrupo ejecuciones"
echo "SUCCESS"
else
echo "REPEATING - A total of $qty_missing executions are being repeated"
fi
<?xml version='1.0'?>
<trace enabled="yes"
home="/home/martini/extrae-mpich-3.4.1-noucx"
initial-mode="detail"
type="paraver"
>
<mpi enabled="yes">
<counters enabled="yes" />
<comm-calls enabled="yes" />
</mpi>
<openmp enabled="no" ompt="no">
<locks enabled="no" />
<taskloop enabled="no" />
<counters enabled="yes" />
</openmp>
<pthread enabled="no">
<locks enabled="no" />
<counters enabled="yes" />
</pthread>
<callers enabled="yes">
<mpi enabled="yes">1-3</mpi>
<sampling enabled="no">1-5</sampling>
<dynamic-memory enabled="no">1-3</dynamic-memory>
<input-output enabled="no">1-3</input-output>
<syscall enabled="no">1-3</syscall>
</callers>
<user-functions enabled="no" list="/home/bsc41/bsc41273/user-functions.dat" exclude-automatic-functions="no">
<counters enabled="yes" />
</user-functions>
<counters enabled="yes">
<cpu enabled="yes" starting-set-distribution="1">
<set enabled="yes" domain="all" changeat-time="0">
PAPI_TOT_INS,PAPI_TOT_CYC
</set>
</cpu>
<network enabled="no" />
<resource-usage enabled="no" />
<memory-usage enabled="no" />
</counters>
<storage enabled="no">
<trace-prefix enabled="yes">TRACE</trace-prefix>
<size enabled="no">5</size>
<temporal-directory enabled="yes">/scratch</temporal-directory>
<final-directory enabled="yes">/gpfs/scratch/bsc41/bsc41273</final-directory>
</storage>
<buffer enabled="yes">
<size enabled="yes">5000000</size>
<circular enabled="no" />
</buffer>
<trace-control enabled="no">
<file enabled="no" frequency="5M">/gpfs/scratch/bsc41/bsc41273/control</file>
<global-ops enabled="no"></global-ops>
</trace-control>
<others enabled="yes">
<minimum-time enabled="no">10M</minimum-time>
<finalize-on-signal enabled="yes"
SIGUSR1="no" SIGUSR2="no" SIGINT="yes"
SIGQUIT="yes" SIGTERM="yes" SIGXCPU="yes"
SIGFPE="yes" SIGSEGV="yes" SIGABRT="yes"
/>
<flush-sampling-buffer-at-instrumentation-point enabled="yes" />
</others>
<bursts enabled="no">
<threshold enabled="yes">500u</threshold>
<mpi-statistics enabled="yes" />
</bursts>
<sampling enabled="no" type="default" period="50m" variability="10m" />
<dynamic-memory enabled="no">
<alloc enabled="yes" threshold="32768" />
<free enabled="yes" />
</dynamic-memory>
<pebs-sampling enabled="no">
<loads enabled="no" frequency="100" minimum-latency="10" />
<stores enabled="no" frequency="50">
<offcore-l3-misses enabled="no" /> <!-- Read together with stores samples. -->
</stores>
<load-l3-misses enabled="no" frequency="25" />
</pebs-sampling>
<input-output enabled="no" internals="no"/>
<syscall enabled="no" />
<merge enabled="no"
synchronization="default"
tree-fan-out="16"
max-memory="512"
joint-states="yes"
keep-mpits="yes"
translate-addresses="yes"
sort-addresses="yes"
translate-data-addresses="yes"
overwrite="yes"
/>
</trace>
#!/bin/bash
# Wrapper that runs the command given as arguments with the Extrae MPI
# tracing library preloaded.
# Parameters - Command to run followed by its own arguments

# Configure Extrae
export EXTRAE_CONFIG_FILE=./extrae.xml
# Load the tracing library (choose C/Fortran)
export LD_PRELOAD=${EXTRAE_HOME}/lib/libmpitrace.so
# Run the program. "$@" keeps every argument as one word, while $* would
# split again any argument that contains spaces or glob characters
"$@"
#!/bin/bash
# Runs the binary named by PROTEO_BIN with the Extrae MPI tracing library
# preloaded. PROTEO_BIN and EXTRAE_HOME are expected to be available after
# reading the build configuration.

scriptDir=$(dirname "$0")
. $scriptDir/../../Codes/build/config.txt

EXTRAE_CONFIG_FILE=extrae.xml
LD_PRELOAD=${EXTRAE_HOME}/lib/libmpitrace.so
export EXTRAE_CONFIG_FILE LD_PRELOAD

$PROTEO_BIN
import sys
import glob
import os
from datetime import date
from enum import Enum
GENERAL_SECTION = "[general]"
RESIZE_SECTION = "[resize"
STAGE_SECTION = "[stage"
END_SECTION_DELIMITER = ";end"
DIFFERENT_VALUE_DELIMITER=':'
LIST_VALUE_DELIMITER=','
class Config_section(Enum):
    """Kinds of section of a configuration file and the parameter names
    that process_line accepts as keys."""

    # Kind of section being read
    INVALID = 0
    GENERAL = 1
    RESIZE = 2
    STAGE = 3

    # Parameter names, as written in the configuration files
    P_TOTAL_RESIZES = "Total_Resizes"
    P_TOTAL_STAGES = "Total_Stages"
    P_GRANULARITY = "Granularity"
    P_SDR = "SDR"
    P_ADR = "ADR"
    P_RIGID = "Rigid"
    P_CAPTURE_METHOD = "Capture_Method"

    P_STAGE_TYPE = "Stage_Type"
    P_STAGE_BYTES = "Stage_Bytes"
    P_STAGE_TIME_CAPPED = "Stage_Time_Capped"
    P_STAGE_TIME = "Stage_Time"

    P_RESIZE_ITERS = "Iters"
    P_RESIZE_PROCS = "Procs"
    P_RESIZE_FACTORS = "FactorS"
    P_RESIZE_DIST = "Dist"
    P_RESIZE_REDISTRIBUTION_METHOD = "Redistribution_Method"
    P_RESIZE_REDISTRIBUTION_STRATEGY = "Redistribution_Strategy"
    P_RESIZE_SPAWN_METHOD = "Spawn_Method"
    P_RESIZE_SPAWN_STRATEGY = "Spawn_Strategy"

    @classmethod
    def has_key(cls, name):
        """Return True when `name` equals the value of any member."""
        for member in cls:
            if member.value == name:
                return True
        return False
def is_ending_of_section(line):
    """Return True when `line` contains the end of section delimiter."""
    return END_SECTION_DELIMITER in line
def is_a_general_section(line):
    """Return True when `line` is exactly the header of the general section."""
    return line == GENERAL_SECTION
def is_a_resize_section(line):
    """Return True when `line` opens a resize section.

    The closing line of a section also contains the section name, so lines
    with the end of section delimiter are excluded.
    """
    if is_ending_of_section(line):
        return False
    return RESIZE_SECTION in line
def is_a_stage_section(line):
    """Return True when `line` opens a stage section.

    The closing line of a section also contains the section name, so lines
    with the end of section delimiter are excluded.
    """
    if is_ending_of_section(line):
        return False
    return STAGE_SECTION in line
def convert_to_number(number):
    """Convert `number` to int or float when possible.

    - Integer text ("5") becomes an int.
    - Other numeric text ("0.5", "1e3") becomes a float.
    - Non numeric text is returned unchanged.
    - Numeric objects keep their value: they become an int only when they
      have no fractional part, so 2.7 is no longer truncated to 2.
    - Anything else is reported and None is returned.
    """
    try:
        as_float = float(number)
    except (ValueError, TypeError):
        if isinstance(number, str):
            return number
        print("Unable to convert to number - Not a fatal error")
        return None

    if not isinstance(number, str):
        # int() of a float drops the fractional part, so only use it when
        # there is nothing to drop
        return int(number) if as_float.is_integer() else as_float

    try:
        return int(number)
    except ValueError:
        # Decimal or exponent notation is an expected input, keep the float
        return as_float
def process_line(line, data):
    """Parse one "key=value" line of a section and store it in `data`.

    The value is first split by DIFFERENT_VALUE_DELIMITER and each piece is
    then split by LIST_VALUE_DELIMITER. What is stored under the key is:
    - a list when there are several pieces, one entry per piece;
    - for each piece, a tuple when it holds several items or the single
      converted item otherwise.

    Returns True when the key was stored. Returns False, leaving `data`
    untouched, for blank lines, lines without '=' and unknown keys.
    """
    line = line.strip()
    if not line:
        return False

    # partition never raises, unlike unpacking the result of split('=')
    key, separator, raw_value = line.partition('=')
    if not separator:
        print("Ignoring malformed line " + line)
        return False

    key = key.strip()
    if(not Config_section.has_key(key)):
        print("Unknown parameter " + key)
        return False

    pieces = []
    for piece in raw_value.split(DIFFERENT_VALUE_DELIMITER): # Some keys have values that will be swapped between files
        items = [convert_to_number(item) for item in piece.split(LIST_VALUE_DELIMITER)] # Final config files could have multiple values for the same key
        pieces.append(tuple(items) if len(items) > 1 else items[0])

    data[key] = pieces if len(pieces) > 1 else pieces[0]
    return True
def process_file(file_name):
    """Read a configuration file and return its sections.

    Returns a tuple (general_data, stages_data, resizes_data), in that
    order: a dictionary for the general section, a list with one dictionary
    per stage section and a list with one dictionary per resize section.
    Lines outside of any section are ignored.
    """
    # The context manager closes the file even if reading fails
    with open(file_name, "r") as f:
        lines = f.read().splitlines()

    section_type = Config_section.INVALID
    general_data = {}
    stages_data = []
    resizes_data = []
    processing = None # Dictionary of the section being read
    for line in lines:
        if(section_type != Config_section.INVALID):
            if(is_ending_of_section(line)):
                section_type = Config_section.INVALID
            else:
                process_line(line, processing)
        elif(is_a_general_section(line)):
            section_type = Config_section.GENERAL
            processing = general_data
        elif(is_a_resize_section(line)):
            section_type = Config_section.RESIZE
            resizes_data.append({})
            processing = resizes_data[-1]
        elif(is_a_stage_section(line)):
            section_type = Config_section.STAGE
            stages_data.append({})
            processing = stages_data[-1]

    return general_data,stages_data,resizes_data
def key_line_write(f, keys, values):
    """Write one "key=value" line to `f` per entry of `keys`.

    keys[i] is paired with values[i]. A tuple value is written as its items
    separated by commas; any other value is written with str().
    """
    for position, key in enumerate(keys):
        value = values[position]
        f.write(key + "=")
        if type(value) == tuple:
            f.write(str(value[0]))
            for item in value[1:]:
                f.write("," + str(item))
        else:
            f.write(str(value))
        f.write("\n")
def general_section_write(f, general_data):
    """Write the general section to `f`: header, one line per key and the
    closing line."""
    opening = GENERAL_SECTION + "\n"
    closing = END_SECTION_DELIMITER + " " + GENERAL_SECTION + "\n"

    f.write(opening)
    key_line_write(f, list(general_data), list(general_data.values()))
    f.write(closing)
def stage_section_write(f, stage_data, section_index):
    """Write the stage section number `section_index` to `f`: header, one
    line per key and the closing line."""
    section_name = STAGE_SECTION + str(section_index) + "]\n"

    f.write(section_name)
    key_line_write(f, list(stage_data), list(stage_data.values()))
    f.write(END_SECTION_DELIMITER + " " + section_name)
def resize_section_write(f, resize_data, section_index):
    """Write the resize section number `section_index` to `f`: header, one
    line per key and the closing line."""
    section_name = RESIZE_SECTION + str(section_index) + "]\n"

    f.write(section_name)
    key_line_write(f, list(resize_data), list(resize_data.values()))
    f.write(END_SECTION_DELIMITER + " " + section_name)
def write_output_file(datasets, common_output_name, output_index):
    """Write one configuration file named
    common_output_name + output_index + ".ini".

    `datasets` must hold the general dictionary first, then one dictionary
    per stage and finally one dictionary per group of processes. The amount
    of stages and groups is read from the general dictionary.
    """
    file_name = common_output_name + str(output_index) + ".ini"
    total_stages=int(datasets[0][Config_section.P_TOTAL_STAGES.value])
    total_groups=int(datasets[0][Config_section.P_TOTAL_RESIZES.value])+1

    # The context manager closes the file even if a section fails to be written
    with open(file_name, "w") as f:
        general_section_write(f, datasets[0])
        for i in range(total_stages):
            stage_section_write(f, datasets[i+1], i)
        for i in range(total_groups):
            resize_section_write(f, datasets[i+1+total_stages], i)
def check_sections_assumptions(datasets):
    """Tell whether the configuration held in `datasets` is valid.

    `datasets` holds the general dictionary first, then the stages and
    finally the groups of processes. The configuration is not valid when
    two consecutive groups have the same amount of processes, as that
    resize would not change anything.
    """
    total_groups=int(datasets[0][Config_section.P_TOTAL_RESIZES.value])+1
    total_stages=int(datasets[0][Config_section.P_TOTAL_STAGES.value])
    procs_key = Config_section.P_RESIZE_PROCS.value
    first_group = total_stages + 1 # Position of the first group in datasets

    # The first group has no previous group to be compared with
    for i in range(1, total_groups):
        #Not valid if resize is to the same amount of processes
        if datasets[first_group+i][procs_key] == datasets[first_group+i-1][procs_key]:
            return False
    return True
def correct_adr(sdr, adr_percentage, w_general_dataset):
    """Split `sdr` between the ADR and SDR entries of `w_general_dataset`.

    ADR receives `adr_percentage` percent of `sdr` and SDR receives the
    rest. Nothing is modified when `adr_percentage` is 0.
    """
    #TODO Take into account that both sdr and adr could have different values
    if (adr_percentage != 0):
        adr_key = Config_section.P_ADR.value
        sdr_key = Config_section.P_SDR.value
        w_general_dataset[adr_key] = sdr * (adr_percentage/100)
        w_general_dataset[sdr_key] = sdr * ((100.0-adr_percentage)/100)
# Writes one configuration file per combination of the parameters that have
# several values. The files are created inside a new directory named
# "Desglosed-<today>" under the current working directory, which also becomes
# the working directory. Combinations rejected by check_sections_assumptions
# are not written.
# NOTE(review): the datasets are stored as general + resize_data + stage_data,
# while write_output_file and check_sections_assumptions expect the stages
# before the groups. The caller in this file passes the list of stages as
# resize_data and the list of groups as stage_data, so the parameter names
# look swapped -- confirm before renaming anything.
def create_output_files(common_output_name, general_data, resize_data, stage_data):
# Stores in the write dataset the next value of the multi-valued parameter
# number level_index and advances its index. When that parameter has no
# values left, it starts again from its first value and the next parameter
# is advanced. Returns True once the last parameter has started again,
# which presumably means that every combination has been visited.
def read_parameter(level_index):
dictionary = write_datasets[ds_indexes[level_index]]
key = keys[level_index]
index = indexes[level_index]
max_index = mindexes[level_index]
values = lists[level_index]
finished=False
if(index == max_index):
index = 0
if(level_index+1 == len(lists)):
finished = True
else:
finished = read_parameter(level_index+1)
dictionary[key] = values[index]
# The cost factor is read with the same index as the amount of processes
if(key == Config_section.P_RESIZE_PROCS.value):
original_dictionary = datasets[ds_indexes[level_index]]
dictionary[Config_section.P_RESIZE_FACTORS.value] = original_dictionary[Config_section.P_RESIZE_FACTORS.value][index]
# SDR and ADR are recomputed from the unmodified values on every change
# NOTE(review): ADR is always indexed here, so an SDR with several values
# and an ADR with a single value would fail -- confirm
elif(key == Config_section.P_SDR.value or key == Config_section.P_ADR.value):
original_dictionary = datasets[ds_indexes[level_index]]
sdr = original_dictionary[Config_section.P_SDR.value]
adr_percentage = original_dictionary[Config_section.P_ADR.value][index]
correct_adr(sdr, adr_percentage, dictionary)
indexes[level_index] = index + 1
return finished
# datasets keeps the values as they were read, write_datasets holds the
# copies that are modified and written for each combination
datasets=[general_data]
write_datasets=[general_data.copy()]
for dataset in resize_data:
datasets.append(dataset)
write_datasets.append(dataset.copy())
for dataset in stage_data:
datasets.append(dataset)
write_datasets.append(dataset.copy())
lists=[] # Stores lists of those variables with multiple values
keys=[] # Stores keys of those variables with multiple values
indexes=[] # Stores actual index for each variable with multiple values. Always starts at 0.
mindexes=[] # Stores len of lists of each variable with multiple values
ds_indexes=[] # Stores the index of the dataset where the variable is stored
#For each variable with a list of elements
# The cost factors are skipped because they follow the index of Procs
for i in range(len(datasets)):
values_aux = list(datasets[i].values())
keys_aux = list(datasets[i].keys())
for j in range(len(values_aux)):
if type(values_aux[j]) == list and keys_aux[j] != Config_section.P_RESIZE_FACTORS.value:
keys.append(keys_aux[j])
lists.append(values_aux[j])
ds_indexes.append(i)
indexes.append(0)
mindexes.append(len(values_aux[j]))
# os.mkdir raises FileExistsError if the directory of today already exists
directory = "/Desglosed-" + str(date.today())
path = os.getcwd() + directory
os.mkdir(path, mode=0o775)
os.chdir(path)
#Get the first set of values
for i in range(len(lists)):
read_parameter(i)
#FIXME Should be done in another place
# NOTE(review): with "or" this branch is also taken when only one of SDR and
# ADR has several values, and correct_adr would then operate with a list.
# Possibly "and" was intended -- confirm
if (type(datasets[0][Config_section.P_SDR.value]) != list or type(datasets[0][Config_section.P_ADR.value]) != list):
sdr = datasets[0][Config_section.P_SDR.value]
adr_percentage = datasets[0][Config_section.P_ADR.value]
correct_adr(sdr, adr_percentage, write_datasets[0])
output_index=0
adr_corrected=False
finished = False
# NOTE(review): read_parameter(0) indexes empty lists when no parameter has
# several values, so that case fails after the first iteration -- confirm
while not finished:
if(check_sections_assumptions(write_datasets)):
write_output_file(write_datasets, common_output_name, output_index)
# for i in range(len(write_datasets)):
# print(write_datasets[i])
# print("\n\n\n------------------------------------------" + str(output_index) + " ADR=" + str(adr_corrected))
output_index+=1
finished = read_parameter(0)
#=====================================================
# Entry point of the script.
# Argument 1 - Configuration file whose parameters may have several values
# Argument 2 - Common name of the generated files
if(len(sys.argv) < 3):
    print("Not enough arguments given.\nExpected usage: python3 read_multiple.py file.ini output_name")
    # Stop here, reading sys.argv below would raise an IndexError
    sys.exit(1)

name = sys.argv[1]
common_output_name = sys.argv[2]
# process_file returns (general, stages, resizes) and create_output_files
# expects them in that same position, whatever its parameter names are
general_data, stages_data, resizes_data = process_file(name)
create_output_files(common_output_name, general_data, stages_data, resizes_data)
# A run that wrote its files is a success
sys.exit(0)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment