Commit 62af4ef7 authored by iker_martin
Browse files

WIP 1/3 - Bugfix for jobids higher than 1000.

parent 18499f94
...@@ -46,17 +46,15 @@ void open_port(Spawn_ports *spawn_port, int open_port, int open_service) ...@@ -46,17 +46,15 @@ void open_port(Spawn_ports *spawn_port, int open_port, int open_service)
if (spawn_port->port_name != NULL) if (spawn_port->port_name != NULL)
return; return;
if (open_port) if (open_port) {
{
spawn_port->opened_port = 1; spawn_port->opened_port = 1;
spawn_port->port_name = (char *)malloc(MPI_MAX_PORT_NAME * sizeof(char)); spawn_port->port_name = (char *)malloc(MPI_MAX_PORT_NAME * sizeof(char));
MPI_Open_port(MPI_INFO_NULL, spawn_port->port_name); MPI_Open_port(MPI_INFO_NULL, spawn_port->port_name);
if (open_service != MAM_SERVICE_UNNEEDED) if (open_service != MAM_SERVICE_UNNEEDED) {
{
spawn_port->service_name = (char *)malloc((MAM_SERVICE_NAME_SIZE) * sizeof(char)); spawn_port->service_name = (char *)malloc((MAM_SERVICE_NAME_SIZE) * sizeof(char));
#if MAM_USE_SLURM #if MAM_USE_SLURM
char *tmp = getenv("SLURM_JOB_ID"); char *tmp = getenv("SLURM_JOB_ID");
if(tmp != NULL) { job_id = atoi(tmp); } if(tmp != NULL) { job_id = atoi(tmp)%1000; }
#endif #endif
snprintf(spawn_port->service_name, MAM_SERVICE_NAME_SIZE, "mam_service_jid%04d_gr%03d", job_id, open_service); snprintf(spawn_port->service_name, MAM_SERVICE_NAME_SIZE, "mam_service_jid%04d_gr%03d", job_id, open_service);
MPI_Publish_name(spawn_port->service_name, MPI_INFO_NULL, spawn_port->port_name); MPI_Publish_name(spawn_port->service_name, MPI_INFO_NULL, spawn_port->port_name);
...@@ -131,7 +129,7 @@ void discover_remote_port(int id_group, Spawn_ports *spawn_port) { ...@@ -131,7 +129,7 @@ void discover_remote_port(int id_group, Spawn_ports *spawn_port) {
spawn_port->remote_service = (char*) malloc(MAM_SERVICE_NAME_SIZE * sizeof(char)); spawn_port->remote_service = (char*) malloc(MAM_SERVICE_NAME_SIZE * sizeof(char));
#if MAM_USE_SLURM #if MAM_USE_SLURM
char *tmp = getenv("SLURM_JOB_ID"); char *tmp = getenv("SLURM_JOB_ID");
if(tmp != NULL) { job_id = atoi(tmp); } if(tmp != NULL) { job_id = atoi(tmp)%1000; }
#endif #endif
snprintf(spawn_port->remote_service, MAM_SERVICE_NAME_SIZE, "mam_service_jid%04d_gr%03d", job_id, id_group); snprintf(spawn_port->remote_service, MAM_SERVICE_NAME_SIZE, "mam_service_jid%04d_gr%03d", job_id, id_group);
} else { // For subsequent lookups, only update the variable part (group ID) of the service name. } else { // For subsequent lookups, only update the variable part (group ID) of the service name.
......
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
#include <string.h> #include <string.h>
#include <mpi.h> #include <mpi.h>
#include "ProcessDist.h" #include "ProcessDist.h"
#include "SpawnUtils.h"
#include "../MAM_Constants.h" #include "../MAM_Constants.h"
#include "../MAM_DataStructures.h" #include "../MAM_DataStructures.h"
...@@ -23,7 +24,6 @@ ...@@ -23,7 +24,6 @@
void node_dist(Spawn_data spawn_data, int **qty, int *used_nodes, int *total_spawns); void node_dist(Spawn_data spawn_data, int **qty, int *used_nodes, int *total_spawns);
void spread_dist(Spawn_data spawn_data, int *used_nodes, int *procs); void spread_dist(Spawn_data spawn_data, int *used_nodes, int *procs);
void compact_dist(Spawn_data spawn_data, int *used_nodes, int *procs); void compact_dist(Spawn_data spawn_data, int *used_nodes, int *procs);
void set_spawn_cmd(Spawn_data *spawn_data);
void generate_info_string(char *nodelist, int *procs_array, size_t nodes, Spawn_data *spawn_data); void generate_info_string(char *nodelist, int *procs_array, size_t nodes, Spawn_data *spawn_data);
void generate_multiple_info_string(char *nodelist, int *procs_array, size_t nodes, Spawn_data *spawn_data); void generate_multiple_info_string(char *nodelist, int *procs_array, size_t nodes, Spawn_data *spawn_data);
...@@ -63,7 +63,7 @@ void processes_dist(Spawn_data *spawn_data) { ...@@ -63,7 +63,7 @@ void processes_dist(Spawn_data *spawn_data) {
#if MAM_USE_SLURM #if MAM_USE_SLURM
switch(spawn_data->mapping_fill_method) { switch(spawn_data->mapping_fill_method) {
case MAM_PHY_TYPE_STRING: case MAM_PHY_TYPE_STRING:
if(spawn_data->spawn_is_multiple) { if(spawn_data->spawn_is_multiple || spawn_data->spawn_is_parallel) {
generate_multiple_info_string_slurm(mall->nodelist, procs_array, used_nodes, spawn_data); generate_multiple_info_string_slurm(mall->nodelist, procs_array, used_nodes, spawn_data);
} else { } else {
generate_info_string_slurm(mall->nodelist, procs_array, used_nodes, spawn_data); generate_info_string_slurm(mall->nodelist, procs_array, used_nodes, spawn_data);
...@@ -74,16 +74,55 @@ void processes_dist(Spawn_data *spawn_data) { ...@@ -74,16 +74,55 @@ void processes_dist(Spawn_data *spawn_data) {
break; break;
} }
#else #else
if(spawn_data->spawn_is_multiple) { if(spawn_data->spawn_is_multiple || spawn_data->spawn_is_parallel) {
generate_multiple_info_string(mall->nodelist, procs_array, used_nodes, spawn_data); generate_multiple_info_string(mall->nodelist, procs_array, used_nodes, spawn_data);
} else { } else {
generate_info_string(mall->nodelist, procs_array, used_nodes, spawn_data); generate_info_string(mall->nodelist, procs_array, used_nodes, spawn_data);
} }
#endif #endif
set_spawn_cmd(spawn_data); char *aux_cmd = get_spawn_cmd();
for(int index = 0; index<spawn_data->total_spawns; index++) {
spawn_data->sets[index].cmd = aux_cmd;
}
free(procs_array); free(procs_array);
} }
/*
 * Writes the hostfile name into the buffer referenced by *file_name,
 * allocating MAM_HOSTFILE_SIZE chars first when *file_name is NULL.
 *
 * - When *n is 0, the whole name is formatted from the MAM_HOSTFILE_NAME*
 *   pieces, the job id reduced modulo 1000 and the given index.
 * - Otherwise only the tail starting at offset MAM_HOSTFILE_SIZE1 is
 *   rewritten with the new index followed by MAM_HOSTFILE_NAME3.
 *
 * On return *n is always 1.
 */
void set_hostfile_name(char **file_name, int *n, int jid, int index) {
    char *name = *file_name;

    if (name == NULL) {
        name = (char *) malloc(MAM_HOSTFILE_SIZE * sizeof(char));
        *file_name = name;
    }

    if (*n != 0) {
        // The prefix is already in place: refresh only the index and the suffix.
        snprintf(name + MAM_HOSTFILE_SIZE1, MAM_HOSTFILE_SIZE2 , "%03d%s", index, MAM_HOSTFILE_NAME3);
    } else {
        // First use: build the complete name, keeping the job id within the %04d field.
        snprintf(name, MAM_HOSTFILE_SIZE , "%s%04d%s%03d%s", MAM_HOSTFILE_NAME1, jid % 1000, MAM_HOSTFILE_NAME2, index, MAM_HOSTFILE_NAME3);
    }

    *n = 1;
}
/*
 * Reads the hostfile "file_name" and stores in *qty the sum of the
 * integers that follow the last ':' of every line. Lines without a ':'
 * contribute nothing.
 *
 * If the file cannot be opened, or the line buffer cannot be allocated,
 * an error is reported with perror() and MPI_Abort is called on
 * MPI_COMM_WORLD.
 *
 * Returns 0 once the whole file has been processed.
 */
int read_hostfile_procs(char *file_name, int *qty) {
    FILE *file = fopen(file_name, "r");
    if(file == NULL) {
        perror("Could not open hostfile to read");
        MPI_Abort(MPI_COMM_WORLD, -1);
        return -1; // Defensive: only reached if MPI_Abort were to return
    }

    *qty = 0;
    char *line = (char *) malloc(MAM_HOSTFILE_LINE_SIZE * sizeof(char));
    if(line == NULL) {
        perror("Could not allocate hostfile line buffer");
        fclose(file);
        MPI_Abort(MPI_COMM_WORLD, -1);
        return -1; // Defensive: only reached if MPI_Abort were to return
    }

    while (fgets(line, MAM_HOSTFILE_LINE_SIZE, file) != NULL) {
        // Search the last delimiter; strrchr is also safe on an empty line
        char *ptr = strrchr(line, ':');
        if (ptr != NULL) { *qty += atoi(ptr + 1); }
    }

    // Release the resources acquired by this function
    free(line);
    fclose(file);
    return 0;
}
//--------------PRIVATE FUNCTIONS---------------// //--------------PRIVATE FUNCTIONS---------------//
//-----------------DISTRIBUTION-----------------// //-----------------DISTRIBUTION-----------------//
...@@ -117,7 +156,7 @@ void node_dist(Spawn_data spawn_data, int **qty, int *used_nodes, int *total_spa ...@@ -117,7 +156,7 @@ void node_dist(Spawn_data spawn_data, int **qty, int *used_nodes, int *total_spa
*qty = calloc(*used_nodes, sizeof(int)); // Numero de procesos por nodo *qty = calloc(*used_nodes, sizeof(int)); // Numero de procesos por nodo
// if(MAM_Contains_strat(MAM_SPAWN_STRATEGIES, MAM_STRAT_SPAWN_MULTIPLE, NULL) ) { // if(MAM_Contains_strat(MAM_SPAWN_STRATEGIES, MAM_STRAT_SPAWN_MULTIPLE, NULL) ) {
if(spawn_data.spawn_is_multiple) { if(spawn_data.spawn_is_multiple || spawn_data.spawn_is_parallel) {
for(i=0; i< *used_nodes; i++) { for(i=0; i< *used_nodes; i++) {
(*qty)[i] = procs[i]; (*qty)[i] = procs[i];
if(procs[i]) (*total_spawns)++; if(procs[i]) (*total_spawns)++;
...@@ -199,35 +238,6 @@ void compact_dist(Spawn_data spawn_data, int *used_nodes, int *procs) { ...@@ -199,35 +238,6 @@ void compact_dist(Spawn_data spawn_data, int *used_nodes, int *procs) {
if(*used_nodes > mall->num_nodes) *used_nodes = mall->num_nodes; //FIXME Si ocurre esto no es un error? if(*used_nodes > mall->num_nodes) *used_nodes = mall->num_nodes; //FIXME Si ocurre esto no es un error?
} }
//--------------PRIVATE FUNCTIONS---------------//
//-------------------CMD SET--------------------//
/*
 * Determines which command has to be invoked when performing
 * the spawn. Every set must run the same command.
 *
 * The command is chosen from mall_conf->external_usage: the Valgrind
 * script, the Extrae script or, otherwise, mall->name_exec. The chosen
 * pointer is stored in the "cmd" field of the first
 * spawn_data->total_spawns sets.
 */
void set_spawn_cmd(Spawn_data *spawn_data) {
    char *command;

    if (mall_conf->external_usage == MAM_USE_VALGRIND) {
        command = MAM_VALGRIND_SCRIPT;
    } else if (mall_conf->external_usage == MAM_USE_EXTRAE) {
        command = MAM_EXTRAE_SCRIPT;
    } else {
        command = mall->name_exec;
    }

    // All sets share the very same command pointer
    for (int set = 0; set < spawn_data->total_spawns; set++) {
        spawn_data->sets[set].cmd = command;
    }
}
//--------------PRIVATE FUNCTIONS---------------// //--------------PRIVATE FUNCTIONS---------------//
//-------------------INFO SET-------------------// //-------------------INFO SET-------------------//
...@@ -465,14 +475,14 @@ void generate_info_hostfile_slurm(char *nodelist, int *qty, size_t used_nodes, S ...@@ -465,14 +475,14 @@ void generate_info_hostfile_slurm(char *nodelist, int *qty, size_t used_nodes, S
hostlist_t hostlist; hostlist_t hostlist;
char *tmp = getenv("SLURM_JOB_ID"); char *tmp = getenv("SLURM_JOB_ID");
jid = tmp != NULL ? atoi(tmp) : 0; jid = tmp != NULL ? (atoi(tmp)%1000) : 0;
line = NULL; line = NULL;
hostlist = slurm_hostlist_create(nodelist); hostlist = slurm_hostlist_create(nodelist);
hostfile_name = (char *) malloc(MAM_HOSTFILE_SIZE * sizeof(char)); hostfile_name = (char *) malloc(MAM_HOSTFILE_SIZE * sizeof(char));
snprintf(hostfile_name, MAM_HOSTFILE_SIZE , "%s%04d%s%03d%s", MAM_HOSTFILE_NAME1, jid, MAM_HOSTFILE_NAME2, index, MAM_HOSTFILE_NAME3); snprintf(hostfile_name, MAM_HOSTFILE_SIZE , "%s%04d%s%03d%s", MAM_HOSTFILE_NAME1, jid, MAM_HOSTFILE_NAME2, index, MAM_HOSTFILE_NAME3);
if(spawn_data->spawn_is_multiple) { // MULTIPLE if(spawn_data->spawn_is_multiple || spawn_data->spawn_is_parallel) { // MULTIPLE
for(; index<spawn_data->total_spawns; index++) { for(; index<spawn_data->total_spawns; index++) {
// This strat creates 1 hostfile per spawn // This strat creates 1 hostfile per spawn
qty_index = fill_multiple_hostfile_slurm(hostfile_name, qty+qty_index, &hostlist, &line, &len_line); qty_index = fill_multiple_hostfile_slurm(hostfile_name, qty+qty_index, &hostlist, &line, &len_line);
......
...@@ -4,5 +4,7 @@ ...@@ -4,5 +4,7 @@
#include "Spawn_DataStructure.h" #include "Spawn_DataStructure.h"
void processes_dist(Spawn_data *spawn_data); void processes_dist(Spawn_data *spawn_data);
void set_hostfile_name(char **file_name, int *n, int jid, int index);
int read_hostfile_procs(char *file_name, int *qty);
#endif #endif
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment