Commit 6a71bbf2 authored by iker_martin

Added modifications for MaM usage in DMR

parent c9941842
@@ -227,6 +227,21 @@ void MAM_Check_configuration() {
     MAM_Set_key_configuration(MAM_SPAWN_METHOD, MAM_SPAWN_BASELINE, NULL);
   }
+  // BEGIN ADDED FOR DMR
+  char *tmp = getenv(MAM_SPAWN_METHOD_ENV);
+  int tmp_value = mall_conf->spawn_method;
+  if(tmp != NULL) {
+    tmp_value = atoi(tmp);
+  }
+  if(mall_conf->spawn_method != (size_t) tmp_value) {
+    MAM_Set_key_configuration(MAM_SPAWN_METHOD, tmp_value, NULL);
+  }
+  if(!MAM_Contains_strat(MAM_SPAWN_STRATEGIES, MAM_STRAT_SPAWN_PARALLEL, NULL)
+      && mall_conf->spawn_method == MAM_SPAWN_MERGE) {
+    MAM_I_set_spawn_strat(MAM_STRAT_SPAWN_PARALLEL, &mall_conf->spawn_strategies);
+  }
+  // END ADDED FOR DMR
 
   MPI_Allreduce(&mall->internode_group, &global_internodes, 1, MPI_INT, MPI_MAX, mall->comm);
   if((MAM_Contains_strat(MAM_SPAWN_STRATEGIES, MAM_STRAT_SPAWN_MULTIPLE, NULL)
     || MAM_Contains_strat(MAM_SPAWN_STRATEGIES, MAM_STRAT_SPAWN_PARALLEL, NULL) )
...
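The hunk above lets DMR steer MaM's spawn method through an environment variable instead of the compiled-in default. A minimal sketch of the DMR side, assuming MAM_SPAWN_METHOD_ENV expands to the variable's name and that spawn methods are passed as their integer identifiers (the helper below is illustrative, not part of this commit):

  #include <stdio.h>
  #include <stdlib.h>
  /* MAM_SPAWN_METHOD_ENV and MAM_SPAWN_MERGE are assumed to come from MaM's headers. */

  /* Ask MaM to use the merge spawn method the next time it checks its configuration. */
  static void dmr_request_merge_spawn(void) {
    char value[16];
    snprintf(value, sizeof(value), "%d", MAM_SPAWN_MERGE);  /* method id as text */
    setenv(MAM_SPAWN_METHOD_ENV, value, 1);                 /* read later by MAM_Check_configuration() */
  }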
@@ -9,7 +9,7 @@ int state = MAM_I_UNRESERVED;
  * de MaM.
  */
 void MAM_Def_main_datatype() {
-  int i, counts = 12;
+  int i, counts = 13;
   int blocklengths[counts];
   MPI_Aint displs[counts];
   MPI_Datatype types[counts];
@@ -29,14 +29,15 @@ void MAM_Def_main_datatype() {
   MPI_Get_address(&(mall_conf->spawn_dist), &displs[2]);
   MPI_Get_address(&(mall_conf->red_method), &displs[3]);
   MPI_Get_address(&(mall_conf->red_strategies), &displs[4]);
-  MPI_Get_address(&(mall->root_parents), &displs[5]);
-  MPI_Get_address(&(mall->num_parents), &displs[6]); //TODO Add only when Single strat active?
-  MPI_Get_address(&(mall->numC), &displs[7]); //TODO Add only when MultipleSpawn strat active?
-  MPI_Get_address(&(mall->gid), &displs[8]); //TODO Add only when ParallelSpawn strat active?
-  MPI_Get_address(&(mall->num_cpus), &displs[9]);
-  MPI_Get_address(&(mall->num_nodes), &displs[10]);
-  MPI_Get_address(&(mall->nodelist_len), &displs[11]);
+  MPI_Get_address(&(mall_conf->slurm_jid), &displs[5]); //DMR ADDITION - Which is the original Slurm JobId
+  MPI_Get_address(&(mall->root_parents), &displs[6]);
+  MPI_Get_address(&(mall->num_parents), &displs[7]); //TODO Add only when Single strat active?
+  MPI_Get_address(&(mall->numC), &displs[8]); //TODO Add only when MultipleSpawn strat active?
+  MPI_Get_address(&(mall->gid), &displs[9]); //TODO Add only when ParallelSpawn strat active?
+  MPI_Get_address(&(mall->num_cpus), &displs[10]);
+  MPI_Get_address(&(mall->num_nodes), &displs[11]);
+  MPI_Get_address(&(mall->nodelist_len), &displs[12]);
 
   MPI_Type_create_struct(counts, blocklengths, displs, types, &mall->struct_type);
   MPI_Type_commit(&mall->struct_type);
...
@@ -45,6 +45,7 @@ typedef struct {
   unsigned int red_strategies;
   int external_usage; // Whether a different application should be called by Spawn and which
+  int slurm_jid; //DMR ADDITION - Which is the original Slurm JobId
 
   malleability_times_t *times;
 } malleability_config_t;
...
@@ -248,11 +248,28 @@ void MAM_Resume_redistribution(int *mam_state) {
   if(mam_state != NULL) *mam_state = MAM_PENDING;
 }
 
+//BEGIN ADDED FOR DMR
+int MAM_DMR_Is_zombie() {
+  return mall->zombie;
+}
+
+void MAM_DMR_Update_nodelist(char *nodelist, int num_nodes) {
+  if(mall->nodelist != NULL) {
+    free(mall->nodelist);
+    mall->nodelist = NULL;
+  }
+  mall->nodelist_len = strlen(nodelist)+1;
+  mall->nodelist = (char *) malloc(mall->nodelist_len * sizeof(char));
+  strcpy(mall->nodelist, nodelist);
+  mall->num_nodes = num_nodes;
+}
+//END ADDED FOR DMR
+
 /*
  * TODO
  */
 void MAM_Commit(int *mam_state) {
-  int request_abort;
+  //int request_abort; Removed for DMR
 #if MAM_DEBUG
   if(mall->myId == mall->root){ DEBUG_FUNC("Trying to commit", mall->myId, mall->numP); } fflush(stdout);
 #endif
@@ -273,10 +290,19 @@ void MAM_Commit(int *mam_state) {
 #if MAM_DEBUG >= 1
     DEBUG_FUNC("Is terminating as zombie", mall->myId, mall->numP); fflush(stdout);
 #endif
+    /* BEGIN REMOVED FOR DMR
     request_abort = MAM_Finalize();
     if(request_abort) { MPI_Abort(MPI_COMM_WORLD, -101); }
     MPI_Finalize();
     exit(0);
+    END REMOVED FOR DMR
+    */
+    //BEGIN ADDED FOR DMR
+    if(mall->intercomm != MPI_COMM_NULL && mall->intercomm != MPI_COMM_WORLD) { MPI_Comm_disconnect(&(mall->intercomm)); }
+    state = MAM_I_NOT_STARTED;
+    if(mam_state != NULL) *mam_state = MAM_COMPLETED;
+    return;
+    //END ADDED FOR DMR
   }
 
   // Reset/Free communicators
...
@@ -21,6 +21,8 @@ void MAM_Resume_redistribution(int *mam_state);
 int MAM_Get_Reconf_Info(mam_user_reconf_t *reconf_info);
+int MAM_DMR_Is_zombie(); //Added for DMR
+void MAM_DMR_Update_nodelist(char *nodelist, int num_nodes); //Added for DMR
 
 void MAM_Data_add(void *data, size_t *index, size_t total_qty, MPI_Datatype type, int is_replicated, int is_constant);
 void MAM_Data_modify(void *data, size_t index, size_t total_qty, MPI_Datatype type, int is_replicated, int is_constant);
...
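Together with the MAM_Commit change above, these declarations form the DMR-facing API: zombie ranks no longer finalize MPI and exit, they simply report completion, and DMR can refresh the stored node list before the next resize. A minimal usage sketch, assuming the surrounding control flow and the example hostlist/node count (they are not fixed by this commit):

  /* Sketch of a DMR-driven step around a reconfiguration. The MaM symbols
   * (MAM_Commit, MAM_DMR_Is_zombie, MAM_DMR_Update_nodelist, MAM_COMPLETED)
   * are the ones touched by this commit. */
  static void dmr_after_reconf(void) {
    int mam_state;
    MAM_Commit(&mam_state);            /* zombies now return with MAM_COMPLETED instead of exiting */
    if(MAM_DMR_Is_zombie()) {
      /* This rank is not part of the resized group: hand it back to DMR
       * (or keep it idle) instead of calling MPI_Finalize()/exit(). */
      return;
    }
    /* Before the next resize, DMR can push the job's new allocation into MaM.
     * The hostlist string and node count below are example values. */
    char new_nodes[] = "node[01-04]";
    MAM_DMR_Update_nodelist(new_nodes, 4);
  }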
@@ -165,6 +165,7 @@ int MAM_I_get_hosts_info() {
     free(unique_hosts);
   }
+  mall_conf->slurm_jid = 0; //Added for DMR
   free(my_host);
   return 0;
 }
...
@@ -208,6 +209,10 @@ int MAM_I_slurm_getenv_hosts_info() {
   int cpus, count;
   //int i, *cpus_counts, *nodes_counts, *aux;
 
+  tmp = getenv("SLURM_JOB_ID");
+  if(tmp == NULL) return 1;
+  mall_conf->slurm_jid = atoi(tmp); //Modified for DMR
+
   tmp = getenv("SLURM_JOB_NUM_NODES");
   if(tmp == NULL) return 1;
   mall->num_nodes = atoi(tmp);
@@ -270,16 +275,16 @@ int MAM_I_slurm_getenv_hosts_info() {
  * FIXME Does not consider heterogenous machines
  */
 int MAM_I_slurm_getjob_hosts_info() {
-  int jobId, err;
+  int err;
   char *tmp = NULL;
   job_info_msg_t *j_info;
   slurm_job_info_t last_record;
 
   tmp = getenv("SLURM_JOB_ID");
   if(tmp == NULL) return 1;
-  jobId = atoi(tmp);
+  mall_conf->slurm_jid = atoi(tmp); //Modified for DMR
 
-  err = slurm_load_job(&j_info, jobId, 1); // FIXME Valgrind Not freed
+  err = slurm_load_job(&j_info, mall_conf->slurm_jid, 1); // FIXME Valgrind Not freed //Modified for DMR
   if(err) return err;
   last_record = j_info->job_array[j_info->record_count - 1];
...
@@ -4,11 +4,16 @@ MCC = mpicc
 C_FLAGS = -Wall -Wextra -Wshadow -Wfatal-errors
 LD_FLAGS = -lm -pthread
 
-MAM_USE_SLURM ?= 0
+MAM_USE_SLURM ?= 1
 MAM_USE_BARRIERS ?= 0
 MAM_DEBUG ?= 0
 DEF = -DMAM_USE_SLURM=$(MAM_USE_SLURM) -DMAM_USE_BARRIERS=$(MAM_USE_BARRIERS) -DMAM_DEBUG=$(MAM_DEBUG)
 
+ifdef DMR_PATH
+MPIFLAGS = -I$(MPI_PATH)/include -L$(MPI_PATH)/lib
+SLURMFLAGS = -I$(SLURM_ROOT)/include -L$(SLURM_ROOT)/lib
+endif
+
 ifeq ($(MAM_USE_SLURM),1)
 LD_FLAGS += -lslurm
 endif
@@ -52,7 +57,7 @@ $(LIB) : $(BUILD_DIR)/$(LIB)
 # Actual target of the binary - depends on all .o files.
 $(BUILD_DIR)/$(LIB) : $(OBJ)
-	$(MCC) $(C_FLAGS) $^ -shared -o $@ $(LD_FLAGS)
+	$(MCC) $(C_FLAGS) $(MPIFLAGS) $(SLURMFLAGS) $^ -shared -o $@ $(LD_FLAGS)
 
 # Include all .d files
 # .d files are used for knowing the dependencies of each source file
@@ -65,4 +70,4 @@ $(BUILD_DIR)/$(LIB) : $(OBJ)
 # the same name as the .o file.
 $(BUILD_DIR)/%.o : %.c
 	@mkdir -p $(@D)
-	$(MCC) $(C_FLAGS) $(DEF) -fpic -MMD -c $< -o $@
+	$(MCC) $(C_FLAGS) $(MPIFLAGS) $(SLURMFLAGS) $(DEF) -fpic -MMD -c $< -o $@
@@ -53,8 +53,7 @@ void open_port(Spawn_ports *spawn_port, int open_port, int open_service)
   if (open_service != MAM_SERVICE_UNNEEDED) {
     spawn_port->service_name = (char *)malloc((MAM_SERVICE_NAME_SIZE) * sizeof(char));
 #if MAM_USE_SLURM
-    char *tmp = getenv("SLURM_JOB_ID");
-    if(tmp != NULL) { job_id = atoi(tmp)%1000; }
+    job_id = mall_conf->slurm_jid%1000; //Modified for DMR
 #endif
     snprintf(spawn_port->service_name, MAM_SERVICE_NAME_SIZE, "mam_service_jid%04d_gr%03d", job_id, open_service);
     MPI_Publish_name(spawn_port->service_name, MPI_INFO_NULL, spawn_port->port_name);
@@ -128,8 +127,7 @@ void discover_remote_port(int id_group, Spawn_ports *spawn_port) {
   if(spawn_port->remote_service == NULL) { //First discover
     spawn_port->remote_service = (char*) malloc(MAM_SERVICE_NAME_SIZE * sizeof(char));
 #if MAM_USE_SLURM
-    char *tmp = getenv("SLURM_JOB_ID");
-    if(tmp != NULL) { job_id = atoi(tmp)%1000; }
+    job_id = mall_conf->slurm_jid%1000; //Modified for DMR
 #endif
     snprintf(spawn_port->remote_service, MAM_SERVICE_NAME_SIZE, "mam_service_jid%04d_gr%03d", job_id, id_group);
   } else { // For subsequent lookups, only update the variable part (group ID) of the service name.
...
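The two hunks above derive the published service name from the Slurm job id stored in mall_conf->slurm_jid rather than from a getenv() call, so processes spawned outside the original job environment still compute the same name. A small self-contained sketch of the naming scheme; the job id and group number are example values, and MAM_SERVICE_NAME_SIZE is replaced by a plain buffer size:

  #include <stdio.h>

  int main(void) {
    int slurm_jid = 1234567;        /* example value of mall_conf->slurm_jid */
    int job_id = slurm_jid % 1000;  /* 567: folded into the %04d field of the name */
    int group = 2;                  /* open_service on the publisher, id_group on the discoverer */
    char name[64];
    snprintf(name, sizeof(name), "mam_service_jid%04d_gr%03d", job_id, group);
    printf("%s\n", name);           /* prints mam_service_jid0567_gr002 */
    return 0;
  }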
@@ -474,8 +474,7 @@ void generate_info_hostfile_slurm(char *nodelist, int *qty, size_t used_nodes, S
   char *hostfile_name, *line;
   hostlist_t hostlist;
 
-  char *tmp = getenv("SLURM_JOB_ID");
-  jid = tmp != NULL ? (atoi(tmp)%1000) : 0;
+  jid = mall_conf->slurm_jid%1000; //Modified for DMR
 
   line = NULL;
   hostlist = slurm_hostlist_create(nodelist);
...
@@ -129,7 +129,7 @@ void hypercube_spawn(int group_id, int groups, int init_nodes, int init_step,
                      MPI_Comm **spawn_comm, int *qty_comms) {
   int i, aux_sum, actual_step;
   int next_group_id, actual_nodes;
-  int jid=0, n=0;
+  int n=0;
   char *file_name = NULL;
   Spawn_set set;
@@ -144,14 +144,10 @@ void hypercube_spawn(int group_id, int groups, int init_nodes, int init_step,
   }
   //if(mall->myId == 0)printf("T1 P%d+%d step=%d next_id=%d aux_sum=%d actual_nodes=%d comms=%d\n", mall->myId, group_id, actual_step, next_group_id, aux_sum, actual_nodes, *qty_comms);
 
-#if MAM_USE_SLURM
-  char *tmp = getenv("SLURM_JOB_ID");
-  if(tmp != NULL) { jid = atoi(tmp); }
-#endif
   set.cmd = get_spawn_cmd();
   i = 0;
   while(next_group_id < groups - init_nodes) {
-    set_hostfile_name(&file_name, &n, jid, next_group_id);
+    set_hostfile_name(&file_name, &n, mall_conf->slurm_jid, next_group_id); //Modified for DMR
     //read_hostfile_procs(file_name, &set.spawn_qty);
     set.spawn_qty = mall->num_cpus;
     MPI_Info_create(&set.mapping);
...