Commit 6a71bbf2 authored by iker_martin

Added modifications for MaM usage in DMR

parent c9941842
......@@ -227,6 +227,21 @@ void MAM_Check_configuration() {
MAM_Set_key_configuration(MAM_SPAWN_METHOD, MAM_SPAWN_BASELINE, NULL);
}
+  // BEGIN ADDED FOR DMR
+  // Let an external DMR runtime override the spawn method through the
+  // environment; when the merge method is selected, ensure the
+  // parallel-spawn strategy is active as well.
+  char *tmp = getenv(MAM_SPAWN_METHOD_ENV);
+  int tmp_value = mall_conf->spawn_method;
+  if(tmp != NULL) {
+    tmp_value = atoi(tmp);
+  }
+  if(mall_conf->spawn_method != (size_t) tmp_value) {
+    MAM_Set_key_configuration(MAM_SPAWN_METHOD, tmp_value, NULL);
+  }
+  if(!MAM_Contains_strat(MAM_SPAWN_STRATEGIES, MAM_STRAT_SPAWN_PARALLEL, NULL)
+      && mall_conf->spawn_method == MAM_SPAWN_MERGE) {
+    MAM_I_set_spawn_strat(MAM_STRAT_SPAWN_PARALLEL, &mall_conf->spawn_strategies);
+  }
+  // END ADDED FOR DMR
MPI_Allreduce(&mall->internode_group, &global_internodes, 1, MPI_INT, MPI_MAX, mall->comm);
if((MAM_Contains_strat(MAM_SPAWN_STRATEGIES, MAM_STRAT_SPAWN_MULTIPLE, NULL)
|| MAM_Contains_strat(MAM_SPAWN_STRATEGIES, MAM_STRAT_SPAWN_PARALLEL, NULL) )
......
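The block above lets an external DMR runtime override MaM's spawn method through the environment and forces the parallel-spawn strategy whenever the merge method ends up selected. A minimal caller-side sketch of how a DMR launcher could use this hook, assuming the MaM headers are included (MAM_SPAWN_METHOD_ENV and MAM_SPAWN_MERGE come from those headers; the helper name is hypothetical):

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical DMR-side helper: request a spawn method for the next
 * reconfiguration. MAM_Check_configuration() parses the variable with
 * atoi(), so the value is written as a plain decimal integer. */
static void dmr_request_spawn_method(int method) {
  char value[12];
  snprintf(value, sizeof(value), "%d", method); /* e.g. MAM_SPAWN_MERGE */
  setenv(MAM_SPAWN_METHOD_ENV, value, 1);       /* 1 = overwrite if set */
}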
......@@ -9,7 +9,7 @@ int state = MAM_I_UNRESERVED;
* of MaM.
*/
void MAM_Def_main_datatype() {
-int i, counts = 12;
+int i, counts = 13;
int blocklengths[counts];
MPI_Aint displs[counts];
MPI_Datatype types[counts];
......@@ -29,14 +29,15 @@ void MAM_Def_main_datatype() {
MPI_Get_address(&(mall_conf->spawn_dist), &displs[2]);
MPI_Get_address(&(mall_conf->red_method), &displs[3]);
MPI_Get_address(&(mall_conf->red_strategies), &displs[4]);
+MPI_Get_address(&(mall_conf->slurm_jid), &displs[5]); //DMR ADDITION - Which is the original Slurm JobId
-MPI_Get_address(&(mall->root_parents), &displs[5]);
-MPI_Get_address(&(mall->num_parents), &displs[6]); //TODO Add only when Single strat active?
-MPI_Get_address(&(mall->numC), &displs[7]); //TODO Add only when MultipleSpawn strat active?
-MPI_Get_address(&(mall->gid), &displs[8]); //TODO Add only when ParallelSpawn strat active?
-MPI_Get_address(&(mall->num_cpus), &displs[9]);
-MPI_Get_address(&(mall->num_nodes), &displs[10]);
-MPI_Get_address(&(mall->nodelist_len), &displs[11]);
+MPI_Get_address(&(mall->root_parents), &displs[6]);
+MPI_Get_address(&(mall->num_parents), &displs[7]); //TODO Add only when Single strat active?
+MPI_Get_address(&(mall->numC), &displs[8]); //TODO Add only when MultipleSpawn strat active?
+MPI_Get_address(&(mall->gid), &displs[9]); //TODO Add only when ParallelSpawn strat active?
+MPI_Get_address(&(mall->num_cpus), &displs[10]);
+MPI_Get_address(&(mall->num_nodes), &displs[11]);
+MPI_Get_address(&(mall->nodelist_len), &displs[12]);
MPI_Type_create_struct(counts, blocklengths, displs, types, &mall->struct_type);
MPI_Type_commit(&mall->struct_type);
......
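The displacement bookkeeping above follows the usual MPI_Type_create_struct recipe: take the address of every field, then describe the whole struct to MPI. A self-contained sketch of the same pattern for a two-field struct (illustrative only, not MaM code; it rebases the displacements with MPI_Aint_diff, whereas MaM keeps absolute addresses):

#include <mpi.h>

typedef struct { int jid; double t; } rec_t;

void build_rec_type(rec_t *rec, MPI_Datatype *new_type) {
  int blocklengths[2] = {1, 1};
  MPI_Aint displs[2], base;
  MPI_Datatype types[2] = {MPI_INT, MPI_DOUBLE};

  /* Absolute addresses first, then rebase on the struct start so the
   * datatype can be used with the struct pointer as the buffer. */
  MPI_Get_address(rec, &base);
  MPI_Get_address(&rec->jid, &displs[0]);
  MPI_Get_address(&rec->t, &displs[1]);
  displs[0] = MPI_Aint_diff(displs[0], base);
  displs[1] = MPI_Aint_diff(displs[1], base);

  MPI_Type_create_struct(2, blocklengths, displs, types, new_type);
  MPI_Type_commit(new_type);
}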
......@@ -45,6 +45,7 @@ typedef struct {
unsigned int red_strategies;
int external_usage; // Whether a different application should be called by Spawn and which
+int slurm_jid; //DMR ADDITION - Which is the original Slurm JobId
malleability_times_t *times;
} malleability_config_t;
......
......@@ -248,11 +248,28 @@ void MAM_Resume_redistribution(int *mam_state) {
if(mam_state != NULL) *mam_state = MAM_PENDING;
}
+//BEGIN ADDED FOR DMR
+int MAM_DMR_Is_zombie(void) {
+  return mall->zombie;
+}
+
+void MAM_DMR_Update_nodelist(char *nodelist, int num_nodes) {
+  if(mall->nodelist != NULL) {
+    free(mall->nodelist);
+    mall->nodelist = NULL;
+  }
+  mall->nodelist_len = strlen(nodelist)+1;
+  mall->nodelist = (char *) malloc(mall->nodelist_len * sizeof(char));
+  strcpy(mall->nodelist, nodelist);
+  mall->num_nodes = num_nodes;
+}
+//END ADDED FOR DMR
/*
* TODO
*/
void MAM_Commit(int *mam_state) {
-int request_abort;
+//int request_abort; Removed for DMR
#if MAM_DEBUG
if(mall->myId == mall->root){ DEBUG_FUNC("Trying to commit", mall->myId, mall->numP); } fflush(stdout);
#endif
......@@ -273,10 +290,19 @@ void MAM_Commit(int *mam_state) {
#if MAM_DEBUG >= 1
DEBUG_FUNC("Is terminating as zombie", mall->myId, mall->numP); fflush(stdout);
#endif
+/* BEGIN REMOVED FOR DMR
request_abort = MAM_Finalize();
if(request_abort) { MPI_Abort(MPI_COMM_WORLD, -101); }
MPI_Finalize();
exit(0);
+END REMOVED FOR DMR
+*/
+//BEGIN ADDED FOR DMR
+if(mall->intercomm != MPI_COMM_NULL && mall->intercomm != MPI_COMM_WORLD) { MPI_Comm_disconnect(&(mall->intercomm)); }
+state = MAM_I_NOT_STARTED;
+if(mam_state != NULL) *mam_state = MAM_COMPLETED;
+return;
+//END ADDED FOR DMR
}
// Reset/Free communicators
......
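With the new commit path, zombie processes no longer finalize and exit inside MAM_Commit; they disconnect their intercommunicator and return with MAM_COMPLETED, so the caller decides their fate. A hypothetical application-side sketch of how a DMR runtime might drive the two new entry points (everything except the MAM_* calls is invented for illustration):

#include <stdlib.h>
#include <mpi.h>

/* Hypothetical DMR-side handling after MAM_Commit returns: a zombie is
 * either recycled with a fresh allocation or allowed to finish for real. */
void dmr_handle_zombie(char *new_nodelist, int new_num_nodes, int reuse) {
  if(!MAM_DMR_Is_zombie()) return; /* active ranks continue as usual */

  if(reuse) {
    /* Refresh MaM's view of the allocation before the next expansion. */
    MAM_DMR_Update_nodelist(new_nodelist, new_num_nodes);
  } else {
    MPI_Finalize(); /* what MAM_Commit itself used to do */
    exit(0);
  }
}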
......@@ -21,6 +21,8 @@ void MAM_Resume_redistribution(int *mam_state);
int MAM_Get_Reconf_Info(mam_user_reconf_t *reconf_info);
+int MAM_DMR_Is_zombie(void); //Added for DMR
+void MAM_DMR_Update_nodelist(char *nodelist, int num_nodes); //Added for DMR
void MAM_Data_add(void *data, size_t *index, size_t total_qty, MPI_Datatype type, int is_replicated, int is_constant);
void MAM_Data_modify(void *data, size_t index, size_t total_qty, MPI_Datatype type, int is_replicated, int is_constant);
......
......@@ -165,6 +165,7 @@ int MAM_I_get_hosts_info() {
free(unique_hosts);
}
+mall_conf->slurm_jid = 0; //Added for DMR
free(my_host);
return 0;
}
......@@ -208,6 +209,10 @@ int MAM_I_slurm_getenv_hosts_info() {
int cpus, count;
//int i, *cpus_counts, *nodes_counts, *aux;
tmp = getenv("SLURM_JOB_ID");
if(tmp == NULL) return 1;
mall_conf->slurm_jid = atoi(tmp); //Modified for DMR
tmp = getenv("SLURM_JOB_NUM_NODES");
if(tmp == NULL) return 1;
mall->num_nodes = atoi(tmp);
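One caveat with this pattern: atoi() silently returns 0 on malformed input, which is indistinguishable from a legitimate value of 0. If stricter parsing were ever wanted, a small strtol-based sketch (not part of this commit):

#include <stdlib.h>

/* Parse a decimal environment variable; returns -1 if missing or garbled. */
static int parse_env_int(const char *name) {
  char *end;
  const char *tmp = getenv(name);
  if(tmp == NULL || *tmp == '\0') return -1;
  long v = strtol(tmp, &end, 10);
  return (*end == '\0') ? (int) v : -1;
}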
......@@ -270,16 +275,16 @@ int MAM_I_slurm_getenv_hosts_info() {
* FIXME Does not consider heterogeneous machines
*/
int MAM_I_slurm_getjob_hosts_info() {
-int jobId, err;
+int err;
char *tmp = NULL;
job_info_msg_t *j_info;
slurm_job_info_t last_record;
tmp = getenv("SLURM_JOB_ID");
if(tmp == NULL) return 1;
-jobId = atoi(tmp);
+mall_conf->slurm_jid = atoi(tmp); //Modified for DMR
-err = slurm_load_job(&j_info, jobId, 1); // FIXME Valgrind Not freed
+err = slurm_load_job(&j_info, mall_conf->slurm_jid, 1); // FIXME Valgrind Not freed //Modified for DMR
if(err) return err;
last_record = j_info->job_array[j_info->record_count - 1];
......
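On the Valgrind FIXME above: Slurm pairs slurm_load_job() with slurm_free_job_info_msg() to release the returned message. A sketch of how the call site could be closed off, assuming the needed fields (e.g. last_record.nodes) are copied out before freeing:

err = slurm_load_job(&j_info, mall_conf->slurm_jid, 1);
if(err) return err;
last_record = j_info->job_array[j_info->record_count - 1];
/* ... copy the fields that are still needed ... */
slurm_free_job_info_msg(j_info); /* releases the message slurm_load_job allocated */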
......@@ -4,11 +4,16 @@ MCC = mpicc
C_FLAGS = -Wall -Wextra -Wshadow -Wfatal-errors
LD_FLAGS = -lm -pthread
-MAM_USE_SLURM ?= 0
+MAM_USE_SLURM ?= 1
MAM_USE_BARRIERS ?= 0
MAM_DEBUG ?= 0
DEF = -DMAM_USE_SLURM=$(MAM_USE_SLURM) -DMAM_USE_BARRIERS=$(MAM_USE_BARRIERS) -DMAM_DEBUG=$(MAM_DEBUG)
+ifdef DMR_PATH
+MPIFLAGS = -I$(MPI_PATH)/include -L$(MPI_PATH)/lib
+SLURMFLAGS = -I$(SLURM_ROOT)/include -L$(SLURM_ROOT)/lib
+endif
ifeq ($(MAM_USE_SLURM),1)
LD_FLAGS += -lslurm
endif
......@@ -52,7 +57,7 @@ $(LIB) : $(BUILD_DIR)/$(LIB)
# Actual target of the binary - depends on all .o files.
$(BUILD_DIR)/$(LIB) : $(OBJ)
-	$(MCC) $(C_FLAGS) $^ -shared -o $@ $(LD_FLAGS)
+	$(MCC) $(C_FLAGS) $(MPIFLAGS) $(SLURMFLAGS) $^ -shared -o $@ $(LD_FLAGS)
# Include all .d files
# .d files are used for knowing the dependencies of each source file
......@@ -65,4 +70,4 @@ $(BUILD_DIR)/$(LIB) : $(OBJ)
# the same name as the .o file.
$(BUILD_DIR)/%.o : %.c
@mkdir -p $(@D)
-	$(MCC) $(C_FLAGS) $(DEF) -fpic -MMD -c $< -o $@
+	$(MCC) $(C_FLAGS) $(MPIFLAGS) $(SLURMFLAGS) $(DEF) -fpic -MMD -c $< -o $@
......@@ -53,8 +53,7 @@ void open_port(Spawn_ports *spawn_port, int open_port, int open_service)
if (open_service != MAM_SERVICE_UNNEEDED) {
spawn_port->service_name = (char *)malloc((MAM_SERVICE_NAME_SIZE) * sizeof(char));
#if MAM_USE_SLURM
-char *tmp = getenv("SLURM_JOB_ID");
-if(tmp != NULL) { job_id = atoi(tmp)%1000; }
+job_id = mall_conf->slurm_jid%1000; //Modified for DMR
#endif
snprintf(spawn_port->service_name, MAM_SERVICE_NAME_SIZE, "mam_service_jid%04d_gr%03d", job_id, open_service);
MPI_Publish_name(spawn_port->service_name, MPI_INFO_NULL, spawn_port->port_name);
......@@ -128,8 +127,7 @@ void discover_remote_port(int id_group, Spawn_ports *spawn_port) {
if(spawn_port->remote_service == NULL) { //First discover
spawn_port->remote_service = (char*) malloc(MAM_SERVICE_NAME_SIZE * sizeof(char));
#if MAM_USE_SLURM
-char *tmp = getenv("SLURM_JOB_ID");
-if(tmp != NULL) { job_id = atoi(tmp)%1000; }
+job_id = mall_conf->slurm_jid%1000; //Modified for DMR
#endif
snprintf(spawn_port->remote_service, MAM_SERVICE_NAME_SIZE, "mam_service_jid%04d_gr%03d", job_id, id_group);
} else { // For subsequent lookups, only update the variable part (group ID) of the service name.
......
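Both snippets above rebuild the same service name from the cached job id instead of re-reading the environment, so publisher and subscriber stay consistent even after the original Slurm job has ended. For reference, the counterpart that consumes such a published name uses the standard MPI dynamic-process calls; a minimal sketch (MAM_SERVICE_NAME_SIZE comes from the MaM headers, the function itself is illustrative):

#include <mpi.h>
#include <stdio.h>

/* Resolve a published MaM-style service name and connect to it. Assumes
 * MPI_ERRORS_RETURN is set, since MPI_Lookup_name raises an error when
 * the name is not (yet) published. */
int connect_to_group(int job_id, int group_id, MPI_Comm *newcomm) {
  char service[MAM_SERVICE_NAME_SIZE];
  char port[MPI_MAX_PORT_NAME];

  snprintf(service, sizeof(service), "mam_service_jid%04d_gr%03d",
           job_id % 1000, group_id);
  if(MPI_Lookup_name(service, MPI_INFO_NULL, port) != MPI_SUCCESS) return 1;
  return MPI_Comm_connect(port, MPI_INFO_NULL, 0, MPI_COMM_SELF, newcomm);
}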
......@@ -474,8 +474,7 @@ void generate_info_hostfile_slurm(char *nodelist, int *qty, size_t used_nodes, S
char *hostfile_name, *line;
hostlist_t hostlist;
-char *tmp = getenv("SLURM_JOB_ID");
-jid = tmp != NULL ? (atoi(tmp)%1000) : 0;
+jid = mall_conf->slurm_jid%1000; //Modified for DMR
line = NULL;
hostlist = slurm_hostlist_create(nodelist);
......
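For context, the hostlist handle created above is normally walked host by host and then destroyed with the matching Slurm calls; a minimal sketch of that idiom (standard Slurm hostlist API, not part of this diff):

hostlist_t hl = slurm_hostlist_create(nodelist); /* expands e.g. "node[01-04]" */
char *host;
while((host = slurm_hostlist_shift(hl)) != NULL) {
  /* ... emit one hostfile line per host ... */
  free(host); /* slurm_hostlist_shift returns malloc'd strings */
}
slurm_hostlist_destroy(hl);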
......@@ -129,7 +129,7 @@ void hypercube_spawn(int group_id, int groups, int init_nodes, int init_step,
MPI_Comm **spawn_comm, int *qty_comms) {
int i, aux_sum, actual_step;
int next_group_id, actual_nodes;
-int jid=0, n=0;
+int n=0;
char *file_name = NULL;
Spawn_set set;
......@@ -144,14 +144,10 @@ void hypercube_spawn(int group_id, int groups, int init_nodes, int init_step,
}
//if(mall->myId == 0)printf("T1 P%d+%d step=%d next_id=%d aux_sum=%d actual_nodes=%d comms=%d\n", mall->myId, group_id, actual_step, next_group_id, aux_sum, actual_nodes, *qty_comms);
-#if MAM_USE_SLURM
-char *tmp = getenv("SLURM_JOB_ID");
-if(tmp != NULL) { jid = atoi(tmp); }
-#endif
set.cmd = get_spawn_cmd();
i = 0;
while(next_group_id < groups - init_nodes) {
-set_hostfile_name(&file_name, &n, jid, next_group_id);
+set_hostfile_name(&file_name, &n, mall_conf->slurm_jid, next_group_id); //Modified for DMR
//read_hostfile_procs(file_name, &set.spawn_qty);
set.spawn_qty = mall->num_cpus;
MPI_Info_create(&set.mapping);
......
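set_hostfile_name() now derives the per-group hostfile from the cached job id, and the mapping info object created right after is what ties the spawn to those nodes. As an illustration of that last step, Open MPI accepts the hostfile through an info key (implementation-specific, not portable MPI):

MPI_Info_create(&set.mapping);
/* Open MPI honors the "hostfile" info key on MPI_Comm_spawn; other MPI
 * implementations use different keys, so this part is not portable. */
MPI_Info_set(set.mapping, "hostfile", file_name);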