Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Iker Martín Álvarez
Proteo
Commits
6a71bbf2
Commit
6a71bbf2
authored
Nov 07, 2024
by
iker_martin
Browse files
Added modifications for MaM usage in DMR
parent
c9941842
Changes
10
Show whitespace changes
Inline
Side-by-side
Codes/MaM/MAM_Configuration.c
View file @
6a71bbf2
...
...
@@ -227,6 +227,21 @@ void MAM_Check_configuration() {
MAM_Set_key_configuration
(
MAM_SPAWN_METHOD
,
MAM_SPAWN_BASELINE
,
NULL
);
}
// BEGIN ADDED FOR DMR
char
*
tmp
=
getenv
(
MAM_SPAWN_METHOD_ENV
);
int
tmp_value
=
mall_conf
->
spawn_method
;
if
(
tmp
!=
NULL
)
{
tmp_value
=
atoi
(
tmp
);
}
if
(
mall_conf
->
spawn_method
!=
(
size_t
)
tmp_value
)
{
MAM_Set_key_configuration
(
MAM_SPAWN_METHOD
,
tmp_value
,
NULL
);
}
if
(
!
MAM_Contains_strat
(
MAM_SPAWN_STRATEGIES
,
MAM_STRAT_SPAWN_PARALLEL
,
NULL
)
&&
mall_conf
->
spawn_method
==
MAM_SPAWN_MERGE
)
{
MAM_I_set_spawn_strat
(
MAM_STRAT_SPAWN_PARALLEL
,
&
mall_conf
->
spawn_strategies
);
}
// END ADDED FOR DMR
MPI_Allreduce
(
&
mall
->
internode_group
,
&
global_internodes
,
1
,
MPI_INT
,
MPI_MAX
,
mall
->
comm
);
if
((
MAM_Contains_strat
(
MAM_SPAWN_STRATEGIES
,
MAM_STRAT_SPAWN_MULTIPLE
,
NULL
)
||
MAM_Contains_strat
(
MAM_SPAWN_STRATEGIES
,
MAM_STRAT_SPAWN_PARALLEL
,
NULL
)
)
...
...
Codes/MaM/MAM_DataStructures.c
View file @
6a71bbf2
...
...
@@ -9,7 +9,7 @@ int state = MAM_I_UNRESERVED;
* de MaM.
*/
void
MAM_Def_main_datatype
()
{
int
i
,
counts
=
1
2
;
int
i
,
counts
=
1
3
;
int
blocklengths
[
counts
];
MPI_Aint
displs
[
counts
];
MPI_Datatype
types
[
counts
];
...
...
@@ -29,14 +29,15 @@ void MAM_Def_main_datatype() {
MPI_Get_address
(
&
(
mall_conf
->
spawn_dist
),
&
displs
[
2
]);
MPI_Get_address
(
&
(
mall_conf
->
red_method
),
&
displs
[
3
]);
MPI_Get_address
(
&
(
mall_conf
->
red_strategies
),
&
displs
[
4
]);
MPI_Get_address
(
&
(
mall_conf
->
slurm_jid
),
&
displs
[
5
]);
//DMR ADDITION - Which is the original Slurm JobId
MPI_Get_address
(
&
(
mall
->
root_parents
),
&
displs
[
5
]);
MPI_Get_address
(
&
(
mall
->
num_parents
),
&
displs
[
6
]);
//TODO Add only when Single strat active?
MPI_Get_address
(
&
(
mall
->
numC
),
&
displs
[
7
]);
//TODO Add only when MultipleSpawn strat active?
MPI_Get_address
(
&
(
mall
->
gid
),
&
displs
[
8
]);
//TODO Add only when ParallelSpawn strat active?
MPI_Get_address
(
&
(
mall
->
num_cpus
),
&
displs
[
9
]);
MPI_Get_address
(
&
(
mall
->
num_nodes
),
&
displs
[
1
0
]);
MPI_Get_address
(
&
(
mall
->
nodelist_len
),
&
displs
[
1
1
]);
MPI_Get_address
(
&
(
mall
->
root_parents
),
&
displs
[
6
]);
MPI_Get_address
(
&
(
mall
->
num_parents
),
&
displs
[
7
]);
//TODO Add only when Single strat active?
MPI_Get_address
(
&
(
mall
->
numC
),
&
displs
[
8
]);
//TODO Add only when MultipleSpawn strat active?
MPI_Get_address
(
&
(
mall
->
gid
),
&
displs
[
9
]);
//TODO Add only when ParallelSpawn strat active?
MPI_Get_address
(
&
(
mall
->
num_cpus
),
&
displs
[
10
]);
MPI_Get_address
(
&
(
mall
->
num_nodes
),
&
displs
[
1
1
]);
MPI_Get_address
(
&
(
mall
->
nodelist_len
),
&
displs
[
1
2
]);
MPI_Type_create_struct
(
counts
,
blocklengths
,
displs
,
types
,
&
mall
->
struct_type
);
MPI_Type_commit
(
&
mall
->
struct_type
);
...
...
Codes/MaM/MAM_DataStructures.h
View file @
6a71bbf2
...
...
@@ -45,6 +45,7 @@ typedef struct {
unsigned
int
red_strategies
;
int
external_usage
;
// Whether a different application should be called by Spawn and which
int
slurm_jid
;
//DMR ADDITION - Which is the original Slurm JobId
malleability_times_t
*
times
;
}
malleability_config_t
;
...
...
Codes/MaM/MAM_Manager.c
View file @
6a71bbf2
...
...
@@ -248,11 +248,28 @@ void MAM_Resume_redistribution(int *mam_state) {
if
(
mam_state
!=
NULL
)
*
mam_state
=
MAM_PENDING
;
}
//BEGIN ADDED FOR DMR
/*
 * Reports the zombie flag kept in the global malleability state.
 *
 * Returns the current value of mall->zombie, unmodified.
 * NOTE(review): presumably non-zero means this process is a zombie;
 * confirm against the definition of the "zombie" field.
 */
int MAM_DMR_Is_zombie() {
  const int zombie_flag = mall->zombie;
  return zombie_flag;
}
/*
 * Replaces the node list stored in the global malleability state.
 *
 * A private copy of "nodelist" is stored in mall->nodelist, its size in
 * bytes (terminating '\0' included) in mall->nodelist_len, and "num_nodes"
 * in mall->num_nodes. The previously stored list, if any, is released.
 *
 * The copy is built BEFORE the old list is released, so the call is safe
 * even if "nodelist" points to mall->nodelist itself. If "nodelist" is
 * NULL or the copy cannot be allocated, the stored state is left untouched.
 *
 * - nodelist  (IN): '\0'-terminated string to copy. Ownership stays with
 *                   the caller.
 * - num_nodes (IN): Value stored as-is in mall->num_nodes.
 */
void MAM_DMR_Update_nodelist(char *nodelist, int num_nodes) {
  size_t len;
  char *copy;

  // Nothing to copy from; keep the current list instead of crashing in strlen
  if(nodelist == NULL) return;

  len = strlen(nodelist) + 1; // +1 for the terminating '\0'
  copy = (char *) malloc(len * sizeof(char));
  // Out of memory: keep the previous (still valid) list
  if(copy == NULL) return;
  strcpy(copy, nodelist); // "copy" holds exactly len bytes

  // The old buffer is released only after the source has been read
  free(mall->nodelist); // free(NULL) is a no-op
  mall->nodelist = copy;
  mall->nodelist_len = len;
  mall->num_nodes = num_nodes;
}
//END ADDED FOR DMR
/*
* TODO
*/
void
MAM_Commit
(
int
*
mam_state
)
{
int
request_abort
;
//
int request_abort;
Removed for DMR
#if MAM_DEBUG
if
(
mall
->
myId
==
mall
->
root
){
DEBUG_FUNC
(
"Trying to commit"
,
mall
->
myId
,
mall
->
numP
);
}
fflush
(
stdout
);
#endif
...
...
@@ -273,10 +290,19 @@ void MAM_Commit(int *mam_state) {
#if MAM_DEBUG >= 1
DEBUG_FUNC
(
"Is terminating as zombie"
,
mall
->
myId
,
mall
->
numP
);
fflush
(
stdout
);
#endif
/* BEGIN REMOVED FOR DMR
request_abort = MAM_Finalize();
if(request_abort) { MPI_Abort(MPI_COMM_WORLD, -101); }
MPI_Finalize();
exit(0);
END REMOVED FOR DMR
*/
//BEGIN ADDED FOR DMR
if
(
mall
->
intercomm
!=
MPI_COMM_NULL
&&
mall
->
intercomm
!=
MPI_COMM_WORLD
)
{
MPI_Comm_disconnect
(
&
(
mall
->
intercomm
));
}
state
=
MAM_I_NOT_STARTED
;
if
(
mam_state
!=
NULL
)
*
mam_state
=
MAM_COMPLETED
;
return
;
//END ADDED FOR DMR
}
// Reset/Free communicators
...
...
Codes/MaM/MAM_Manager.h
View file @
6a71bbf2
...
...
@@ -21,6 +21,8 @@ void MAM_Resume_redistribution(int *mam_state);
int
MAM_Get_Reconf_Info
(
mam_user_reconf_t
*
reconf_info
);
int
MAM_DMR_Is_zombie
();
//Added for DMR
void
MAM_DMR_Update_nodelist
(
char
*
nodelist
,
int
num_nodes
);
//Added for DMR
void
MAM_Data_add
(
void
*
data
,
size_t
*
index
,
size_t
total_qty
,
MPI_Datatype
type
,
int
is_replicated
,
int
is_constant
);
void
MAM_Data_modify
(
void
*
data
,
size_t
index
,
size_t
total_qty
,
MPI_Datatype
type
,
int
is_replicated
,
int
is_constant
);
...
...
Codes/MaM/MAM_RMS.c
View file @
6a71bbf2
...
...
@@ -165,6 +165,7 @@ int MAM_I_get_hosts_info() {
free
(
unique_hosts
);
}
mall_conf
->
slurm_jid
=
0
;
//Added for DMR
free
(
my_host
);
return
0
;
}
...
...
@@ -208,6 +209,10 @@ int MAM_I_slurm_getenv_hosts_info() {
int
cpus
,
count
;
//int i, *cpus_counts, *nodes_counts, *aux;
tmp
=
getenv
(
"SLURM_JOB_ID"
);
if
(
tmp
==
NULL
)
return
1
;
mall_conf
->
slurm_jid
=
atoi
(
tmp
);
//Modified for DMR
tmp
=
getenv
(
"SLURM_JOB_NUM_NODES"
);
if
(
tmp
==
NULL
)
return
1
;
mall
->
num_nodes
=
atoi
(
tmp
);
...
...
@@ -270,16 +275,16 @@ int MAM_I_slurm_getenv_hosts_info() {
* FIXME Does not consider heterogeneous machines
*/
int
MAM_I_slurm_getjob_hosts_info
()
{
int
jobId
,
err
;
int
err
;
char
*
tmp
=
NULL
;
job_info_msg_t
*
j_info
;
slurm_job_info_t
last_record
;
tmp
=
getenv
(
"SLURM_JOB_ID"
);
if
(
tmp
==
NULL
)
return
1
;
jobId
=
atoi
(
tmp
);
mall_conf
->
slurm_jid
=
atoi
(
tmp
);
//Modified for DMR
err
=
slurm_load_job
(
&
j_info
,
jobI
d
,
1
);
// FIXME Valgrind Not freed
err
=
slurm_load_job
(
&
j_info
,
mall_conf
->
slurm_ji
d
,
1
);
// FIXME Valgrind Not freed
//Modified for DMR
if
(
err
)
return
err
;
last_record
=
j_info
->
job_array
[
j_info
->
record_count
-
1
];
...
...
Codes/MaM/Makefile
View file @
6a71bbf2
...
...
@@ -4,11 +4,16 @@ MCC = mpicc
C_FLAGS
=
-Wall
-Wextra
-Wshadow
-Wfatal-errors
LD_FLAGS
=
-lm
-pthread
MAM_USE_SLURM
?=
0
MAM_USE_SLURM
?=
1
MAM_USE_BARRIERS
?=
0
MAM_DEBUG
?=
0
DEF
=
-DMAM_USE_SLURM
=
$(MAM_USE_SLURM)
-DMAM_USE_BARRIERS
=
$(MAM_USE_BARRIERS)
-DMAM_DEBUG
=
$(MAM_DEBUG)
ifdef
DMR_PATH
MPIFLAGS
=
-I
$(MPI_PATH)
/include
-L
$(MPI_PATH)
/lib
SLURMFLAGS
=
-I
$(SLURM_ROOT)
/include
-L
$(SLURM_ROOT)
/lib
endif
ifeq
($(MAM_USE_SLURM),1)
LD_FLAGS
+=
-lslurm
endif
...
...
@@ -52,7 +57,7 @@ $(LIB) : $(BUILD_DIR)/$(LIB)
# Actual target of the binary - depends on all .o files.
$(BUILD_DIR)/$(LIB)
:
$(OBJ)
$(MCC)
$(C_FLAGS)
$^
-shared
-o
$@
$(LD_FLAGS)
$(MCC)
$(C_FLAGS)
$(MPIFLAGS)
$(SLURMFLAGS)
$^
-shared
-o
$@
$(LD_FLAGS)
# Include all .d files
# .d files are used for knowing the dependencies of each source file
...
...
@@ -65,4 +70,4 @@ $(BUILD_DIR)/$(LIB) : $(OBJ)
# the same name as the .o file.
$(BUILD_DIR)/%.o
:
%.c
@
mkdir
-p
$
(
@D
)
$(MCC)
$(C_FLAGS)
$(DEF)
-fpic
-MMD
-c
$<
-o
$@
$(MCC)
$(C_FLAGS)
$(MPIFLAGS)
$(SLURMFLAGS)
$(DEF)
-fpic
-MMD
-c
$<
-o
$@
Codes/MaM/spawn_methods/PortService.c
View file @
6a71bbf2
...
...
@@ -53,8 +53,7 @@ void open_port(Spawn_ports *spawn_port, int open_port, int open_service)
if
(
open_service
!=
MAM_SERVICE_UNNEEDED
)
{
spawn_port
->
service_name
=
(
char
*
)
malloc
((
MAM_SERVICE_NAME_SIZE
)
*
sizeof
(
char
));
#if MAM_USE_SLURM
char
*
tmp
=
getenv
(
"SLURM_JOB_ID"
);
if
(
tmp
!=
NULL
)
{
job_id
=
atoi
(
tmp
)
%
1000
;
}
job_id
=
mall_conf
->
slurm_jid
%
1000
;
//Modified for DMR
#endif
snprintf
(
spawn_port
->
service_name
,
MAM_SERVICE_NAME_SIZE
,
"mam_service_jid%04d_gr%03d"
,
job_id
,
open_service
);
MPI_Publish_name
(
spawn_port
->
service_name
,
MPI_INFO_NULL
,
spawn_port
->
port_name
);
...
...
@@ -128,8 +127,7 @@ void discover_remote_port(int id_group, Spawn_ports *spawn_port) {
if
(
spawn_port
->
remote_service
==
NULL
)
{
//First discover
spawn_port
->
remote_service
=
(
char
*
)
malloc
(
MAM_SERVICE_NAME_SIZE
*
sizeof
(
char
));
#if MAM_USE_SLURM
char
*
tmp
=
getenv
(
"SLURM_JOB_ID"
);
if
(
tmp
!=
NULL
)
{
job_id
=
atoi
(
tmp
)
%
1000
;
}
job_id
=
mall_conf
->
slurm_jid
%
1000
;
//Modified for DMR
#endif
snprintf
(
spawn_port
->
remote_service
,
MAM_SERVICE_NAME_SIZE
,
"mam_service_jid%04d_gr%03d"
,
job_id
,
id_group
);
}
else
{
// For subsequent lookups, only update the variable part (group ID) of the service name.
...
...
Codes/MaM/spawn_methods/ProcessDist.c
View file @
6a71bbf2
...
...
@@ -474,8 +474,7 @@ void generate_info_hostfile_slurm(char *nodelist, int *qty, size_t used_nodes, S
char
*
hostfile_name
,
*
line
;
hostlist_t
hostlist
;
char
*
tmp
=
getenv
(
"SLURM_JOB_ID"
);
jid
=
tmp
!=
NULL
?
(
atoi
(
tmp
)
%
1000
)
:
0
;
jid
=
mall_conf
->
slurm_jid
%
1000
;
//Modified for DMR
line
=
NULL
;
hostlist
=
slurm_hostlist_create
(
nodelist
);
...
...
Codes/MaM/spawn_methods/Strategy_Parallel.c
View file @
6a71bbf2
...
...
@@ -129,7 +129,7 @@ void hypercube_spawn(int group_id, int groups, int init_nodes, int init_step,
MPI_Comm
**
spawn_comm
,
int
*
qty_comms
)
{
int
i
,
aux_sum
,
actual_step
;
int
next_group_id
,
actual_nodes
;
int
jid
=
0
,
n
=
0
;
int
n
=
0
;
char
*
file_name
=
NULL
;
Spawn_set
set
;
...
...
@@ -144,14 +144,10 @@ void hypercube_spawn(int group_id, int groups, int init_nodes, int init_step,
}
//if(mall->myId == 0)printf("T1 P%d+%d step=%d next_id=%d aux_sum=%d actual_nodes=%d comms=%d\n", mall->myId, group_id, actual_step, next_group_id, aux_sum, actual_nodes, *qty_comms);
#if MAM_USE_SLURM
char
*
tmp
=
getenv
(
"SLURM_JOB_ID"
);
if
(
tmp
!=
NULL
)
{
jid
=
atoi
(
tmp
);
}
#endif
set
.
cmd
=
get_spawn_cmd
();
i
=
0
;
while
(
next_group_id
<
groups
-
init_nodes
)
{
set_hostfile_name
(
&
file_name
,
&
n
,
jid
,
next_group_id
);
set_hostfile_name
(
&
file_name
,
&
n
,
mall_conf
->
slurm_
jid
,
next_group_id
);
//Modified for DMR
//read_hostfile_procs(file_name, &set.spawn_qty);
set
.
spawn_qty
=
mall
->
num_cpus
;
MPI_Info_create
(
&
set
.
mapping
);
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment