Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Iker Martín Álvarez
Proteo
Commits
023318fb
Commit
023318fb
authored
Jan 24, 2024
by
iker_martin
Browse files
Added functions to automatically discover node info. Other minor changes.
parent
69193a34
Changes
6
Hide whitespace changes
Inline
Side-by-side
Codes/Main/Main.c
View file @
023318fb
...
@@ -48,15 +48,6 @@ int main(int argc, char *argv[]) {
...
@@ -48,15 +48,6 @@ int main(int argc, char *argv[]) {
int
im_child
;
int
im_child
;
int
abort_needed
=
0
;
int
abort_needed
=
0
;
int
num_cpus
,
num_nodes
;
char
*
nodelist
=
NULL
;
num_cpus
=
20
;
//FIXME NUMERO MAGICO //TODO Usar openMP para obtener el valor con un pragma
if
(
argc
>=
5
)
{
nodelist
=
argv
[
3
];
num_nodes
=
atoi
(
argv
[
4
]);
num_cpus
=
num_nodes
*
num_cpus
;
}
MPI_Init_thread
(
&
argc
,
&
argv
,
MPI_THREAD_MULTIPLE
,
&
req
);
MPI_Init_thread
(
&
argc
,
&
argv
,
MPI_THREAD_MULTIPLE
,
&
req
);
MPI_Comm_rank
(
MPI_COMM_WORLD
,
&
myId
);
MPI_Comm_rank
(
MPI_COMM_WORLD
,
&
myId
);
MPI_Comm_size
(
MPI_COMM_WORLD
,
&
numP
);
MPI_Comm_size
(
MPI_COMM_WORLD
,
&
numP
);
...
@@ -70,7 +61,7 @@ int main(int argc, char *argv[]) {
...
@@ -70,7 +61,7 @@ int main(int argc, char *argv[]) {
}
}
init_group_struct
(
argv
,
argc
,
myId
,
numP
);
init_group_struct
(
argv
,
argc
,
myId
,
numP
);
im_child
=
MAM_Init
(
ROOT
,
&
comm
,
argv
[
0
],
nodelist
,
num_cpus
,
num_nodes
,
user_redistribution
,
NULL
);
im_child
=
MAM_Init
(
ROOT
,
&
comm
,
argv
[
0
],
user_redistribution
,
NULL
);
if
(
im_child
)
{
if
(
im_child
)
{
update_targets
();
update_targets
();
...
...
Codes/malleability/malleabilityManager.c
View file @
023318fb
...
@@ -6,6 +6,7 @@
...
@@ -6,6 +6,7 @@
#include "malleabilityTypes.h"
#include "malleabilityTypes.h"
#include "malleabilityZombies.h"
#include "malleabilityZombies.h"
#include "malleabilityTimes.h"
#include "malleabilityTimes.h"
#include "malleabilityRMS.h"
#include "spawn_methods/GenericSpawn.h"
#include "spawn_methods/GenericSpawn.h"
#include "CommDist.h"
#include "CommDist.h"
...
@@ -18,7 +19,8 @@ void send_data(int numP_children, malleability_data_t *data_struct, int is_async
...
@@ -18,7 +19,8 @@ void send_data(int numP_children, malleability_data_t *data_struct, int is_async
void
recv_data
(
int
numP_parents
,
malleability_data_t
*
data_struct
,
int
is_asynchronous
);
void
recv_data
(
int
numP_parents
,
malleability_data_t
*
data_struct
,
int
is_asynchronous
);
int
MAM_St_not_started
(
int
*
mam_state
);
int
MAM_St_rms
(
int
*
mam_state
);
int
MAM_St_spawn_start
();
int
MAM_St_spawn_pending
(
int
wait_completed
);
int
MAM_St_spawn_pending
(
int
wait_completed
);
int
MAM_St_red_start
();
int
MAM_St_red_start
();
int
MAM_St_red_pending
(
int
*
mam_state
,
int
wait_completed
);
int
MAM_St_red_pending
(
int
*
mam_state
,
int
wait_completed
);
...
@@ -66,7 +68,7 @@ mam_user_reconf_t *user_reconf;
...
@@ -66,7 +68,7 @@ mam_user_reconf_t *user_reconf;
* la comunicacion los procesos hijo estan preparados para ejecutar la
* la comunicacion los procesos hijo estan preparados para ejecutar la
* aplicacion.
* aplicacion.
*/
*/
int
MAM_Init
(
int
root
,
MPI_Comm
*
comm
,
char
*
name_exec
,
char
*
nodelist
,
int
num_cpus
,
int
num_nodes
,
void
(
*
user_function
)(
void
*
),
void
*
user_args
)
{
int
MAM_Init
(
int
root
,
MPI_Comm
*
comm
,
char
*
name_exec
,
void
(
*
user_function
)(
void
*
),
void
*
user_args
)
{
MPI_Comm
dup_comm
,
thread_comm
;
MPI_Comm
dup_comm
,
thread_comm
;
mall_conf
=
(
malleability_config_t
*
)
malloc
(
sizeof
(
malleability_config_t
));
mall_conf
=
(
malleability_config_t
*
)
malloc
(
sizeof
(
malleability_config_t
));
...
@@ -99,9 +101,7 @@ int MAM_Init(int root, MPI_Comm *comm, char *name_exec, char *nodelist, int num_
...
@@ -99,9 +101,7 @@ int MAM_Init(int root, MPI_Comm *comm, char *name_exec, char *nodelist, int num_
mall
->
tmp_comm
=
MPI_COMM_NULL
;
mall
->
tmp_comm
=
MPI_COMM_NULL
;
mall
->
name_exec
=
name_exec
;
mall
->
name_exec
=
name_exec
;
mall
->
nodelist
=
nodelist
;
mall
->
nodelist
=
NULL
;
mall
->
num_cpus
=
num_cpus
;
mall
->
num_nodes
=
num_nodes
;
rep_s_data
->
entries
=
0
;
rep_s_data
->
entries
=
0
;
rep_a_data
->
entries
=
0
;
rep_a_data
->
entries
=
0
;
...
@@ -116,24 +116,18 @@ int MAM_Init(int root, MPI_Comm *comm, char *name_exec, char *nodelist, int num_
...
@@ -116,24 +116,18 @@ int MAM_Init(int root, MPI_Comm *comm, char *name_exec, char *nodelist, int num_
// Si son el primer grupo de procesos, obtienen los datos de los padres
// Si son el primer grupo de procesos, obtienen los datos de los padres
MPI_Comm_get_parent
(
&
(
mall
->
intercomm
));
MPI_Comm_get_parent
(
&
(
mall
->
intercomm
));
if
(
mall
->
intercomm
!=
MPI_COMM_NULL
)
{
if
(
mall
->
intercomm
!=
MPI_COMM_NULL
)
{
Children_init
(
user_function
,
user_args
);
Children_init
(
user_function
,
user_args
);
return
MALLEABILITY_CHILDREN
;
return
MALLEABILITY_CHILDREN
;
}
}
MAM_check_hosts
();
#if USE_MAL_BARRIERS && USE_MAL_DEBUG
#if USE_MAL_BARRIERS && USE_MAL_DEBUG
if
(
mall
->
myId
==
mall
->
root
)
if
(
mall
->
myId
==
mall
->
root
)
printf
(
"MaM: Using barriers to record times.
\n
"
);
printf
(
"MaM: Using barriers to record times.
\n
"
);
#endif
#endif
if
(
nodelist
!=
NULL
)
{
//TODO To be deprecated by using Slurm or else statement
mall
->
nodelist_len
=
strlen
(
nodelist
);
}
else
{
// If no nodelist is detected, get it from the actual run
mall
->
nodelist
=
malloc
(
MPI_MAX_PROCESSOR_NAME
*
sizeof
(
char
));
MPI_Get_processor_name
(
mall
->
nodelist
,
&
mall
->
nodelist_len
);
//TODO Get name of each process and create real nodelist
}
#if USE_MAL_DEBUG
#if USE_MAL_DEBUG
DEBUG_FUNC
(
"MaM has been initialized correctly as parents"
,
mall
->
myId
,
mall
->
numP
);
fflush
(
stdout
);
MPI_Barrier
(
*
comm
);
DEBUG_FUNC
(
"MaM has been initialized correctly as parents"
,
mall
->
myId
,
mall
->
numP
);
fflush
(
stdout
);
MPI_Barrier
(
*
comm
);
#endif
#endif
...
@@ -191,7 +185,10 @@ int MAM_Checkpoint(int *mam_state, int wait_completed, void (*user_function)(voi
...
@@ -191,7 +185,10 @@ int MAM_Checkpoint(int *mam_state, int wait_completed, void (*user_function)(voi
*
mam_state
=
MAM_UNRESERVED
;
*
mam_state
=
MAM_UNRESERVED
;
break
;
break
;
case
MALL_NOT_STARTED
:
case
MALL_NOT_STARTED
:
call_checkpoint
=
MAM_St_not_started
(
mam_state
);
call_checkpoint
=
MAM_St_rms
(
mam_state
);
break
;
case
MALL_RMS_COMPLETED
:
call_checkpoint
=
MAM_St_spawn_start
();
break
;
break
;
case
MALL_SPAWN_PENDING
:
// Comprueba si el spawn ha terminado
case
MALL_SPAWN_PENDING
:
// Comprueba si el spawn ha terminado
...
@@ -567,8 +564,9 @@ void recv_data(int numP_parents, malleability_data_t *data_struct, int is_asynch
...
@@ -567,8 +564,9 @@ void recv_data(int numP_parents, malleability_data_t *data_struct, int is_asynch
//======================================================||
//======================================================||
//======================================================||
//======================================================||
int
MAM_St_
not_started
(
int
*
mam_state
)
{
int
MAM_St_
rms
(
int
*
mam_state
)
{
*
mam_state
=
MAM_NOT_STARTED
;
*
mam_state
=
MAM_NOT_STARTED
;
state
=
MALL_RMS_COMPLETED
;
reset_malleability_times
();
reset_malleability_times
();
// Comprobar si se tiene que realizar un redimensionado
// Comprobar si se tiene que realizar un redimensionado
...
@@ -577,7 +575,10 @@ int MAM_St_not_started(int *mam_state) {
...
@@ -577,7 +575,10 @@ int MAM_St_not_started(int *mam_state) {
#endif
#endif
mall_conf
->
times
->
malleability_start
=
MPI_Wtime
();
mall_conf
->
times
->
malleability_start
=
MPI_Wtime
();
//if(CHECK_RMS()) {return MALL_DENIED;}
//if(CHECK_RMS()) {return MALL_DENIED;}
return
1
;
}
int
MAM_St_spawn_start
()
{
state
=
spawn_step
();
state
=
spawn_step
();
//FIXME Esto es necesario pero feo
//FIXME Esto es necesario pero feo
if
(
mall_conf
->
spawn_method
==
MALL_SPAWN_MERGE
&&
mall
->
myId
>=
mall
->
numC
){
mall
->
zombie
=
1
;
}
if
(
mall_conf
->
spawn_method
==
MALL_SPAWN_MERGE
&&
mall
->
myId
>=
mall
->
numC
){
mall
->
zombie
=
1
;
}
...
...
Codes/malleability/malleabilityManager.h
View file @
023318fb
...
@@ -15,7 +15,7 @@ typedef struct {
...
@@ -15,7 +15,7 @@ typedef struct {
MPI_Comm
comm
;
MPI_Comm
comm
;
}
mam_user_reconf_t
;
}
mam_user_reconf_t
;
int
MAM_Init
(
int
root
,
MPI_Comm
*
comm
,
char
*
name_exec
,
char
*
nodelist
,
int
num_cpus
,
int
num_nodes
,
void
(
*
user_function
)(
void
*
),
void
*
user_args
);
int
MAM_Init
(
int
root
,
MPI_Comm
*
comm
,
char
*
name_exec
,
void
(
*
user_function
)(
void
*
),
void
*
user_args
);
void
MAM_Finalize
();
void
MAM_Finalize
();
int
MAM_Checkpoint
(
int
*
mam_state
,
int
wait_completed
,
void
(
*
user_function
)(
void
*
),
void
*
user_args
);
int
MAM_Checkpoint
(
int
*
mam_state
,
int
wait_completed
,
void
(
*
user_function
)(
void
*
),
void
*
user_args
);
void
MAM_Resume_redistribution
(
int
*
mam_state
);
void
MAM_Resume_redistribution
(
int
*
mam_state
);
...
...
Codes/malleability/malleabilityRMS.c
0 → 100644
View file @
023318fb
#define _GNU_SOURCE
#include "malleabilityRMS.h"
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sched.h>
#if USE_MAL_SLURM
#include <slurm/slurm.h>
int
MAM_I_slurm_getenv_hosts_info
();
int
MAM_I_slurm_getjob_hosts_info
();
#endif
int
MAM_I_get_hosts_info
();
int
GetCPUCount
();
/*
 * Obtains the host information (mall->nodelist, mall->num_nodes and
 * mall->num_cpus) of the current run by trying each discovery method in turn:
 *   1. Slurm environment variables (only if USE_MAL_SLURM is enabled).
 *   2. Slurm job query             (only if USE_MAL_SLURM is enabled).
 *   3. Generic MPI based discovery.
 * A method returns 0 when it has filled the information. Any nodelist left
 * behind by a failed method is released before the next one is tried.
 *
 * If every method fails, the root process prints an error and MPI_Abort is
 * called over mall->comm with error code -50.
 */
void MAM_check_hosts() {
  int pending = 1; // Non-zero while no method has filled the host information

#if USE_MAL_SLURM
  pending = MAM_I_slurm_getenv_hosts_info();
  if(pending) {
    // Discard whatever the failed method may have allocated
    free(mall->nodelist);
    mall->nodelist = NULL;
    pending = MAM_I_slurm_getjob_hosts_info();
  }
#endif

  if(pending) {
    // Discard whatever the failed method may have allocated
    free(mall->nodelist);
    mall->nodelist = NULL;
    pending = MAM_I_get_hosts_info();
  }

  if(pending) {
    // Only the root reports the error, but every process aborts
    if(mall->myId == mall->root) {
      printf("MAM FATAL ERROR: It has not been possible to obtain the nodelist\n");
    }
    fflush(stdout);
    MPI_Abort(mall->comm, -50);
  }

#if USE_MAL_DEBUG >= 2
  if(mall->myId == mall->root) {
    DEBUG_FUNC("Obtained Nodelist", mall->myId, mall->numP);
    printf("NODELIST: %s\nNODE_COUNT: %d NUM_CPUS_PER_NODE: %d\n", mall->nodelist, mall->num_nodes, mall->num_cpus);
    fflush(stdout);
  }
#endif
}
/*
 * Generic host discovery based only on MPI.
 *
 * Every process of mall->comm obtains its processor name and the root gathers
 * all of them. The root then keeps the first appearance of each different
 * name and stores:
 *   - mall->num_nodes:    amount of different processor names.
 *   - mall->num_cpus:     value returned by GetCPUCount() in the root process.
 *   - mall->nodelist:     comma separated list of the different names.
 *   - mall->nodelist_len: length of the nodelist including the terminating
 *                         character, as the Slurm based methods do.
 *
 * This is a collective operation over mall->comm.
 * Only the root process fills the fields described above.
 * NOTE(review): the Slurm based methods fill them in every process; confirm
 * that no other rank needs these values after this method is used.
 *
 * FIXME Does not consider heterogeneous machines for num_cpus
 * FIXME Always returns 0... -- Perform error checking?
 *
 * Returns 0 (information filled).
 */
int MAM_I_get_hosts_info() {
  int i, j, name_len, max_name_len, unique_count, found, *unique_hosts;
  char *my_host, *all_hosts, *confirmed_host, *tested_host, *write_pos;
  size_t host_len;

  all_hosts = NULL;
  unique_hosts = NULL;
  unique_count = 0;

  // Zero initialised so that the padding sent in the gather is well defined
  my_host = (char *) calloc(MPI_MAX_PROCESSOR_NAME, sizeof(char));
  MPI_Get_processor_name(my_host, &name_len);
  // All names are exchanged with the same (maximum) length
  MPI_Allreduce(&name_len, &max_name_len, 1, MPI_INT, MPI_MAX, mall->comm);
  max_name_len++; // Len does not consider terminating character

  if(mall->myId == mall->root) {
    all_hosts = (char *) malloc(mall->numP * max_name_len * sizeof(char));
    unique_hosts = (int *) malloc(mall->numP * sizeof(int));
    unique_hosts[0] = 0; // First host will always be unique
    unique_count = 1;
  }

  MPI_Gather(my_host, max_name_len, MPI_CHAR, all_hosts, max_name_len, MPI_CHAR, mall->root, mall->comm);

  if(mall->myId == mall->root) {
    // A host is new only if it differs from ALL the hosts already confirmed
    for(i = 1; i < mall->numP; i++) {
      tested_host = all_hosts + (i * max_name_len);
      found = 0;
      for(j = 0; j < unique_count && !found; j++) {
        confirmed_host = all_hosts + (unique_hosts[j] * max_name_len);
        found = (strcmp(tested_host, confirmed_host) == 0);
      }
      if(!found) {
        unique_hosts[unique_count] = i;
        unique_count++;
      }
    }

    mall->num_nodes = unique_count;
    mall->num_cpus = GetCPUCount();

    // Each name needs at most max_name_len-1 characters plus one separator
    // (or the terminating character for the last one)
    mall->nodelist = (char *) malloc(unique_count * max_name_len * sizeof(char));
    write_pos = mall->nodelist;
    for(i = 0; i < unique_count; i++) {
      confirmed_host = all_hosts + (unique_hosts[i] * max_name_len);
      host_len = strlen(confirmed_host);
      memcpy(write_pos, confirmed_host, host_len);
      write_pos += host_len;
      if(i < unique_count - 1) {
        *write_pos = ',';
        write_pos++;
      }
    }
    *write_pos = '\0';
    mall->nodelist_len = (int) (write_pos - mall->nodelist) + 1;

    free(all_hosts);
    free(unique_hosts);
  }

  free(my_host);
  return 0;
}
/*
 * @brief Get the total number of CPUs available to the process.
 *
 * This function uses sched_getaffinity to obtain the CPU affinity of the current process
 * and counts every CPU present in the affinity set with CPU_COUNT, so sets that do not
 * start at CPU 0 or that are not contiguous are counted correctly.
 *
 * If the affinity cannot be obtained, the number of online processors reported by
 * sysconf is used instead.
 *
 * @return The total number of CPUs available to the process, or 0 if it could not be determined.
 *
 * Code based on: https://stackoverflow.com/questions/4586405/how-to-get-the-number-of-cpus-in-linux-using-c
 * The code has been modified.
 */
int GetCPUCount() {
  cpu_set_t cs;
  long online_cpus;

  CPU_ZERO(&cs);
  if(sched_getaffinity(0, sizeof(cs), &cs) == 0) {
    return CPU_COUNT(&cs);
  }

  // Affinity not available, fall back to the amount of online processors
  online_cpus = sysconf(_SC_NPROCESSORS_ONLN);
  return online_cpus > 0 ? (int) online_cpus : 0;
}
#if USE_MAL_SLURM
/*
 * Host discovery based on the environment variables exported by Slurm.
 *
 * Fills:
 *   - mall->num_nodes:    value of SLURM_JOB_NUM_NODES.
 *   - mall->nodelist:     copy of SLURM_JOB_NODELIST, stored verbatim.
 *   - mall->nodelist_len: length of the nodelist including the terminating character.
 *   - mall->num_cpus:     CPU count of the last entry of SLURM_JOB_CPUS_PER_NODE.
 *
 * SLURM_JOB_CPUS_PER_NODE is a comma separated list of "cpus" or "cpus(xcount)"
 * entries. It is parsed without modifying the string, as it belongs to the
 * environment of the process.
 *
 * FIXME Does not consider heterogeneous allocations, only the CPU count of the
 * last entry is kept.
 * TODO When MaM considers heterogeneous allocations, store the cpus and count
 * of every entry (one pair per entry) instead of a single num_cpus.
 *
 * Returns 0 if all the information has been filled, 1 otherwise. On failure
 * mall->nodelist may have been allocated already; the caller has to free it.
 */
int MAM_I_slurm_getenv_hosts_info() {
  const char *tmp = NULL, *entry;
  int cpus, count;

  tmp = getenv("SLURM_JOB_NUM_NODES");
  if(tmp == NULL) return 1;
  mall->num_nodes = atoi(tmp);

  tmp = getenv("SLURM_JOB_NODELIST");
  if(tmp == NULL) return 1;
  mall->nodelist_len = strlen(tmp) + 1;
  mall->nodelist = (char *) malloc(mall->nodelist_len * sizeof(char));
  strcpy(mall->nodelist, tmp);

  tmp = getenv("SLURM_JOB_CPUS_PER_NODE");
  if(tmp == NULL) return 1;

  mall->num_cpus = 0;
  entry = tmp;
  while(entry != NULL && *entry != '\0') {
    count = 1; // The count is not present when is 1 node.
    // sscanf stops at the ',' that ends the entry, so the string is not modified
    if(sscanf(entry, "%d(x%d)", &cpus, &count) >= 1) {
      mall->num_cpus = cpus; // CPUs per node of this entry
    }
    // Move to the character after the next separator, if any
    entry = strchr(entry, ',');
    if(entry != NULL) entry++;
  }

  // No valid entry has been found, let the caller try another method
  if(mall->num_cpus < 1) return 1;
  return 0;
}
/*
 * Host discovery based on a query to Slurm about the current job.
 *
 * The job is identified by the environment variable SLURM_JOB_ID and its
 * information is loaded with slurm_load_job. The last record returned is used
 * to fill:
 *   - mall->num_nodes:    field num_nodes of the record.
 *   - mall->num_cpus:     field num_cpus of the record.
 *   - mall->nodelist:     copy of the field nodes of the record.
 *   - mall->nodelist_len: length of the nodelist including the terminating character.
 *
 * NOTE(review): the other discovery methods store a per node CPU count in
 * mall->num_cpus; confirm whether the num_cpus field of the job record has the
 * same meaning.
 * FIXME Does not consider heterogeneous machines
 *
 * Returns 0 if all the information has been filled. Otherwise returns a value
 * different to 0: 1 if the job id or a usable record is not available, or the
 * error returned by slurm_load_job.
 */
int MAM_I_slurm_getjob_hosts_info() {
  int jobId, err;
  char *tmp = NULL;
  job_info_msg_t *j_info = NULL;
  slurm_job_info_t *last_record;

  tmp = getenv("SLURM_JOB_ID");
  if(tmp == NULL) return 1;
  jobId = atoi(tmp);

  err = slurm_load_job(&j_info, jobId, 1);
  if(err) return err;
  if(j_info == NULL) return 1;

  // Without records there is nothing to read from
  if(j_info->record_count < 1) {
    slurm_free_job_info_msg(j_info);
    return 1;
  }

  // Work over the record in place instead of copying the whole structure
  last_record = &(j_info->job_array[j_info->record_count - 1]);
  if(last_record->nodes == NULL) {
    slurm_free_job_info_msg(j_info);
    return 1;
  }

  mall->num_nodes = last_record->num_nodes;
  mall->num_cpus = last_record->num_cpus;
  mall->nodelist_len = strlen(last_record->nodes) + 1;
  mall->nodelist = (char *) malloc(mall->nodelist_len * sizeof(char));
  strcpy(mall->nodelist, last_record->nodes);

  slurm_free_job_info_msg(j_info);
  return 0;
}
#endif
Codes/malleability/malleabilityRMS.h
0 → 100644
View file @
023318fb
#ifndef MALLEABILITY_RMS_H
#define MALLEABILITY_RMS_H
#include <mpi.h>
#include "malleabilityDataStructures.h"
/*
 * Obtains the nodelist, the number of nodes and the number of CPUs of the
 * current run and stores them in the global structure mall. Several
 * discovery methods are tried in order; if none of them succeeds the
 * application is aborted with MPI_Abort.
 */
void
MAM_check_hosts
();
#endif
Codes/malleability/malleabilityStates.h
View file @
023318fb
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
//States
//States
#define MALL_DENIED -1
#define MALL_DENIED -1
enum
mall_inner_states
{
MALL_UNRESERVED
,
MALL_NOT_STARTED
,
MALL_SPAWN_PENDING
,
MALL_SPAWN_SINGLE_PENDING
,
enum
mall_inner_states
{
MALL_UNRESERVED
,
MALL_NOT_STARTED
,
MALL_RMS_COMPLETED
,
MALL_SPAWN_PENDING
,
MALL_SPAWN_SINGLE_PENDING
,
MALL_SPAWN_SINGLE_COMPLETED
,
MALL_SPAWN_ADAPT_POSTPONE
,
MALL_SPAWN_COMPLETED
,
MALL_DIST_PENDING
,
MALL_DIST_COMPLETED
,
MALL_SPAWN_SINGLE_COMPLETED
,
MALL_SPAWN_ADAPT_POSTPONE
,
MALL_SPAWN_COMPLETED
,
MALL_DIST_PENDING
,
MALL_DIST_COMPLETED
,
MALL_SPAWN_ADAPT_PENDING
,
MALL_USER_PENDING
,
MALL_USER_COMPLETED
,
MALL_SPAWN_ADAPTED
,
MALL_COMPLETED
};
MALL_SPAWN_ADAPT_PENDING
,
MALL_USER_PENDING
,
MALL_USER_COMPLETED
,
MALL_SPAWN_ADAPTED
,
MALL_COMPLETED
};
enum
mam_states
{
MAM_UNRESERVED
,
MAM_NOT_STARTED
,
MAM_PENDING
,
MAM_USER_PENDING
,
MAM_COMPLETED
};
enum
mam_states
{
MAM_UNRESERVED
,
MAM_NOT_STARTED
,
MAM_PENDING
,
MAM_USER_PENDING
,
MAM_COMPLETED
};
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment