Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Iker Martín Álvarez
Proteo
Commits
62af4ef7
Commit
62af4ef7
authored
Oct 16, 2024
by
iker_martin
Browse files
WIP 1/3 - Bugfix for jobids higher than 1000.
parent
18499f94
Changes
3
Hide whitespace changes
Inline
Side-by-side
Codes/MaM/spawn_methods/PortService.c
View file @
62af4ef7
...
@@ -46,17 +46,15 @@ void open_port(Spawn_ports *spawn_port, int open_port, int open_service)
...
@@ -46,17 +46,15 @@ void open_port(Spawn_ports *spawn_port, int open_port, int open_service)
if
(
spawn_port
->
port_name
!=
NULL
)
if
(
spawn_port
->
port_name
!=
NULL
)
return
;
return
;
if
(
open_port
)
if
(
open_port
)
{
{
spawn_port
->
opened_port
=
1
;
spawn_port
->
opened_port
=
1
;
spawn_port
->
port_name
=
(
char
*
)
malloc
(
MPI_MAX_PORT_NAME
*
sizeof
(
char
));
spawn_port
->
port_name
=
(
char
*
)
malloc
(
MPI_MAX_PORT_NAME
*
sizeof
(
char
));
MPI_Open_port
(
MPI_INFO_NULL
,
spawn_port
->
port_name
);
MPI_Open_port
(
MPI_INFO_NULL
,
spawn_port
->
port_name
);
if
(
open_service
!=
MAM_SERVICE_UNNEEDED
)
if
(
open_service
!=
MAM_SERVICE_UNNEEDED
)
{
{
spawn_port
->
service_name
=
(
char
*
)
malloc
((
MAM_SERVICE_NAME_SIZE
)
*
sizeof
(
char
));
spawn_port
->
service_name
=
(
char
*
)
malloc
((
MAM_SERVICE_NAME_SIZE
)
*
sizeof
(
char
));
#if MAM_USE_SLURM
#if MAM_USE_SLURM
char
*
tmp
=
getenv
(
"SLURM_JOB_ID"
);
char
*
tmp
=
getenv
(
"SLURM_JOB_ID"
);
if
(
tmp
!=
NULL
)
{
job_id
=
atoi
(
tmp
);
}
if
(
tmp
!=
NULL
)
{
job_id
=
atoi
(
tmp
)
%
1000
;
}
#endif
#endif
snprintf
(
spawn_port
->
service_name
,
MAM_SERVICE_NAME_SIZE
,
"mam_service_jid%04d_gr%03d"
,
job_id
,
open_service
);
snprintf
(
spawn_port
->
service_name
,
MAM_SERVICE_NAME_SIZE
,
"mam_service_jid%04d_gr%03d"
,
job_id
,
open_service
);
MPI_Publish_name
(
spawn_port
->
service_name
,
MPI_INFO_NULL
,
spawn_port
->
port_name
);
MPI_Publish_name
(
spawn_port
->
service_name
,
MPI_INFO_NULL
,
spawn_port
->
port_name
);
...
@@ -131,7 +129,7 @@ void discover_remote_port(int id_group, Spawn_ports *spawn_port) {
...
@@ -131,7 +129,7 @@ void discover_remote_port(int id_group, Spawn_ports *spawn_port) {
spawn_port
->
remote_service
=
(
char
*
)
malloc
(
MAM_SERVICE_NAME_SIZE
*
sizeof
(
char
));
spawn_port
->
remote_service
=
(
char
*
)
malloc
(
MAM_SERVICE_NAME_SIZE
*
sizeof
(
char
));
#if MAM_USE_SLURM
#if MAM_USE_SLURM
char
*
tmp
=
getenv
(
"SLURM_JOB_ID"
);
char
*
tmp
=
getenv
(
"SLURM_JOB_ID"
);
if
(
tmp
!=
NULL
)
{
job_id
=
atoi
(
tmp
);
}
if
(
tmp
!=
NULL
)
{
job_id
=
atoi
(
tmp
)
%
1000
;
}
#endif
#endif
snprintf
(
spawn_port
->
remote_service
,
MAM_SERVICE_NAME_SIZE
,
"mam_service_jid%04d_gr%03d"
,
job_id
,
id_group
);
snprintf
(
spawn_port
->
remote_service
,
MAM_SERVICE_NAME_SIZE
,
"mam_service_jid%04d_gr%03d"
,
job_id
,
id_group
);
}
else
{
// For subsequent lookups, only update the variable part (group ID) of the service name.
}
else
{
// For subsequent lookups, only update the variable part (group ID) of the service name.
...
...
Codes/MaM/spawn_methods/ProcessDist.c
View file @
62af4ef7
...
@@ -6,6 +6,7 @@
...
@@ -6,6 +6,7 @@
#include <string.h>
#include <string.h>
#include <mpi.h>
#include <mpi.h>
#include "ProcessDist.h"
#include "ProcessDist.h"
#include "SpawnUtils.h"
#include "../MAM_Constants.h"
#include "../MAM_Constants.h"
#include "../MAM_DataStructures.h"
#include "../MAM_DataStructures.h"
...
@@ -23,7 +24,6 @@
...
@@ -23,7 +24,6 @@
void
node_dist
(
Spawn_data
spawn_data
,
int
**
qty
,
int
*
used_nodes
,
int
*
total_spawns
);
void
node_dist
(
Spawn_data
spawn_data
,
int
**
qty
,
int
*
used_nodes
,
int
*
total_spawns
);
void
spread_dist
(
Spawn_data
spawn_data
,
int
*
used_nodes
,
int
*
procs
);
void
spread_dist
(
Spawn_data
spawn_data
,
int
*
used_nodes
,
int
*
procs
);
void
compact_dist
(
Spawn_data
spawn_data
,
int
*
used_nodes
,
int
*
procs
);
void
compact_dist
(
Spawn_data
spawn_data
,
int
*
used_nodes
,
int
*
procs
);
void
set_spawn_cmd
(
Spawn_data
*
spawn_data
);
void
generate_info_string
(
char
*
nodelist
,
int
*
procs_array
,
size_t
nodes
,
Spawn_data
*
spawn_data
);
void
generate_info_string
(
char
*
nodelist
,
int
*
procs_array
,
size_t
nodes
,
Spawn_data
*
spawn_data
);
void
generate_multiple_info_string
(
char
*
nodelist
,
int
*
procs_array
,
size_t
nodes
,
Spawn_data
*
spawn_data
);
void
generate_multiple_info_string
(
char
*
nodelist
,
int
*
procs_array
,
size_t
nodes
,
Spawn_data
*
spawn_data
);
...
@@ -63,7 +63,7 @@ void processes_dist(Spawn_data *spawn_data) {
...
@@ -63,7 +63,7 @@ void processes_dist(Spawn_data *spawn_data) {
#if MAM_USE_SLURM
#if MAM_USE_SLURM
switch
(
spawn_data
->
mapping_fill_method
)
{
switch
(
spawn_data
->
mapping_fill_method
)
{
case
MAM_PHY_TYPE_STRING
:
case
MAM_PHY_TYPE_STRING
:
if
(
spawn_data
->
spawn_is_multiple
)
{
if
(
spawn_data
->
spawn_is_multiple
||
spawn_data
->
spawn_is_parallel
)
{
generate_multiple_info_string_slurm
(
mall
->
nodelist
,
procs_array
,
used_nodes
,
spawn_data
);
generate_multiple_info_string_slurm
(
mall
->
nodelist
,
procs_array
,
used_nodes
,
spawn_data
);
}
else
{
}
else
{
generate_info_string_slurm
(
mall
->
nodelist
,
procs_array
,
used_nodes
,
spawn_data
);
generate_info_string_slurm
(
mall
->
nodelist
,
procs_array
,
used_nodes
,
spawn_data
);
...
@@ -74,16 +74,55 @@ void processes_dist(Spawn_data *spawn_data) {
...
@@ -74,16 +74,55 @@ void processes_dist(Spawn_data *spawn_data) {
break
;
break
;
}
}
#else
#else
if
(
spawn_data
->
spawn_is_multiple
)
{
if
(
spawn_data
->
spawn_is_multiple
||
spawn_data
->
spawn_is_parallel
)
{
generate_multiple_info_string
(
mall
->
nodelist
,
procs_array
,
used_nodes
,
spawn_data
);
generate_multiple_info_string
(
mall
->
nodelist
,
procs_array
,
used_nodes
,
spawn_data
);
}
else
{
}
else
{
generate_info_string
(
mall
->
nodelist
,
procs_array
,
used_nodes
,
spawn_data
);
generate_info_string
(
mall
->
nodelist
,
procs_array
,
used_nodes
,
spawn_data
);
}
}
#endif
#endif
set_spawn_cmd
(
spawn_data
);
char
*
aux_cmd
=
get_spawn_cmd
();
for
(
int
index
=
0
;
index
<
spawn_data
->
total_spawns
;
index
++
)
{
spawn_data
->
sets
[
index
].
cmd
=
aux_cmd
;
}
free
(
procs_array
);
free
(
procs_array
);
}
}
/*
 * Writes a hostfile name into the buffer referenced by "*file_name".
 *
 * - file_name: in/out. When it points to NULL, a buffer of
 *              MAM_HOSTFILE_SIZE chars is allocated and stored there.
 * - n:         in/out flag. While it is 0 the complete name is written;
 *              otherwise only the trailing "<index><NAME3>" part is
 *              rewritten. It is always set to 1 before returning.
 * - jid:       job identifier; only "jid % 1000" is embedded in the name.
 * - index:     numeric identifier printed with three digits.
 *
 * NOTE(review): MAM_HOSTFILE_SIZE1 is presumably the offset at which the
 * index field starts inside the full name -- confirm against the
 * definitions of the MAM_HOSTFILE_* constants.
 */
void set_hostfile_name(char **file_name, int *n, int jid, int index) {
  // Lazily create the buffer the first time the caller hands over a NULL pointer.
  if (*file_name == NULL) {
    *file_name = (char *) malloc(MAM_HOSTFILE_SIZE * sizeof(char));
  }

  if (*n != 0) {
    // The prefix is already in place: overwrite only the index and the final part.
    char *tail = (*file_name) + MAM_HOSTFILE_SIZE1;
    snprintf(tail, MAM_HOSTFILE_SIZE2, "%03d%s", index, MAM_HOSTFILE_NAME3);
  } else {
    // First use: compose the whole name with the shortened job id.
    int short_jid = jid % 1000;
    snprintf(*file_name, MAM_HOSTFILE_SIZE, "%s%04d%s%03d%s",
             MAM_HOSTFILE_NAME1, short_jid, MAM_HOSTFILE_NAME2, index, MAM_HOSTFILE_NAME3);
  }

  *n = 1;
}
/*
 * Reads the hostfile "file_name" and accumulates in "*qty" the numbers
 * found after the last ':' of every line (parsed with atoi).
 * Lines without a ':' delimiter contribute nothing.
 *
 * - file_name: path of the hostfile to read.
 * - qty:       out. Reset to 0 and then incremented per line.
 *
 * If the file cannot be opened or the line buffer cannot be allocated,
 * an error is printed and MPI_Abort is called on MPI_COMM_WORLD.
 *
 * Returns 0 on success. The file handle and the temporary line buffer
 * are always released before returning.
 */
int read_hostfile_procs(char *file_name, int *qty) {
  char *line = NULL, *ptr;
  FILE *file = NULL;

  file = fopen(file_name, "r");
  if(file == NULL) {
    perror("Could not open hostfile to read");
    MPI_Abort(MPI_COMM_WORLD, -1);
    return -1; // MPI_Abort is not expected to return; never touch a NULL stream
  }

  *qty = 0;
  line = (char *) malloc(MAM_HOSTFILE_LINE_SIZE * sizeof(char));
  if(line == NULL) {
    perror("Could not allocate memory to read hostfile");
    fclose(file);
    MPI_Abort(MPI_COMM_WORLD, -1);
    return -1;
  }

  while (fgets(line, MAM_HOSTFILE_LINE_SIZE, file) != NULL) {
    size_t len = strlen(line);
    if(len == 0) continue; // Avoid computing a pointer before the start of the buffer
    ptr = line + len - 1;

    // Search delimiter, scanning backwards from the end of the line
    while (ptr != line && *ptr != ':') { ptr--; }

    if (*ptr == ':') { *qty += atoi(ptr + 1); }
  }

  // Release the resources acquired by this function
  free(line);
  fclose(file);
  return 0;
}
//--------------PRIVATE FUNCTIONS---------------//
//--------------PRIVATE FUNCTIONS---------------//
//-----------------DISTRIBUTION-----------------//
//-----------------DISTRIBUTION-----------------//
...
@@ -117,7 +156,7 @@ void node_dist(Spawn_data spawn_data, int **qty, int *used_nodes, int *total_spa
...
@@ -117,7 +156,7 @@ void node_dist(Spawn_data spawn_data, int **qty, int *used_nodes, int *total_spa
*
qty
=
calloc
(
*
used_nodes
,
sizeof
(
int
));
// Numero de procesos por nodo
*
qty
=
calloc
(
*
used_nodes
,
sizeof
(
int
));
// Numero de procesos por nodo
// if(MAM_Contains_strat(MAM_SPAWN_STRATEGIES, MAM_STRAT_SPAWN_MULTIPLE, NULL) ) {
// if(MAM_Contains_strat(MAM_SPAWN_STRATEGIES, MAM_STRAT_SPAWN_MULTIPLE, NULL) ) {
if
(
spawn_data
.
spawn_is_multiple
)
{
if
(
spawn_data
.
spawn_is_multiple
||
spawn_data
.
spawn_is_parallel
)
{
for
(
i
=
0
;
i
<
*
used_nodes
;
i
++
)
{
for
(
i
=
0
;
i
<
*
used_nodes
;
i
++
)
{
(
*
qty
)[
i
]
=
procs
[
i
];
(
*
qty
)[
i
]
=
procs
[
i
];
if
(
procs
[
i
])
(
*
total_spawns
)
++
;
if
(
procs
[
i
])
(
*
total_spawns
)
++
;
...
@@ -199,35 +238,6 @@ void compact_dist(Spawn_data spawn_data, int *used_nodes, int *procs) {
...
@@ -199,35 +238,6 @@ void compact_dist(Spawn_data spawn_data, int *used_nodes, int *procs) {
if
(
*
used_nodes
>
mall
->
num_nodes
)
*
used_nodes
=
mall
->
num_nodes
;
//FIXME Si ocurre esto no es un error?
if
(
*
used_nodes
>
mall
->
num_nodes
)
*
used_nodes
=
mall
->
num_nodes
;
//FIXME Si ocurre esto no es un error?
}
}
//--------------PRIVATE FUNCTIONS---------------//
//-------------------CMD SET--------------------//
/*
 * Selects the command that has to be launched when performing the
 * spawn and stores it in every set of "spawn_data". All the sets
 * run the same command.
 *
 * The command depends on "mall_conf->external_usage": the Valgrind
 * script, the Extrae script or, in any other case, the executable
 * name stored in "mall->name_exec".
 */
void set_spawn_cmd(Spawn_data *spawn_data) {
  char *chosen_cmd;

  if (mall_conf->external_usage == MAM_USE_VALGRIND) {
    chosen_cmd = MAM_VALGRIND_SCRIPT;
  } else if (mall_conf->external_usage == MAM_USE_EXTRAE) {
    chosen_cmd = MAM_EXTRAE_SCRIPT;
  } else {
    chosen_cmd = mall->name_exec;
  }

  // Every set shares the same pointer; no copy of the string is made
  for (int set_id = 0; set_id < spawn_data->total_spawns; set_id++) {
    spawn_data->sets[set_id].cmd = chosen_cmd;
  }
}
//--------------PRIVATE FUNCTIONS---------------//
//--------------PRIVATE FUNCTIONS---------------//
//-------------------INFO SET-------------------//
//-------------------INFO SET-------------------//
...
@@ -465,14 +475,14 @@ void generate_info_hostfile_slurm(char *nodelist, int *qty, size_t used_nodes, S
...
@@ -465,14 +475,14 @@ void generate_info_hostfile_slurm(char *nodelist, int *qty, size_t used_nodes, S
hostlist_t
hostlist
;
hostlist_t
hostlist
;
char
*
tmp
=
getenv
(
"SLURM_JOB_ID"
);
char
*
tmp
=
getenv
(
"SLURM_JOB_ID"
);
jid
=
tmp
!=
NULL
?
atoi
(
tmp
)
:
0
;
jid
=
tmp
!=
NULL
?
(
atoi
(
tmp
)
%
1000
)
:
0
;
line
=
NULL
;
line
=
NULL
;
hostlist
=
slurm_hostlist_create
(
nodelist
);
hostlist
=
slurm_hostlist_create
(
nodelist
);
hostfile_name
=
(
char
*
)
malloc
(
MAM_HOSTFILE_SIZE
*
sizeof
(
char
));
hostfile_name
=
(
char
*
)
malloc
(
MAM_HOSTFILE_SIZE
*
sizeof
(
char
));
snprintf
(
hostfile_name
,
MAM_HOSTFILE_SIZE
,
"%s%04d%s%03d%s"
,
MAM_HOSTFILE_NAME1
,
jid
,
MAM_HOSTFILE_NAME2
,
index
,
MAM_HOSTFILE_NAME3
);
snprintf
(
hostfile_name
,
MAM_HOSTFILE_SIZE
,
"%s%04d%s%03d%s"
,
MAM_HOSTFILE_NAME1
,
jid
,
MAM_HOSTFILE_NAME2
,
index
,
MAM_HOSTFILE_NAME3
);
if
(
spawn_data
->
spawn_is_multiple
)
{
// MULTIPLE
if
(
spawn_data
->
spawn_is_multiple
||
spawn_data
->
spawn_is_parallel
)
{
// MULTIPLE
for
(;
index
<
spawn_data
->
total_spawns
;
index
++
)
{
for
(;
index
<
spawn_data
->
total_spawns
;
index
++
)
{
// This strat creates 1 hostfile per spawn
// This strat creates 1 hostfile per spawn
qty_index
=
fill_multiple_hostfile_slurm
(
hostfile_name
,
qty
+
qty_index
,
&
hostlist
,
&
line
,
&
len_line
);
qty_index
=
fill_multiple_hostfile_slurm
(
hostfile_name
,
qty
+
qty_index
,
&
hostlist
,
&
line
,
&
len_line
);
...
...
Codes/MaM/spawn_methods/ProcessDist.h
View file @
62af4ef7
...
@@ -4,5 +4,7 @@
...
@@ -4,5 +4,7 @@
#include "Spawn_DataStructure.h"
#include "Spawn_DataStructure.h"
void
processes_dist
(
Spawn_data
*
spawn_data
);
void
processes_dist
(
Spawn_data
*
spawn_data
);
void
set_hostfile_name
(
char
**
file_name
,
int
*
n
,
int
jid
,
int
index
);
int
read_hostfile_procs
(
char
*
file_name
,
int
*
qty
);
#endif
#endif
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment