Proteo: Commit 5cd121d6
Authored Apr 12, 2022 by iker_martin
Hotfix -- Iterations were not being collected correctly in Merge Shrink
parent f4a8b977
Changes 6
Codes/Main/Main.c
...
...
@@ -71,6 +71,7 @@ int main(int argc, char *argv[]) {
  }

  init_group_struct(argv, argc, myId, numP);
  //FIXME Does not work with OpenMPI
  im_child = init_malleability(myId, numP, ROOT, comm, argv[0], nodelist, num_cpus, num_nodes);

  if(!im_child) { //TODO REFACTOR Simplify startup
...
...
@@ -249,7 +250,6 @@ void iterate(double *matrix, int n, int async_comm, int iter) {
    operations = time / Top; //FIXME Compute only once
    for(i=0; i < operations; i++) {
      aux += computePiSerial(n);
    }
...
...
Codes/malleability/ProcessDist.c
...
...
@@ -157,7 +157,7 @@ int check_slurm_comm(int myId, int root, int numP, MPI_Comm *child, MPI_Comm com
  int state = -10;

  //printf("[%d][3] Test min\n", myId); fflush(stdout);
  //pthread_mutex_lock(&spawn_mutex);
  //pthread_mutex_lock(&spawn_mutex); // TODO Uncomment
  MPI_Allreduce(&commState, &state, 1, MPI_INT, MPI_MIN, comm);
  //pthread_mutex_unlock(&spawn_mutex);
...
...
@@ -173,7 +173,7 @@ int check_slurm_comm(int myId, int root, int numP, MPI_Comm *child, MPI_Comm com
  } else if(slurm_data->spawn_is_single) {
    //pthread_mutex_lock(&spawn_mutex);
    //pthread_mutex_lock(&spawn_mutex); // TODO Uncomment
    MPI_Bcast(&commState, 1, MPI_INT, root, comm);
    //pthread_mutex_unlock(&spawn_mutex);
    int threads_not_spawned = pthread_equal(pthread_self(), spawn_thread);
...
Codes/malleability/malleabilityManager.c
...
...
@@ -170,15 +170,20 @@ int malleability_checkpoint() {
  } else if(state == MAL_SPAWN_PENDING || state == MAL_SPAWN_SINGLE_PENDING) { // Check whether the spawn has finished and start the redistribution
      double end_real_time;
      state = check_slurm_comm(mall->myId, mall->root, mall->numP, &(mall->intercomm), mall->comm, mall->thread_comm, &end_real_time);
      if (state == MAL_SPAWN_COMPLETED) {
        mall_conf->results->spawn_time[mall_conf->grp] = MPI_Wtime() - mall_conf->results->spawn_start;
        if(mall_conf->spawn_type == COMM_SPAWN_PTHREAD || mall_conf->spawn_type == COMM_SPAWN_MERGE_PTHREAD) {
          mall_conf->results->spawn_real_time[mall_conf->grp] = end_real_time - mall_conf->results->spawn_start;
        }
        //TODO If it is MERGE SHRINK, use a different data redistribution method
        state = start_redistribution();
      if(mall_conf->spawn_type == COMM_SPAWN_MERGE_PTHREAD && mall->numP > mall->numC) {
        state = shrink_redistribution(); //TODO REFACTOR
      } else {
        state = check_slurm_comm(mall->myId, mall->root, mall->numP, &(mall->intercomm), mall->comm, mall->thread_comm, &end_real_time);
        if (state == MAL_SPAWN_COMPLETED) {
          mall_conf->results->spawn_time[mall_conf->grp] = MPI_Wtime() - mall_conf->results->spawn_start;
          if(mall_conf->spawn_type == COMM_SPAWN_PTHREAD || mall_conf->spawn_type == COMM_SPAWN_MERGE_PTHREAD) {
            mall_conf->results->spawn_real_time[mall_conf->grp] = end_real_time - mall_conf->results->spawn_start;
          }
          //TODO If it is MERGE SHRINK, use a different data redistribution method
          state = start_redistribution();
        }
      }
  } else if(state == MAL_DIST_PENDING) {
...
...
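Note: the checkpoint above can return a pending state several times before the spawn and redistribution finish, so the application presumably keeps running iterations and re-polling it. The driver below is a hypothetical, self-contained sketch of that polling pattern only: fake_checkpoint() and the numeric MAL_* values are stand-ins, since the real caller and the real constants are not part of this diff.

#include <stdio.h>

/* Illustrative values only; the real ones live in malleabilityStates.h. */
#define MAL_SPAWN_PENDING   1
#define MAL_SPAWN_COMPLETED 2
#define MAL_DIST_PENDING    3
#define MAL_DIST_COMPLETED  4

/* Stub standing in for malleability_checkpoint(): pretends the spawn needs
 * a few polls, then one redistribution step, before everything is done. */
static int fake_checkpoint(void) {
    static int calls = 0;
    ++calls;
    if (calls < 3)  return MAL_SPAWN_PENDING;
    if (calls == 3) return MAL_SPAWN_COMPLETED;
    if (calls == 4) return MAL_DIST_PENDING;
    return MAL_DIST_COMPLETED;
}

int main(void) {
    int state, iter = 0;
    do {
        ++iter;                      /* one application iteration runs here     */
        state = fake_checkpoint();   /* then the malleability state is polled   */
        printf("iteration %d -> checkpoint state %d\n", iter, state);
    } while (state != MAL_DIST_COMPLETED);
    return 0;
}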
@@ -641,16 +646,71 @@ int end_redistribution() {
  return result;
}

///=============================================
///=============================================
///=============================================
double time_adapt;
int state_shrink = 0; //TODO Refactor
pthread_t thread_shrink;
MPI_Comm comm_shrink;
int thread_shrink_creation();
void* thread_shrink_work();

/*
 * Creates a thread to run a communication in the background.
 */
int thread_shrink_creation() {
  if(pthread_create(&thread_shrink, NULL, thread_shrink_work, NULL)) {
    printf("Error al crear el hilo\n");
    MPI_Abort(MPI_COMM_WORLD, -1);
    return -1;
  }
  return MAL_SPAWN_PENDING;
}

void* thread_shrink_work() {
  proc_adapt_shrink(mall->numC, &comm_shrink, mall->myId);
  state_shrink = 2;
  pthread_exit(NULL);
}
///=============================================
///=============================================
///=============================================

int shrink_redistribution() {
    double time_adapt = MPI_Wtime();
    int global_state;

    MPI_Comm aux_comm;
    MPI_Comm_dup(mall->comm, &aux_comm);

    proc_adapt_shrink(mall->numC, &(mall->comm), mall->myId);

    if(mall_conf->spawn_type == COMM_SPAWN_MERGE_PTHREAD) {
      if(state_shrink == 0) {
        time_adapt = MPI_Wtime();
        state_shrink = 1;
        MPI_Comm_dup(mall->comm, &comm_shrink);
        thread_shrink_creation();
        return MAL_SPAWN_PENDING;
      } else if(state_shrink > 0) {
        MPI_Allreduce(&state_shrink, &global_state, 1, MPI_INT, MPI_MIN, mall->comm);
        if(global_state < 2) return MAL_SPAWN_PENDING;

        if(pthread_join(thread_shrink, NULL)) {
          printf("Error al esperar al hilo\n");
          MPI_Abort(MPI_COMM_WORLD, -1);
          return -10;
        }
        MPI_Comm_dup(mall->comm, &aux_comm);
        mall->comm = comm_shrink;
      }
    } else {
      time_adapt = MPI_Wtime();
      MPI_Comm_dup(mall->comm, &aux_comm);
      proc_adapt_shrink(mall->numC, &(mall->comm), mall->myId);
    }

    //TODO REFACTOR -- Only the collect-iterations call should stay outside the threads
    zombies_collect_suspended(aux_comm, mall->myId, mall->numP, mall->numC, mall->root, (void *) mall_conf->results, mall->user_comm);
    MPI_Comm_free(&aux_comm);

    if(mall->myId < mall->numC) {
      MPI_Comm_free(&aux_comm);
      MPI_Comm_dup(mall->comm, &aux_comm);
      mall->thread_comm = aux_comm;
      MPI_Comm_dup(mall->comm, &aux_comm);
...
...
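The MERGE_PTHREAD branch of shrink_redistribution() relies on a completion handshake: each rank's helper thread sets a local flag to 2 when proc_adapt_shrink() finishes, the flag is reduced with MPI_MIN across the group, and only when every rank reports 2 is the thread joined and the shrunken communicator adopted. The standalone sketch below illustrates that pattern only; it is not the project's code, sleep() merely stands in for proc_adapt_shrink(), and an atomic flag replaces the plain int used in the original.

#include <mpi.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static atomic_int state_local = 0;     /* 0 = not started, 1 = running, 2 = done */

static void *background_work(void *arg) {
    (void)arg;
    sleep(1);                          /* placeholder for proc_adapt_shrink()    */
    atomic_store(&state_local, 2);     /* signal completion to the main thread   */
    pthread_exit(NULL);
}

int main(int argc, char **argv) {
    int rank, provided, snapshot, global_state;
    pthread_t worker;

    MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    atomic_store(&state_local, 1);
    if (pthread_create(&worker, NULL, background_work, NULL)) {
        printf("Error creating the thread\n");
        MPI_Abort(MPI_COMM_WORLD, -1);
    }

    /* Collective poll: MPI_MIN over the per-rank flags reaches 2 only once
     * every rank has finished its background work, as in shrink_redistribution(). */
    do {
        snapshot = atomic_load(&state_local);
        MPI_Allreduce(&snapshot, &global_state, 1, MPI_INT, MPI_MIN,
                      MPI_COMM_WORLD);
    } while (global_state < 2);

    pthread_join(worker, NULL);
    if (rank == 0) printf("all ranks completed the background shrink step\n");

    MPI_Finalize();
    return 0;
}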
Codes/malleability/malleabilityStates.h
...
...
@@ -19,14 +19,18 @@
#define COMM_PHY_NODES 1
#define COMM_PHY_CPU 2
// TODO Separate PTHREAD
// SPAWN METHODS
#define COMM_SPAWN_SERIAL 0
#define COMM_SPAWN_PTHREAD 1
#define COMM_SPAWN_MERGE 2
#define COMM_SPAWN_MERGE_PTHREAD 3
//#define COMM_SPAWN_BASELINE 0
//#define COMM_SPAWN_MERGE 1
//SPAWN STRATEGIES
#define COMM_SPAWN_MULTIPLE 0
#define COMM_SPAWN_SINGLE 1
//#define COMM_SPAWN_SERIAL 0
//#define COMM_SPAWN_PTHREAD 1
#define MAL_USE_NORMAL 0
#define MAL_USE_IBARRIER 1
...
...
Codes/recordMachinefile.sh
...
...
@@ -15,3 +15,4 @@ elif [ $dist == "cpu" ]; then
fi

$dir/Recordnodelist.o $numP $dist
echo $numP
Codes/runBase.sh
#!/bin/bash
#SBATCH -p P1
#SBATCH -N 1
#SBATCH --exclude=c01,c00,c02
...
...
@@ -17,6 +18,7 @@ module load mpich-3.4.1-noucx
numP=$(bash recordMachinefile.sh $1)

mpirun -print-all-exitcodes -f hostfile.o$SLURM_JOB_ID $dir$codeDir/a.out $1 $2 $nodelist $nodes
#mpirun -np $numP $dir$codeDir/a.out $1 $2 $nodelist $nodes

rm hostfile.o$SLURM_JOB_ID

echo "END RUN"
...
...