Cambio en recogida de tiempo de iteración; ahora se recoge el máximo entre...

Cambio en recogida de tiempo de iteración; ahora se recoge el máximo entre todos los procesos. Hotfix en CheckRun. Se han añadido comentarios para quitar uso de Slurm

Cambio en recogida de tiempo de iteración; ahora se recoge el máximo entre...
Cambio en recogida de tiempo de iteración; ahora se recoge el máximo entre todos los procesos. Hotfix en CheckRun. Se han añadido comentarios para quitar uso de Slurm
14574ff0 · iker_martin · e0864c44 · 14574ff0 · 14574ff0 · 14574ff0
Commit 14574ff0 authored Feb 10, 2022 by iker_martin
--- a/Codes/IOcodes/results.c
+++ b/Codes/IOcodes/results.c
@@ -121,6 +121,20 @@ void reset_results_index(results_data *results) {
 }
+/*
+ * For each iteration, obtains the maximum time among all the processes
+ * that took part in it.
+ *
+ * The maximum is needed because it is the value that represents the real
+ * time that was spent.
+ *
+ * Implemented as a single MPI_Reduce with MPI_MAX over the first
+ * results->iter_index entries of results->iters_time:
+ *  - The process whose myId equals root passes MPI_IN_PLACE, so its own
+ *    results->iters_time array is both its contribution and the place
+ *    where the reduced maxima are stored.
+ *  - Every other process contributes results->iters_time and passes a
+ *    NULL receive buffer; its local array is not modified by this call.
+ *
+ * MPI_Reduce is a collective operation, so every process of comm has to
+ * reach this call. NOTE(review): all callers are assumed to hold the same
+ * results->iter_index value -- confirm against the call sites.
+ */
+void compute_results_iter(results_data *results, int myId, int root, MPI_Comm comm) {
+  if(myId == root)
+    MPI_Reduce(MPI_IN_PLACE, results->iters_time, results->iter_index, MPI_DOUBLE, MPI_MAX, root, comm);
+  else
+    MPI_Reduce(results->iters_time, NULL, results->iter_index, MPI_DOUBLE, MPI_MAX, root, comm);
+}
 //======================================================||
 //======================================================||
 //===============PRINT RESULTS FUNCTIONS================||

--- a/Codes/IOcodes/results.h
+++ b/Codes/IOcodes/results.h
@@ -23,6 +23,8 @@ void recv_results(results_data *results, int root, int resizes, MPI_Comm interco
 void set_results_post_reconfig(results_data *results, int grp, int sdr, int adr);
 void reset_results_index(results_data *results);
+void compute_results_iter(results_data *results, int myId, int root, MPI_Comm comm);
 void print_iter_results(results_data results, int last_normal_iter_index);
 void print_global_results(results_data results, int resizes);
 void init_results_data(results_data *results, int resizes, int iters_size);

--- a/Codes/Main/Main.c
+++ b/Codes/Main/Main.c
@@ -267,6 +267,7 @@ int print_local_results() {
  int ptr_local, ptr_out, err;
  char *file_name;
+  compute_results_iter(results, group->myId, ROOT, comm);
  if(group->myId == ROOT) {
    ptr_out = dup(1);
@@ -278,7 +279,7 @@ int print_local_results() {
    create_out_file(file_name, &ptr_local, 1);
    print_config_group(config_file, group->grp);
-    print_iter_results(*results, config_file->iters[group->grp] -1);
+    print_iter_results(*results, config_file->iters[group->grp] - 1);
    free(file_name);
    fflush(stdout);

--- a/Codes/malleability/BaseCode.c
+++ b/Codes/malleability/BaseCode.c
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <mpi.h>
-#include <pthread.h>
-#include <math.h>
-#include <string.h>
-#include <slurm/slurm.h>
-#include "ProcessDist.h"
-/*
- * ESTE CODIGO ES PARA COMPROBAR EL FUNCIONAMIENTO DEL FICHERO ProcessDist.h
- * NO TIENE QUE VER CON EL BENCHMARK DE MALEABILIDAD
- */
-#define ROOT 0
-#define MAXGRP 3
-#define TYPE_D 1
-// 1 Es nodos
-// 2 Es por nucleos
-// Creates a file named GxNPyIDz.o{jobId} (x = grp, y = numP, z = myId), opened in append mode.
-// The calling process ends up with that file as its standard output (fd 1 is closed, then the new fd is duplicated).
-// Returns 0 on success or a negative code (-1 to -4) for the failed step. NOTE(review): file_name is never freed.
-int create_out_file(int myId, int numP, int grp, char *jobId);
-int create_out_file(int myId, int numP, int grp, char *jobId) {
-  int ptr, err;
-  char *file_name;
-  file_name = NULL;
-  file_name = malloc(40 * sizeof(char));
-  if(file_name == NULL) return -1; // The memory could not be allocated
-  err = snprintf(file_name, 40, "G%dNP%dID%d.o%s", grp, numP, myId, jobId);
-  if(err < 0) return -2; // The file name could not be built
-  ptr = open(file_name, O_WRONLY | O_CREAT | O_APPEND, 0644);
-  if(ptr < 0) return -3; // The file could not be created
-  err = close(1);
-  if(err < 0) return -4; // Standard output could not be changed
-  err = dup(ptr);
-  if(err < 0) return -4; // Standard output could not be changed
-  return 0;
-}
-// Runs several bandwidth tests (loop_count = 100 rounds)
-// by sending n chars to each odd-ranked process from the
-// even-ranked process immediately before it. The odd process
-// then sends the n chars back to the even process.
-//
-// After the tests, each even process prints the bandwidth, the
-// total time needed to run all the tests and
-// finally the mean time per test with the given latency subtracted.
-void bandwidth(int myId, double latency, int n);
-void bandwidth(int myId, double latency, int n) {
-  int i, loop_count = 100, n_bytes;
-  double start_time, stop_time, elapsed_time, bw, time;
-  char *aux;
-  n_bytes = n * sizeof(char);
-  aux = malloc(n_bytes);
-  elapsed_time = 0;
-  for(i=0; i<loop_count; i++){
-    MPI_Barrier(MPI_COMM_WORLD);
-    start_time = MPI_Wtime();
-    if(myId %2 == 0){
-      MPI_Ssend(aux, n, MPI_CHAR, myId+1, 99, MPI_COMM_WORLD);
-      MPI_Recv(aux, n, MPI_CHAR, myId+1, 99, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-    }
-    else if(myId %2 == 1){
-      MPI_Recv(aux, n, MPI_CHAR, myId-1, 99, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-      MPI_Ssend(aux, n, MPI_CHAR, myId-1, 99, MPI_COMM_WORLD);
-    }
-    MPI_Barrier(MPI_COMM_WORLD);
-    stop_time = MPI_Wtime();
-    elapsed_time += stop_time - start_time;
-  }
-  if(myId %2 == 0) {
-    time = elapsed_time / loop_count - latency;
-    bw = ((double)n_bytes * 2) / time;
-    printf("MyId %d Bw=%lf GB/s\nTot time=%lf\nTime=%lf\n", myId, bw/ 1000000000.0, elapsed_time, time);
-  }
-}
-// Runs several latency tests (loop_count = 100 rounds) by
-// sending a single MPI_CHAR value to each odd-ranked process
-// from the even-ranked process immediately before it. The odd
-// process then sends the value back to the even process.
-//
-// After the tests, when start != 0, each even process prints the time needed to run
-// ALL the tests and divides it by loop_count. The value held by ROOT is then broadcast
-// and returned by every process (the mean time per test, i.e. the latency, when start != 0).
-double ping_pong(int myId, int start);
-double ping_pong(int myId, int start) {
-  int i, loop_count = 100;
-  double start_time, stop_time, elapsed_time;
-  char aux;
-  aux = '0';
-  elapsed_time = 0;
-  for(i=0; i<loop_count; i++){
-    MPI_Barrier(MPI_COMM_WORLD);
-    start_time = MPI_Wtime();
-    if(myId % 2 == 0){
-      MPI_Ssend(&aux, 1, MPI_CHAR, myId+1, 99, MPI_COMM_WORLD);
-      MPI_Recv(&aux, 1, MPI_CHAR, myId+1, 99, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-    }
-    else if(myId % 2 == 1){
-      MPI_Recv(&aux, 1, MPI_CHAR, myId-1, 99, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-      MPI_Ssend(&aux, 1, MPI_CHAR, myId-1, 99, MPI_COMM_WORLD);
-    }
-    MPI_Barrier(MPI_COMM_WORLD);
-    stop_time = MPI_Wtime();
-    elapsed_time += stop_time - start_time;
-  }
-  if(myId %2 == 0 && start != 0) {
-    printf("MyId %d Ping=%lf\n", myId, elapsed_time);
-    elapsed_time/=loop_count;
-  }
-  MPI_Bcast(&elapsed_time, 1, MPI_DOUBLE, ROOT, MPI_COMM_WORLD);
-  return elapsed_time;
-}
-// Work common to all the process groups; returns the group number (grp) of the calling process
-int work(int myId, int numP, char **argv, char *job_id) {
-  int grp, n_value, aux=0;
-  double latency;
-  MPI_Comm comm = MPI_COMM_NULL, comm_par= MPI_COMM_NULL;
-  int rootBcast = MPI_PROC_NULL;
-  if(myId == ROOT) rootBcast = MPI_ROOT;
-  //     1.000.000.00 1GB
-  n_value = 400000000;
-  grp = 0;
-  // Get from the parents which process group this one is
-  MPI_Comm_get_parent(&comm_par);
-  if(comm_par != MPI_COMM_NULL) {
-    MPI_Bcast(&grp, 1, MPI_INT, ROOT, comm_par);
-    grp+=1;
-    MPI_Barrier(comm_par);
-    MPI_Bcast(&aux, 1, MPI_INT, rootBcast, comm_par);
-    //MPI_Comm_free(&comm_par);
-    MPI_Comm_disconnect(&comm_par);
-  }
-  // Split the results per process
-  //create_out_file(myId, numP, grp, job_id);
-  /*----- PERFORMANCE TESTS -----*/
-  // Make sure that MPI communication has been initialized
-  ping_pong(myId, 0);
-  MPI_Barrier(MPI_COMM_WORLD);
-  // Get the network latency
-  latency = ping_pong(myId, 1);
-  // Get the bandwidth
-  bandwidth(myId, latency, n_value);
-  /*----- PROCESS CREATION -----*/
-  // Creation of a new process group
-  // To prevent more groups from being created, the value 0
-  // has to be assigned to MAXGRP
-  if(grp != MAXGRP) {
-    // Initialize the communication with SLURM
-    int aux = numP;
-    init_slurm_comm(argv, myId, aux, ROOT, TYPE_D, COMM_SPAWN_SERIAL);
-    // Wait until the communication and the process creation
-    // have finished
-    int test = -1;
-    while(test != MPI_SUCCESS) {
-      test = check_slurm_comm(myId, ROOT, MPI_COMM_WORLD, &comm);
-    }
-    // Tell the children which process group they are
-    MPI_Bcast(&grp, 1, MPI_INT, rootBcast, comm);
-    MPI_Barrier(comm);
-    MPI_Bcast(&aux, 1, MPI_INT, ROOT, comm);
-    // Disconnect the intercommunicator with the children
-    MPI_Comm_disconnect(&comm);
-    //MPI_Comm_free(&comm);
-  } //IF GRP
-  if(comm != MPI_COMM_NULL || comm_par != MPI_COMM_NULL) {
-    printf("GRP=%d || El comunicador no esta a NULO\n", grp);
-    fflush(stdout);
-  }
-  return grp;
-}
-int main(int argc, char ** argv) { // Test driver: prints rank, node name and PID of each process, then runs work()
-  int rank, numP, grp, len, pid;
-  char *tmp;
-  MPI_Init(&argc, &argv);
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-  MPI_Comm_size(MPI_COMM_WORLD, &numP);  
-  pid = getpid();
-  // Print data about the communicator of
-  // this process group
-  tmp = getenv("SLURM_JOB_ID");
-  if(rank == ROOT) {
-    //system("printenv"); // Prints every environment variable
-    printf("DATA\n");
-    //print_Info(MPI_COMM_WORLD);
-  }
-  // Print the name of the node the process is running on
-  char *name = malloc(MPI_MAX_PROCESSOR_NAME * sizeof(char));
-  MPI_Get_processor_name(name,&len);
-  printf("ID=%d Name %s PID=%d\n", rank, name, pid); 
-  fflush(stdout);
-  MPI_Barrier(MPI_COMM_WORLD);
-  // The work is handed to the children
-  grp = work(rank, numP, argv, tmp);
-  fflush(stdout);
-  MPI_Barrier(MPI_COMM_WORLD);
-  MPI_Finalize();
-  return 0;
-}
--- a/Codes/malleability/malleabilityManager.c
+++ b/Codes/malleability/malleabilityManager.c
@@ -561,7 +561,6 @@ int check_redistribution() {
    printf("P%d aborting -- Test Async\n", mall->myId);
    MPI_Abort(MPI_COMM_WORLD, test_err);
  }
-  //FIXME No se tiene en cuenta el estado MAL_APP_ENDED
  MPI_Allreduce(&completed, &all_completed, 1, MPI_INT, MPI_MIN, mall->comm);
  if(!all_completed) return MAL_DIST_PENDING; // Continue only if asynchronous send has ended 
@@ -631,7 +630,7 @@ int shrink_redistribution() {
    MPI_Comm_dup(mall->comm, &aux_comm);
    proc_adapt_shrink( mall->numC, &(mall->comm), mall->myId);
-    zombies_collect_suspended(aux_comm, mall->myId, mall->numP, mall->numC, mall->root);
+    zombies_collect_suspended(aux_comm, mall->myId, mall->numP, mall->numC, mall->root, (void *) mall_conf->results, mall->user_comm);
    MPI_Comm_free(&aux_comm);
    if(mall->myId < mall->numC) {

--- a/Codes/malleability/malleabilityZombies.c
+++ b/Codes/malleability/malleabilityZombies.c
@@ -6,6 +6,7 @@
 #include <mpi.h>
 //#include <slurm/slurm.h>
 #include <signal.h>
+#include "../IOcodes/results.h"
 #include "malleabilityZombies.h"
 #define PIDS_QTY 320
@@ -17,7 +18,7 @@ int offset_pids, *pids = NULL;
 void gestor_usr2() {} // Intentionally empty body; presumably a no-op SIGUSR2 handler -- TODO confirm where it is installed
-void zombies_collect_suspended(MPI_Comm comm, int myId, int numP, int numC, int root) {
+void zombies_collect_suspended(MPI_Comm comm, int myId, int numP, int numC, int root, void *results_void, MPI_Comm user_comm) {
  int pid = getpid();
  int *pids_counts = malloc(numP * sizeof(int));
  int *pids_displs = malloc(numP * sizeof(int));
@@ -41,6 +42,9 @@ void zombies_collect_suspended(MPI_Comm comm, int myId, int numP, int numC, int
  free(pids_displs);
  if(myId >= numC) {
+    // Needed to ensure iteration times are collected before suspending these processes
+    results_data *results = (results_data *) results_void;
+    compute_results_iter(results, myId, root, user_comm); 
    zombies_suspend();
  }
 }

--- a/Codes/malleability/malleabilityZombies.h
+++ b/Codes/malleability/malleabilityZombies.h
@@ -7,7 +7,7 @@
 //#include <slurm/slurm.h>
 #include <signal.h>
-void zombies_collect_suspended(MPI_Comm comm, int myId, int numP, int numC, int root);
+void zombies_collect_suspended(MPI_Comm comm, int myId, int numP, int numC, int root, void *results_void, MPI_Comm user_comm);
 void zombies_service_init();
 void zombies_service_free();
 void zombies_awake();
--- a/Exec/CheckRun.sh
+++ b/Exec/CheckRun.sh
@@ -39,6 +39,9 @@ then
 fi
 rm errores2.txt
+#Comprobar que el número de archivos es correcto
+#Pueden estar todos los archivos pero no estar los archivos
+#completos -- Esto se comprueba más tarde
 qtyG=$(ls R*/R*_Global.out | wc -l)
 qtyG=$(($qtyG * 2))
 qtyL=$(ls R*/R*_G?N*.out | wc -l)
@@ -128,6 +131,7 @@ fi
 #Comprobar que todas las ejecuciones tienen todas las ejecuciones que tocan
 #Solo es necesario comprobar el global.
 qty_missing=0
+cd $dir$ResultsDir$ResultsDirName
 for ((i=1; i<$maxIndex; i++))
 do
  qtyEx=$(grep Tex -r Run$i | wc -l)