// ConjugateGradient.c

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <mkl_blas.h>
#include <mkl_spblas.h>
#include "ScalarVectors.h"
#include "SparseMatrices.h"
#include <mpi.h>
#include <string.h>
#include "../malleability/malleabilityManager.h"

//#define ONLY_SYM 0
#define ROOT 0
//#define DEBUG 0
#define MAX_PROCS_SET 16

/*
 * State of the distributed Conjugate Gradient solver held by one process.
 */
typedef struct {
  double umbral, tol; // umbral: convergence threshold compared against tol; tol: sqrt(beta), norm of the residual
  int iter, maxiter, n; // iter: iterations completed; maxiter: iteration cap; n: dimension of the global matrix

  double beta, rho, alpha; // CG scalars: beta = res' * res; rho and alpha are the per-iteration step coefficients
  double *res, *z, *d, *vec; // Local slices (tamBl entries each): residual, A*d product, search direction, solution estimate
  SparseMatrix subm; // Rows of the matrix owned by this process
  double *d_full; // Whole search direction (n entries), gathered from every process

  int *dist_elem, *displs_elem; // Per-process element counts/displacements; displs_elem points into the dist_elem allocation
  int *dist_rows, *displs_rows; // Per-process row counts/displacements used by the Scatterv/Allgatherv calls
  int *vlen; // Copy of the local row lengths as received from ROOT (entry 0 is 0)
} Compute_data;

/*
 * Block-row distribution of one process plus the communicators it works with.
 */
struct Dist_data {
  int ini; // First global row owned by this process
  int fin; // One past the last global row owned (fin - ini == tamBl)

  int tamBl; // Number of rows owned by this process
  int tot_r; // Total number of rows in the matrix

  int myId; // Rank of this process
  int numP; // Number of processes in its group

  int numP_parents; // Number of processes in the parent group (read by recv_matrix)

  MPI_Comm comm, comm_children, comm_parents; // Own communicator, and the ones used to talk to the child / parent group
  MPI_Datatype scalars, arrays; // NOTE(review): not used by the active code in this file; purpose to be confirmed
};

// Application set-up, Conjugate Gradient computation and clean-up routines (all defined later in this file).
void init_app(Compute_data *computeData, struct Dist_data *dist_data, char* argv[]);
void get_mat_dist(Compute_data *computeData, struct Dist_data dist_data, SparseMatrix mat);
void get_rows_dist(Compute_data *computeData, int numP, int n);
void mat_alloc(Compute_data *computeData, SparseMatrix mat, struct Dist_data dist_data);
void computeSolution(Compute_data computeData, double **subsol, SparseMatrix mat, int myId, double **full_vec);
void pre_compute(Compute_data *computeData, struct Dist_data dist_data, double *subsol, double *full_vec);
int compute(Compute_data *computeData, struct Dist_data *dist_data, int sm);
void free_computeData(Compute_data *computeData);

//===================================MALLEABILITY FUNCTIONS====================================================

// NOTE(review): not referenced by the code visible in this file; purpose to be confirmed.
int n_check = 30;

// Parent-side routines: configure the malleability module and send the matrix to the children.
int dist_old(struct Dist_data *dist_data, Compute_data *computeData, int num_children, int sm, int ss, int rm, int rs);
void send_matrix(struct Dist_data dist_data, Compute_data computeData, int rootBcast, int numP_child, int idI, int idE,
                 int *sendcounts, int *recvcounts,int *sdispls, int *rdispls);

// Child-side routines: recover the state and receive the matrix from the parents.
void dist_new(struct Dist_data *dist_data, Compute_data *computeData);
void recv_matrix(struct Dist_data *dist_data, Compute_data *computeData, int idI, int idE,
                 int *sendcounts, int *recvcounts,int *sdispls, int *rdispls);
//----------------------------------------------------------------------------------------------------
// Helpers that compute the block-row distribution and the ranges exchanged between groups.
void get_dist(int total_r, int id, int numP, struct Dist_data *dist_data);
void set_counts(int id, int numP, struct Dist_data data_dist, int *sendcounts);
void getIds_intercomm(struct Dist_data dist_data, int numP_other, int **idS);
//----------------------------------------------------------------------------------------------------

/*
 * Program entry point.
 *
 * Expected arguments: argv[1] is the matrix file read by init_app; when at
 * least five arguments are given, argv[2] is the number of child processes,
 * argv[3] the node list and argv[4] the number of nodes.
 *
 * The first group of processes loads and distributes the matrix (init_app)
 * and registers its data with the malleability module (dist_old). A group
 * for which init_malleability reports new_group recovers its state through
 * dist_new. The CG loop is only entered when no iteration has been done yet.
 */
int main (int argc, char *argv[]) {
    int terminate;
    int req;
    int num_nodes = 1;   // Default when argv[4] is absent; previously read uninitialised
    int num_cpus = 20;   // CPUs per node, scaled by num_nodes below
    int sm, ss, rm, rs;
    int numP, myId, num_children = 0;
    char *nodelist = NULL;
    Compute_data computeData;
    struct Dist_data dist_data;

    // Give every field a defined value: the spawned-group path (dist_new)
    // only fills in iter, yet n, tol and tamBl are read further down.
    memset(&computeData, 0, sizeof computeData);
    memset(&dist_data, 0, sizeof dist_data);

    computeData.z = NULL; computeData.d_full = NULL; computeData.d = NULL;
    computeData.vec = NULL; computeData.res = NULL;
    computeData.dist_elem = NULL; computeData.displs_elem = NULL;
    computeData.dist_rows = NULL; computeData.displs_rows = NULL;
    computeData.subm.vptr = NULL;
    computeData.vlen = NULL;

    // FIXME: hard-coded malleability configuration, to be changed
    sm = 1; ss = 1; rm = 0; rs = 1;

    if (argc >= 5) {
        num_children = atoi(argv[2]);
        nodelist = argv[3];
        num_nodes = atoi(argv[4]);
        num_cpus = num_nodes * num_cpus;
    }

    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &req);
    MPI_Comm_size(MPI_COMM_WORLD, &numP);
    MPI_Comm_rank(MPI_COMM_WORLD, &myId);

    printf("Nuevo set %d/%d\n", myId, numP);
    dist_data.myId = myId;
    dist_data.numP = numP;
    dist_data.comm = MPI_COMM_WORLD;

    int new_group = init_malleability(myId, numP, ROOT, dist_data.comm, argv[0], nodelist, num_cpus, num_nodes);

    if (!new_group) { // First set of processes
        init_app(&computeData, &dist_data, argv);
        dist_old(&dist_data, &computeData, num_children, sm, ss, rm, rs);
    } else {
        dist_new(&dist_data, &computeData);
    }

    if (computeData.iter == 0) {
        terminate = compute(&computeData, &dist_data, sm);
    }
    // NOTE(review): the result of compute() is overridden here, so ROOT
    // always prints the final report.
    terminate = 1;
    if (myId == ROOT && terminate) {
        printf ("End(%d) --> (%d,%20.10e)\n", computeData.n, computeData.iter, computeData.tol);
    }

    // End of CG
    free_malleability();
    free_computeData(&computeData);
    MPI_Finalize();
    return 0;
}

/*
 * Initialise the application data before the iterative computation starts.
 *
 * ROOT reads the matrix named by argv[1]; the dimension is broadcast, every
 * process computes its own block of rows, the matrix is scattered
 * (mat_alloc), the right-hand side is built (computeSolution) and the CG
 * state is prepared (pre_compute).
 *
 * computeData: solver state to fill in.
 * dist_data:   myId and numP must be set on entry; the row range is filled in.
 * argv:        program arguments; argv[1] is only read by ROOT.
 */
void init_app(Compute_data *computeData, struct Dist_data *dist_data, char* argv[]) {
    SparseMatrix mat, sym;
    double *full_vec = NULL;
    double *subsol = NULL;

    // Only ROOT loads the matrix, but `mat` is handed by value to mat_alloc
    // and computeSolution on every rank: give it a defined content.
    memset(&mat, 0, sizeof mat);

    if (dist_data->myId == ROOT) {
        if (argv[1] == NULL) {
            fprintf(stderr, "Error: missing matrix file argument\n");
            MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
        }
#ifdef ONLY_SYM
        printf ("Working with symmetric format\n");
        CreateSparseMatrixHB (argv[1], &mat, 1);
#else
        printf ("Working with general format\n");
        CreateSparseMatrixHB (argv[1], &sym, 1);
        DesymmetrizeSparseMatrices (sym, &mat);
        RemoveSparseMatrix (&sym);
#endif
        computeData->n = mat.dim1;
    }

    // Communicate the number of rows to distribute
    MPI_Bcast(&computeData->n, 1, MPI_INT, ROOT, MPI_COMM_WORLD);

    // Each process computes its own distribution
    get_dist(computeData->n, dist_data->myId, dist_data->numP, dist_data);

    if (dist_data->myId == ROOT) { // ROOT gets the rows and the vpos/vval distribution
        get_mat_dist(computeData, *dist_data, mat);
        TransformHeadertoLength(mat.vptr, computeData->n); // From vptr to vlen
    } else { // The other processes only need the row distribution
        get_rows_dist(computeData, dist_data->numP, computeData->n);
        CreateInts (&computeData->dist_elem, dist_data->numP*2);
        InitInts (computeData->dist_elem, dist_data->numP * 2, 0, 0);
        computeData->displs_elem = computeData->dist_elem + dist_data->numP;
    }

    // Allocate the local submatrix of each process and receive its data from ROOT
    mat_alloc(computeData, mat, *dist_data);

    computeSolution(*computeData, &subsol, mat, dist_data->myId, &full_vec);
    pre_compute(computeData, *dist_data, subsol, full_vec);

    // Free the initial data
    RemoveDoubles(&subsol);
    RemoveDoubles(&full_vec);
    if (dist_data->myId == ROOT) {
        RemoveSparseMatrix(&mat);
    }
}

/*
 * Compute, for every process, how many rows and how many matrix elements it
 * receives, together with the matching displacements.
 *
 * mat.vptr must still be in header (row pointer) form: the element count of
 * a block is obtained as vptr[fin] - vptr[ini]. In this file only ROOT
 * calls it, before converting vptr to row lengths.
 *
 * Allocates dist_rows, displs_rows and dist_elem; displs_elem is the second
 * half of the dist_elem allocation.
 */
void get_mat_dist(Compute_data *computeData, struct Dist_data dist_data, SparseMatrix mat) {
    const int procs = dist_data.numP;
    struct Dist_data block;
    int p;

#ifdef DEBUG
    if(dist_data.myId == ROOT) printf("Distribuyendo vptr\n");
#endif
    CreateInts (&computeData->dist_rows, procs);
    CreateInts (&computeData->displs_rows, procs);
    CreateInts (&computeData->dist_elem, procs*2);
    computeData->displs_elem = computeData->dist_elem + procs;

    InitInts (computeData->dist_rows, procs, 0, 0);
    InitInts (computeData->displs_rows, procs, 0, 0);
    InitInts (computeData->dist_elem, procs*2, 0, 0);

    // Counts and displacements for the later Scatterv / Gatherv calls
    for (p = 0; p < procs; p++) {
        get_dist(computeData->n, p, procs, &block);

        computeData->dist_rows[p] = block.tamBl;
        computeData->dist_elem[p] = mat.vptr[block.fin] - mat.vptr[block.ini];

        if (p == 0) {
            continue; // Displacements of the first process stay at 0
        }
        computeData->displs_elem[p] = computeData->displs_elem[p-1] + computeData->dist_elem[p-1];
        computeData->displs_rows[p] = computeData->displs_rows[p-1] + computeData->dist_rows[p-1];
    }

#ifdef DEBUG
    printf("Proc %d almacena %d filas con %d elementos\n", dist_data.myId, computeData->dist_rows[dist_data.myId], computeData->dist_elem[dist_data.myId]);
    fflush(stdout);
#endif
}

/*
 * Compute how n rows are split among numP processes.
 *
 * Allocates computeData->dist_rows and computeData->displs_rows (numP
 * entries each) and fills them with the row count and the first-row offset
 * of every process.
 */
void get_rows_dist(Compute_data *computeData, int numP, int n) {
    struct Dist_data block;
    int p;

    CreateInts (&(computeData->dist_rows), numP);
    CreateInts (&(computeData->displs_rows), numP);

    InitInts (computeData->dist_rows, numP, 0, 0);
    InitInts (computeData->displs_rows, numP, 0, 0);

    for (p = 0; p < numP; p++) {
        get_dist(n, p, numP, &block);
        computeData->dist_rows[p] = block.tamBl;

        if (p > 0) {
            // Offset = offset of the previous process + its number of rows
            computeData->displs_rows[p] = computeData->displs_rows[p-1] + computeData->dist_rows[p-1];
        }
    }
}

/*
 * Allocate the local submatrix and receive its contents from ROOT.
 *
 * The row lengths are scattered first (on ROOT, mat.vptr must already hold
 * lengths), kept in computeData->vlen and converted into row pointers.
 * Then vpos and vval, which share the same distribution, are scattered.
 *
 * On exit dist_elem has been released, so displs_elem (which points into
 * the same allocation) is reset as well.
 */
void mat_alloc(Compute_data *computeData, SparseMatrix mat, struct Dist_data dist_data) {
    int i;
    int elems; // Number of elements this process has
    int *root_lengths = NULL; // Send buffer of the first scatter, only read on ROOT

#ifdef DEBUG
    if(dist_data.myId == ROOT) printf("Distribuyendo vpos y vvalue\n");
#endif

    // Only ROOT owns a loaded matrix; avoid pointer arithmetic on the
    // meaningless mat.vptr of the other ranks.
    if (dist_data.myId == ROOT) {
        root_lengths = mat.vptr + 1;
    }

    // tamBl is the number of local rows and n the number of columns
    CreateSparseMatrixVptr(&(computeData->subm), dist_data.tamBl, computeData->n, 0);
    computeData->subm.vptr[0] = 0;

    MPI_Scatterv(root_lengths, computeData->dist_rows, computeData->displs_rows, MPI_INT, (computeData->subm.vptr)+1, dist_data.tamBl, MPI_INT, ROOT, MPI_COMM_WORLD);

    // Keep a copy of the row lengths before they are turned into row pointers
    CreateInts(&(computeData->vlen), dist_data.tamBl+1);
    for (i = 0; i < dist_data.tamBl+1; i++) {
        computeData->vlen[i] = computeData->subm.vptr[i];
    }
    TransformLengthtoHeader(computeData->subm.vptr, computeData->subm.dim1); // From vlen to vptr
    elems = computeData->subm.vptr[dist_data.tamBl];
    CreateSparseMatrixValues(&(computeData->subm), dist_data.tamBl, computeData->n, elems, 0);

    MPI_Scatterv(mat.vpos, computeData->dist_elem, computeData->displs_elem, MPI_INT,    computeData->subm.vpos, elems, MPI_INT,    ROOT, MPI_COMM_WORLD);
    MPI_Scatterv(mat.vval, computeData->dist_elem, computeData->displs_elem, MPI_DOUBLE, computeData->subm.vval, elems, MPI_DOUBLE, ROOT, MPI_COMM_WORLD);

    // The element arrays are not used again
    RemoveInts (&computeData->dist_elem);
    computeData->displs_elem = NULL; // It pointed into the block just released
}

/*
 * Build the right-hand side of the system.
 *
 * Allocates *subsol with one entry per local row and *full_vec with n
 * entries, then multiplies the local submatrix by *full_vec and stores the
 * product in *subsol. The caller releases both arrays.
 *
 * NOTE(review): *full_vec presumably ends up filled with ones and *subsol
 * zeroed before the product; confirm against InitDoubles.
 */
void computeSolution(Compute_data computeData, double **subsol, SparseMatrix mat, int myId, double **full_vec) {
    const int local_rows = computeData.dist_rows[myId];

    (void) mat; // Not needed by the active code

    CreateDoubles (subsol, local_rows);
    InitDoubles (*subsol, local_rows, 0.0, 0.0);
    CreateDoubles (full_vec, computeData.n);
    InitDoubles (*full_vec, computeData.n, 1.0, 0.0);

    // subsol += A_local * full_vec
#ifdef ONLY_SYM
    ProdSymSparseMatrixVector (computeData.subm, *full_vec, *subsol);
#else
    ProdSparseMatrixVector (computeData.subm, *full_vec, *subsol);
#endif
}

/*
 * Prepare the state needed by the main computation loop.
 *
 * Allocates the local vectors and d_full, sets the initial guess to zero
 * and computes the initial residual, search direction, beta and tol.
 *
 * subsol:   local slice of the right-hand side b.
 * full_vec: scratch vector of n entries; overwritten with zeros.
 */
void pre_compute(Compute_data *computeData, struct Dist_data dist_data, double *subsol, double *full_vec) {
    int stride = 1;
    int rows = dist_data.tamBl;
    double minus_one = -1.0, zero = 0.0;

    if (dist_data.myId == ROOT) {
        printf("Start CG\n");
    }

    computeData->res = NULL;
    computeData->z = NULL;
    computeData->d = NULL;
    computeData->umbral = 1.0e-8;

    CreateDoubles (&computeData->res, rows);
    CreateDoubles (&computeData->z, rows);
    CreateDoubles (&computeData->d, rows);
    CreateDoubles (&computeData->vec, rows);
    CreateDoubles (&computeData->d_full, computeData->n);

    InitDoubles (computeData->vec, rows, zero, zero);     // x = 0
    InitDoubles (full_vec, computeData->n, zero, zero);   // full_x = 0

    computeData->iter = 0;

    // z += A * full_x
#ifdef ONLY_SYM
    ProdSymSparseMatrixVector (computeData->subm, full_vec, computeData->z);
#else
    ProdSparseMatrixVector (computeData->subm, full_vec, computeData->z);
#endif
    // res = b - z
    dcopy (&rows, subsol, &stride, computeData->res, &stride);
    daxpy (&rows, &minus_one, computeData->z, &stride, computeData->res, &stride);

    // d_full = gathered residual; d = local slice of d_full
    MPI_Allgatherv(computeData->res, rows, MPI_DOUBLE, computeData->d_full, computeData->dist_rows, computeData->displs_rows, MPI_DOUBLE, MPI_COMM_WORLD);
    dcopy (&rows, &(computeData->d_full[dist_data.ini]), &stride, computeData->d, &stride);

    // beta = res' * res over all processes; tol = norm(res)
    computeData->beta = ddot (&rows, computeData->res, &stride, computeData->res, &stride);
    MPI_Allreduce(MPI_IN_PLACE, &computeData->beta, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    computeData->tol = sqrt (computeData->beta);
}

/*
 * Main computation loop of the Conjugate Gradient method.
 *
 * Iterates until maxiter (set to 1000 here) is reached or tol drops to the
 * threshold. When iter is 3 the malleability checkpoint is invoked; the
 * loop is abandoned if the resulting state is MALL_ZOMBIE, or if it is
 * MALL_COMPLETED while sm is 0.
 *
 * Always returns 1.
 */
int compute(Compute_data *computeData, struct Dist_data *dist_data, int sm) {
    int unit = 1;
    double plus_one = 1.0;
    int mall_state = MALL_NOT_STARTED;
    const int finished = 1;
    int *rows = &(dist_data->tamBl);

    computeData->maxiter = 1000;

    for (;;) {
        if (computeData->iter >= computeData->maxiter) { break; }
        if (!(computeData->tol > computeData->umbral)) { break; }

        if (computeData->iter == 3) { mall_state = malleability_checkpoint(); }
        if (dist_data->myId == 0) { printf("TEST %d\n", computeData->iter); }
        if (mall_state == MALL_ZOMBIE || (mall_state == MALL_COMPLETED && sm == 0)) { break; }

        // z += A * d_full
#ifdef ONLY_SYM
        ProdSymSparseMatrixVector (computeData->subm, computeData->d_full, computeData->z);
#else
        ProdSparseMatrixVector (computeData->subm, computeData->d_full, computeData->z);
#endif
        // rho = beta / (d' * z), reduced over all processes
        computeData->rho = ddot (rows, computeData->d, &unit, computeData->z, &unit);
        MPI_Allreduce(MPI_IN_PLACE, &computeData->rho, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
        computeData->rho = computeData->beta / computeData->rho;

        // x += rho * d
        daxpy (rows, &computeData->rho, computeData->d, &unit, computeData->vec, &unit);

        // res -= rho * z
        computeData->rho = -computeData->rho;
        daxpy (rows, &computeData->rho, computeData->z, &unit, computeData->res, &unit);

        // alpha = beta_new / beta_old, with beta = res' * res
        computeData->alpha = computeData->beta;
        computeData->beta = ddot (rows, computeData->res, &unit, computeData->res, &unit);
        MPI_Allreduce(MPI_IN_PLACE, &computeData->beta, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
        computeData->alpha = computeData->beta / computeData->alpha;

        // d = alpha * d + res
        dscal (rows, &computeData->alpha, computeData->d, &unit);
        daxpy (rows, &plus_one, computeData->res, &unit, computeData->d, &unit);

        // d_full = Gather(d)
        MPI_Allgatherv(computeData->d, dist_data->tamBl, MPI_DOUBLE, computeData->d_full,
                       computeData->dist_rows, computeData->displs_rows, MPI_DOUBLE, MPI_COMM_WORLD);

        // tol = sqrt(beta) = norm(res)
        computeData->tol = sqrt (computeData->beta);
        computeData->iter++;
    }
#ifdef DEBUG
    if(dist_data->myId == ROOT) printf ("Ended loop\n");
#endif
    return finished;
}


/*
 * Release every buffer owned by computeData that is still allocated.
 *
 * Pointers that are NULL are skipped. dist_elem is not handled here; it is
 * released by mat_alloc.
 */
void free_computeData(Compute_data *computeData) {
    double **real_bufs[] = {
        &computeData->res, &computeData->z, &computeData->d,
        &computeData->vec, &computeData->d_full
    };
    int **int_bufs[] = {
        &computeData->dist_rows, &computeData->displs_rows, &computeData->vlen
    };
    const int n_real = (int) (sizeof real_bufs / sizeof real_bufs[0]);
    const int n_int = (int) (sizeof int_bufs / sizeof int_bufs[0]);
    int k;

    for (k = 0; k < n_real; k++) {
        if (*real_bufs[k] != NULL) {
            RemoveDoubles (real_bufs[k]);
        }
    }

    if (computeData->subm.vptr != NULL) {
        RemoveSparseMatrix2 (&computeData->subm);
    }

    for (k = 0; k < n_int; k++) {
        if (*int_bufs[k] != NULL) {
            RemoveInts (int_bufs[k]);
        }
    }
}

/*
 *  _____________________________________________________________________________________
 * ||                                                                                   ||
 * ||                                                                                   ||
 * ||                            DISTRIBUTION FUNCTIONS                                 ||
 * ||                                                                                   ||
 * ||                                                                                   ||
 * \_____________________________________________________________________________________/
*/

/*
 * Las siguientes funciones están todas relacionadas con la distribución de los datos
 * o procesos.
 */

/*
 * ========================================================================================
 * ========================================================================================
 * ========================PARENTS COMMUNICATION FUNCTIONS=================================
 * ========================================================================================
 * ========================================================================================
*/

/*
 * Called by the first group of processes: configure the malleability module
 * and register the data that has to reach the children.
 *
 * Only the iteration counter is registered at the moment. Registration of
 * tol, beta, umbral, the CG vectors and the matrix arrays is still pending
 * and must be added here in the same order dist_new retrieves them.
 *
 * dist_data and computeData->n are not used yet.
 *
 * Returns 0. The function is declared int, but previously fell off the end
 * without returning a value.
 */
int dist_old(struct Dist_data *dist_data, Compute_data *computeData, int num_children, int sm, int ss, int rm, int rs) {
    int phy_dist = 2;

    (void) dist_data;

    set_malleability_configuration(sm, ss, phy_dist, rm, rs);
    set_children_number(num_children);

    malleability_add_data(&(computeData->iter), 1, MAL_INT, 1, 1);

    return 0;
}

/*
    MPI_Bcast(computeData->d_full, computeData->n, MPI_DOUBLE, rootBcast, dist_data.comm_children);
    MPI_Alltoallv(computeData->res, sendcounts, sdispls, dist_data.arrays, NULL, recvcounts, rdispls, MPI_INT, dist_data.comm_children);
    */

/*
 * Parent side of the matrix redistribution: send the local rows of the
 * submatrix to the child processes in the range [idI, idE).
 *
 * On entry sendcounts/sdispls describe rows. The row lengths are sent
 * first; then both arrays are rewritten in place to describe matrix
 * elements, and vpos and vval are sent. recvcounts/rdispls are passed
 * through unchanged with a NULL receive buffer.
 *
 * rootBcast and numP_child are currently unused.
 */
void send_matrix(struct Dist_data dist_data, Compute_data computeData, int rootBcast, int numP_child, int idI, int idE,
                 int *sendcounts, int *recvcounts,int *sdispls, int *rdispls) {
    int i;
    int *vptr = computeData.subm.vptr;

    (void) rootBcast;
    (void) numP_child;

    TransformHeadertoLength(vptr, computeData.subm.dim1); // From vptr to vlen
    // Share the row lengths with the children
    MPI_Alltoallv(vptr+1, sendcounts, sdispls, MPI_INT, NULL, recvcounts, rdispls, MPI_INT, dist_data.comm_children);
    TransformLengthtoHeader(vptr, computeData.subm.dim1); // From vlen back to vptr

    // Turn the row counts/displacements into element counts/displacements.
    // sendcounts[i] must be converted while sdispls[i] still holds a row offset.
    for (i = idI; i < idE; i++) {
        if (sendcounts[i] > 0) {
            sendcounts[i] = vptr[sdispls[i] + sendcounts[i]] - vptr[sdispls[i]];
        }
        // Entry 0 has no predecessor: its displacement is left untouched.
        // The original read index -1 when idI was 0 and sendcounts[0] was 0.
        if (i > 0) {
            sdispls[i] = sdispls[i-1] + sendcounts[i-1];
        }
    }

    /* Data communication */
    MPI_Alltoallv(computeData.subm.vpos, sendcounts, sdispls, MPI_INT, NULL, recvcounts, rdispls, MPI_INT, dist_data.comm_children);
    MPI_Alltoallv(computeData.subm.vval, sendcounts, sdispls, MPI_DOUBLE, NULL, recvcounts, rdispls, MPI_DOUBLE, dist_data.comm_children);
}

/*
 * ========================================================================================
 * ========================================================================================
 * ========================CHILDREN COMMUNICATION FUNCTIONS================================
 * ========================================================================================
 * ========================================================================================
*/

/*
 * Called by a group of child processes.
 *
 * The children recover from the malleability module the data registered by
 * their parents. Only the iteration counter is recovered at the moment.
 * Retrieval of tol, beta, umbral, the CG vectors and the matrix arrays is
 * still pending and must follow the registration order used in dist_old.
 *
 * dist_data is not used yet.
 *
 * NOTE(review): ownership of the buffer returned by malleability_get_data
 * is not visible from this file; it is not released here.
 */
void dist_new(struct Dist_data *dist_data, Compute_data *computeData) {
    void *value = NULL;

    (void) dist_data;

    malleability_get_data(&value, 0, 1, 1);
    if (value == NULL) {
        // Fail with a message instead of dereferencing a null pointer
        fprintf(stderr, "Error: iteration counter not received from the parent group\n");
        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
        return;
    }
    computeData->iter = *((int *)value);
}

/*
    MPI_Bcast(computeData->d_full, computeData->n, MPI_DOUBLE, ROOT, dist_data->comm_parents); // Recibir vectores RES y D_FULL
    MPI_Alltoallv(aux, sendcounts, sdispls, MPI_INT, computeData->res, recvcounts, rdispls, dist_data->arrays, dist_data->comm_parents);
    dcopy (&(dist_data->tamBl), &(computeData->d_full[dist_data->ini]), &IONE, computeData->d, &IONE); // Copia parcial de D_FULL a D
*/

/*
 * Child side of the matrix redistribution: receive the rows of the local
 * submatrix from the parent processes in the range [idI, idE).
 *
 * On entry recvcounts/rdispls describe rows. The row lengths are received
 * first and converted to row pointers; then both arrays are rewritten in
 * place to describe matrix elements, and vpos and vval are received.
 * sendcounts/sdispls are passed through unchanged with NULL send buffers.
 *
 * Allocates computeData->dist_rows, displs_rows and the submatrix.
 */
void recv_matrix(struct Dist_data *dist_data, Compute_data *computeData, int idI, int idE,
                 int *sendcounts, int *recvcounts,int *sdispls, int *rdispls) {
    int i;
    int elems;
    double *aux = NULL;   // Send buffers: previously passed to MPI uninitialised
    int *aux_int = NULL;
    Compute_data dist_parents;

    dist_parents.dist_rows = NULL;
    dist_parents.displs_rows = NULL;

    /* Prepare the reception of the matrix */
    get_rows_dist(&dist_parents, dist_data->numP_parents, computeData->n);

    get_rows_dist(computeData, dist_data->numP, computeData->n);
    CreateSparseMatrixVptr(&(computeData->subm), dist_data->tamBl, computeData->n, 0);
    computeData->subm.vptr[0] = 0; // Same initialisation mat_alloc performs before the conversion

    MPI_Alltoallv(aux_int, sendcounts, sdispls, MPI_INT, (computeData->subm.vptr)+1, recvcounts, rdispls, MPI_INT, dist_data->comm_parents);

    TransformLengthtoHeader(computeData->subm.vptr, computeData->subm.dim1); // From vlen to vptr
    elems = computeData->subm.vptr[dist_data->tamBl];
    CreateSparseMatrixValues(&(computeData->subm), dist_data->tamBl, computeData->n, elems, 0);

    // Turn the row counts/displacements into element counts/displacements.
    // recvcounts[i] must be converted while rdispls[i] still holds a row offset.
    for (i = idI; i < idE; i++) {
        if (recvcounts[i] > 0) {
            recvcounts[i] = computeData->subm.vptr[rdispls[i] + recvcounts[i]] - computeData->subm.vptr[rdispls[i]];
        }
        // Entry 0 has no predecessor: its displacement is left untouched.
        // The original read index -1 when idI was 0 and recvcounts[0] was 0.
        if (i > 0) {
            rdispls[i] = rdispls[i-1] + recvcounts[i-1];
        }
    }

    /* Data communication */
    MPI_Alltoallv(aux_int, sendcounts, sdispls, MPI_INT, computeData->subm.vpos, recvcounts, rdispls, MPI_INT, dist_data->comm_parents);
    MPI_Alltoallv(aux, sendcounts, sdispls, MPI_DOUBLE, computeData->subm.vval, recvcounts, rdispls, MPI_DOUBLE, dist_data->comm_parents);

    // Released with the same routine the rest of the file uses for these arrays
    RemoveInts (&dist_parents.dist_rows);
    RemoveInts (&dist_parents.displs_rows);
}

/*
 * ========================================================================================
 * ========================================================================================
 * ================================DISTRIBUTION FUNCTIONS==================================
 * ========================================================================================
 * ========================================================================================
*/

/*
 * Compute the block of rows that process `id` owns when total_r rows are
 * split among numP processes.
 *
 * The first (total_r % numP) processes get one extra row. On return
 * dist_data holds tot_r, the half-open range [ini, fin) clamped to total_r,
 * and tamBl = fin - ini. The other fields of dist_data are left untouched.
 */
void get_dist(int total_r, int id, int numP, struct Dist_data *dist_data) {
  const int base = total_r / numP;
  const int rem = total_r % numP;
  const int has_extra = (id < rem);
  int first, last;

  dist_data->tot_r = total_r;

  // Processes below `rem` are shifted by their own id, the rest by `rem`
  first = id * base + (has_extra ? id : rem);
  last = first + base + (has_extra ? 1 : 0);

  // Keep the range inside the matrix and never inverted
  if (last > total_r) {
    last = total_r;
  }
  if (first > last) {
    first = last;
  }

  dist_data->ini = first;
  dist_data->fin = last;
  dist_data->tamBl = last - first;
}

/*
 * Obtiene para un Id de proceso, cuantos elementos va 
 * a enviar/recibir el proceso myId
 */
void set_counts(int id, int numP, struct Dist_data data_dist, int *sendcounts) {
  struct Dist_data other;
  int biggest_ini, smallest_end, tot_rows;

  get_dist(data_dist.tot_r, id, numP, &other);

  // Si el rango de valores no coincide, se pasa al siguiente proceso
  if(data_dist.ini >= other.fin || data_dist.fin <= other.ini) {
    return;
  }

  // Obtiene el proceso con mayor ini entre los dos procesos
  if(data_dist.ini > other.ini) { 
    biggest_ini = data_dist.ini;
  } else {
    biggest_ini = other.ini;
  }

  // Obtiene el proceso con menor fin entre los dos procesos
  if(data_dist.fin < other.fin) {
    smallest_end = data_dist.fin;
  } else {
    smallest_end = other.fin;
  }
  sendcounts[id] = smallest_end - biggest_ini; // Numero de elementos a enviar/recibir del proceso Id
}

/*
 * For a process of one group, obtain the range of processes of another
 * group (numP_other processes) it has to exchange data with.
 *
 * *idS is allocated with two entries: the first identifier and the last
 * one (excluded). The caller owns the array.
 *
 * The other group is split like get_dist does: its first remOther processes
 * hold tamOther + 1 rows and the rest tamOther rows; `middle` is the first
 * row of the second subgroup.
 */
void getIds_intercomm(struct Dist_data dist_data, int numP_other, int **idS) {
    int idI, idE;
    int tamOther = dist_data.tot_r / numP_other;
    int remOther = dist_data.tot_r % numP_other;
    int middle = (tamOther + 1) * remOther;

    if (middle > dist_data.ini) { // First subgroup
        idI = dist_data.ini / (tamOther + 1);
    } else if (tamOther > 0) { // Second subgroup
        idI = ((dist_data.ini - middle) / tamOther) + remOther;
    } else {
        // More processes than rows: the second subgroup holds no rows, so
        // the range starts right after the first subgroup. The original
        // divided by zero here.
        idI = remOther;
    }

    if (middle >= dist_data.fin) { // First subgroup
        idE = dist_data.fin / (tamOther + 1);
        idE = (dist_data.fin % (tamOther + 1) > 0 && idE+1 <= numP_other) ? idE+1 : idE;
    } else if (tamOther > 0) { // Second subgroup
        idE = ((dist_data.fin - middle) / tamOther) + remOther;
        idE = ((dist_data.fin - middle) % tamOther > 0 && idE+1 <= numP_other) ? idE+1 : idE;
    } else {
        idE = remOther; // Same guard as above
    }

    CreateInts(idS, 2);
    (*idS)[0] = idI;
    (*idS)[1] = idE;
}

/*
 
	  double starttime, endtime, total, res;
          MPI_Barrier(MPI_COMM_WORLD);
	  starttime = MPI_Wtime();
	  endtime = MPI_Wtime();
          total = endtime - starttime;
          MPI_Reduce(&total, &res, 1, MPI_DOUBLE, MPI_MAX, ROOT, MPI_COMM_WORLD);
          if(dist_data.myId == ROOT) {printf("Tiempo BCAST PADRE %f\n", total); fflush(stdout);}
 */