#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "mymkl.h"
#include "ScalarVectors.h"
#include "SparseMatrices.h"
#include <mpi.h>
#include "../malleability/malleabilityManager.h"

//#define ONLY_SYM 0
#define ROOT 0
//#define DEBUG 0
#define MAX_PROCS_SET 16

typedef struct {
  double umbral, tol;
  int iter, maxiter, n;

  double beta, rho, alpha;
  double *res, *z, *d, *vec;
  SparseMatrix subm;

  double *d_full;

  int *dist_elem, *displs_elem;
  int *dist_rows, *displs_rows;
  int *vlen;
} Compute_data;

struct Dist_data {
  int ini;
  int fin;

  int tamBl; // Number of rows
  int tot_r; // Total number of rows in the matrix

  int myId;
  int numP;
  int numP_parents;

  MPI_Comm comm, comm_children, comm_parents;
  MPI_Datatype scalars, arrays;
};

void dumb(Compute_data *computeData, struct Dist_data *dist_data); //FIXME Delete me
void init_app(Compute_data *computeData, struct Dist_data *dist_data, char* argv[]);
void get_mat_dist(Compute_data *computeData, struct Dist_data dist_data, SparseMatrix mat);
void get_rows_dist(Compute_data *computeData, int numP, int n);
void mat_alloc(Compute_data *computeData, SparseMatrix mat, struct Dist_data dist_data);
void computeSolution(Compute_data computeData, double **subsol, SparseMatrix mat, int myId, double **full_vec);
void pre_compute(Compute_data *computeData, struct Dist_data dist_data, double *subsol, double *full_vec);
int compute(Compute_data *computeData, struct Dist_data *dist_data, int sm);
void free_computeData(Compute_data *computeData, int terminate);

//===================================MALLEABILITY FUNCTIONS====================================================
int n_check = 30;

int dist_old(struct Dist_data *dist_data, Compute_data *computeData, int num_children, int sm, int ss, int rm, int rs, int send_sync);
void dist_new(struct Dist_data *dist_data, Compute_data *computeData);
void update_dist_data(struct Dist_data *dist_data);
void print_global_results();
//----------------------------------------------------------------------------------------------------
void get_dist(int total_r, int id, int numP, struct Dist_data *dist_data);
void set_counts(int id, int numP, struct Dist_data data_dist, int *sendcounts);
void getIds_intercomm(struct Dist_data dist_data, int numP_other, int **idS);
//----------------------------------------------------------------------------------------------------

int main (int argc, char *argv[]) {
  int terminate;
  int req, num_nodes, num_cpus = 20;
  int sm, ss, rm, rs, send_sync;
  char *nodelist = NULL;
  Compute_data computeData;

  computeData.z = NULL; computeData.d_full = NULL; computeData.d = NULL;
  computeData.vec = NULL; computeData.res = NULL;
  computeData.dist_elem = NULL; computeData.displs_elem = NULL;
  computeData.dist_rows = NULL; computeData.displs_rows = NULL;
  computeData.subm.vptr = NULL;
  computeData.vlen = NULL;

  send_sync = 1;
  sm = 1; ss = 1;
  rm = 0; rs = 1;

  int numP, myId, num_children = 0;
  struct Dist_data dist_data;
  if (argc >= 10) {
    num_children = atoi(argv[2]);
    sm = atoi(argv[3]);
    ss = atoi(argv[4]);
    rm = atoi(argv[5]);
    rs = atoi(argv[6]);
    send_sync = atoi(argv[7]);
    nodelist = argv[8];
    num_nodes = atoi(argv[9]);
    num_cpus = num_nodes * num_cpus;
  }

  MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &req);
  MPI_Comm_size(MPI_COMM_WORLD, &numP);
  MPI_Comm_rank(MPI_COMM_WORLD, &myId);
  dist_data.comm = MPI_COMM_WORLD;

  int new_group = init_malleability(myId, numP, ROOT, dist_data.comm, argv[0], nodelist, num_cpus, num_nodes);
  update_dist_data(&dist_data);

  if( !new_group ) { //First set of processes
    init_app(&computeData, &dist_data, argv);
    dist_old(&dist_data, &computeData, num_children, sm, ss, rm, rs, send_sync);

    MPI_Barrier(MPI_COMM_WORLD);
    set_global_time(MPI_Wtime());
  } else { // Spawned set of processes
    dist_new(&dist_data, &computeData);
  }

  //if(computeData.iter==0)
  terminate = compute(&computeData, &dist_data, sm);

  if(terminate) {
    update_dist_data(&dist_data);
    MPI_Barrier(dist_data.comm);
    if(dist_data.myId == ROOT) {
      print_global_results();
      printf ("End(%d) --> (%d,%20.10e)\n", computeData.n, computeData.iter, computeData.tol);
    }
  }

  // End of CG
  free_malleability();
  free_computeData(&computeData, 1);
  if(sm && numP > num_children && dist_data.myId == 0) {
    MPI_Abort(MPI_COMM_WORLD, -100);
  }
  MPI_Finalize();
}
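/*
 * Example invocation (a sketch, not taken from the original sources): argv[1]
 * is the Harwell-Boeing matrix file read in init_app, and argv[2..9] are parsed
 * above as num_children, sm, ss, rm, rs, send_sync, nodelist and num_nodes.
 * The launcher, executable and file names below are placeholders.
 *
 *   mpirun -np 4 ./exec matrix.rb 8 1 1 0 1 1 nodelist.txt 2
 *
 * With fewer than 10 arguments the defaults set above are kept
 * (sm=1, ss=1, rm=0, rs=1, send_sync=1) and no nodelist is provided.
 */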
/*
 * Init application data before
 * starting the iterative computation
 */
void init_app(Compute_data *computeData, struct Dist_data *dist_data, char* argv[]) {
  SparseMatrix mat, sym;
  double *full_vec = NULL;
  double *subsol = NULL;

  if(dist_data->myId == ROOT) {
#ifdef ONLY_SYM
    printf ("Working with symmetric format\n");
    CreateSparseMatrixHB (argv[1], &mat, 1);
#else
    printf ("Working with general format\n");
    CreateSparseMatrixHB (argv[1], &sym, 1);
    DesymmetrizeSparseMatrices (sym, &mat);
    RemoveSparseMatrix (&sym);
#endif
    computeData->n = mat.dim1;
  }

  // Communicate the number of rows to distribute and the number of elements in the matrix
  MPI_Bcast(&computeData->n, 1, MPI_INT, ROOT, MPI_COMM_WORLD);
  // Each process computes its own distribution
  get_dist(computeData->n, dist_data->myId, dist_data->numP, dist_data);

  if(dist_data->myId == ROOT) { // ROOT gets the rows and vpos/vval distribution
    get_mat_dist(computeData, *dist_data, mat);
    TransformHeadertoLength(mat.vptr, computeData->n); // From vptr to vlen
  } else { // Non-ROOT processes get the row distribution
    get_rows_dist(computeData, dist_data->numP, computeData->n);
    CreateInts (&computeData->dist_elem, dist_data->numP*2);
    InitInts (computeData->dist_elem, dist_data->numP * 2, 0.0, 0);
    computeData->displs_elem = computeData->dist_elem + dist_data->numP;
  }
  // Allocate each process' submatrix and get its distribution from ROOT
  mat_alloc(computeData, mat, *dist_data);

  computeSolution(*computeData, &subsol, mat, dist_data->myId, &full_vec);
  pre_compute(computeData, *dist_data, subsol, full_vec);

  //Free initial data
  RemoveDoubles(&subsol);
  RemoveDoubles(&full_vec);
  if(dist_data->myId == ROOT) {
    RemoveSparseMatrix(&mat);
  }
}
/*
 * MPI Dist
 * Broadcast the vptr array and each process gets the data that corresponds to itself.
 *
 * mat.vptr must be in vlen format to work correctly
 */
void get_mat_dist(Compute_data *computeData, struct Dist_data dist_data, SparseMatrix mat) {
  int i, j;
  struct Dist_data dist_data_aux;

#ifdef DEBUG
  if(dist_data.myId == ROOT) printf("Distributing vptr\n");
#endif
  CreateInts (&computeData->dist_rows, dist_data.numP);
  CreateInts (&computeData->displs_rows, dist_data.numP);
  CreateInts (&computeData->dist_elem, dist_data.numP*2);
  computeData->displs_elem = computeData->dist_elem + dist_data.numP;

  InitInts (computeData->dist_rows, dist_data.numP, 0, 0);
  InitInts (computeData->displs_rows, dist_data.numP, 0, 0);
  InitInts (computeData->dist_elem, dist_data.numP*2, 0, 0);

  // Fill dist_rows and dist_elem so each process can make ScatterV or GatherV calls
  for(i=0; i<dist_data.numP; i++) {
    get_dist(computeData->n, i, dist_data.numP, &dist_data_aux);

    computeData->dist_rows[i] = dist_data_aux.tamBl;
    computeData->dist_elem[i] = mat.vptr[dist_data_aux.fin] - mat.vptr[dist_data_aux.ini];

    // Fill displacements
    if(i!=0) {
      computeData->displs_elem[i] = computeData->displs_elem[i-1] + computeData->dist_elem[i-1];
      computeData->displs_rows[i] = computeData->displs_rows[i-1] + computeData->dist_rows[i-1];
    }
  }

#ifdef DEBUG
  printf("Proc %d stores %d rows with %d elements\n", dist_data.myId,
         computeData->dist_rows[dist_data.myId], computeData->dist_elem[dist_data.myId]);
  fflush(stdout);
#endif
}

/*
 * MPI Dist
 * Get the distribution of n rows among a given number of processes
 */
void get_rows_dist(Compute_data *computeData, int numP, int n) {
  int i, j;
  struct Dist_data dist_data;

  CreateInts (&(computeData->dist_rows), numP);
  CreateInts (&(computeData->displs_rows), numP);

  InitInts (computeData->dist_rows, numP, 0, 0);
  InitInts (computeData->displs_rows, numP, 0, 0);

  // Fill dist_rows and displs_rows so each process can make ScatterV or GatherV calls
  for(i=0; i<numP; i++) {
    get_dist(n, i, numP, &dist_data);
    computeData->dist_rows[i] = dist_data.tamBl;

    // Fill displacements
    if(i!=0) {
      computeData->displs_rows[i] = computeData->displs_rows[i-1] + computeData->dist_rows[i-1];
    }
  }
}
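/*
 * Worked example (illustrative values only): for n = 10 rows and numP = 3
 * processes, get_dist assigns the row ranges [0,4), [4,7) and [7,10), so
 * get_rows_dist produces
 *
 *   dist_rows   = { 4, 3, 3 }
 *   displs_rows = { 0, 4, 7 }
 *
 * which are the count/displacement pairs later handed to MPI_Scatterv and
 * MPI_Allgatherv.
 */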
/*
 * Matrix allocation
 *
 * The matrix that each process will use is allocated and
 * its vptr array initialised.
 *
 * MPI Dist
 * Distribute the vpos and vval data among the processes.
 * Both arrays share the same distribution.
 */
void mat_alloc(Compute_data *computeData, SparseMatrix mat, struct Dist_data dist_data) {
  int i;
  int elems; // Number of elements this process has
#ifdef DEBUG
  if(dist_data.myId == ROOT) printf("Distributing vpos and vval\n");
#endif

  // dist_rows[myId] is the number of rows, n the number of columns, and dist_elem[myId] the number of matrix elements this process will hold
  CreateSparseMatrixVptr(&(computeData->subm), dist_data.tamBl, computeData->n, 0);
  computeData->subm.vptr[0] = 0;

  MPI_Scatterv((mat.vptr)+1, computeData->dist_rows, computeData->displs_rows, MPI_INT,
               (computeData->subm.vptr)+1, dist_data.tamBl, MPI_INT, ROOT, MPI_COMM_WORLD);

  CreateInts(&(computeData->vlen), dist_data.tamBl);
  for(i=0; i<dist_data.tamBl; i++) {
    computeData->vlen[i] = computeData->subm.vptr[i+1];
  }
  TransformLengthtoHeader(computeData->subm.vptr, computeData->subm.dim1); // The array is converted from vlen to vptr

  elems = computeData->subm.vptr[dist_data.tamBl];
  CreateSparseMatrixValues(&(computeData->subm), dist_data.tamBl, computeData->n, elems, 0);
  MPI_Scatterv(mat.vpos, computeData->dist_elem, computeData->displs_elem, MPI_INT,
               computeData->subm.vpos, elems, MPI_INT, ROOT, MPI_COMM_WORLD);
  MPI_Scatterv(mat.vval, computeData->dist_elem, computeData->displs_elem, MPI_DOUBLE,
               computeData->subm.vval, elems, MPI_DOUBLE, ROOT, MPI_COMM_WORLD);

  // Free the elem arrays, as they are not going to be used again
  RemoveInts (&computeData->dist_elem);
}

/*
 * Compute solution
 */
void computeSolution(Compute_data computeData, double **subsol, SparseMatrix mat, int myId, double **full_vec) {

  CreateDoubles (subsol, computeData.dist_rows[myId]);
  InitDoubles (*subsol, computeData.dist_rows[myId], 0.0, 0.0);
  CreateDoubles(full_vec, computeData.n);
  InitDoubles (*full_vec, computeData.n, 1.0, 0.0);

  //Compute SOLUTION
#ifdef ONLY_SYM
  ProdSymSparseMatrixVector (computeData.subm, *full_vec, *subsol); // sol += A * x
#else
  ProdSparseMatrixVector (computeData.subm, *full_vec, *subsol);    // sol += A * x
#endif

/*
#ifdef DEBUG
  int aux, i;
  double *solD = NULL, *sol = NULL;
  if(myId == ROOT) {
    printf("Computing solution\n");

    CreateDoubles (&sol, computeData.n);
    CreateDoubles (&solD, computeData.n);
    InitDoubles (sol, computeData.n, 0.0, 0.0);
    InitDoubles (solD, computeData.n, 0.0, 0.0);

    TransformLengthtoHeader(mat.vptr, mat.dim1); // vlen to vptr (At mat_alloc it was needed as vlen)
  }

  MPI_Gatherv(*subsol, computeData.dist_rows[myId], MPI_DOUBLE, sol,
              computeData.dist_rows, computeData.displs_rows, MPI_DOUBLE, ROOT, MPI_COMM_WORLD);

  if(myId == ROOT) {
#ifdef ONLY_SYM
    ProdSymSparseMatrixVector (mat, *full_vec, solD); // solD += A * x
#else
    ProdSparseMatrixVector (mat, *full_vec, solD);    // solD += A * x
#endif // ONLY_SYM
    aux = 1;
    printf("Checking sol array is ok\n");
    for(i=0; i<computeData.n; i++) {
      if(sol[i] != solD[i]) aux = 0;
    }
    if(aux) printf("sol array is correct\n");
  }
#endif // DEBUG
*/
}
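/*
 * Note on the two row-storage forms used above (illustrative values only): the
 * code switches between a CSR-style row pointer (vptr) and per-row lengths
 * (vlen) with TransformHeadertoLength / TransformLengthtoHeader. For a 3-row
 * block with 2, 3 and 1 nonzeros,
 *
 *   vlen = { 2, 3, 1 }        and        vptr = { 0, 2, 5, 6 }
 *
 * The lengths are what get scattered to each process in mat_alloc, and each
 * process rebuilds its local vptr from them.
 */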
/*
 * Initialise the CG vectors and perform the initial residual
 * computation before entering the main compute loop
 */
void pre_compute(Compute_data *computeData, struct Dist_data dist_data, double *subsol, double *full_vec) {
  int IONE = 1;
  double DMONE = -1.0, DZERO = 0.0;

  computeData->res = NULL;
  computeData->z = NULL;
  computeData->d = NULL;
  computeData->umbral = 1.0e-8;

  CreateDoubles(&computeData->res, dist_data.tamBl);
  CreateDoubles(&computeData->z, dist_data.tamBl);
  CreateDoubles(&computeData->d, dist_data.tamBl);
  CreateDoubles (&computeData->vec, dist_data.tamBl);
  CreateDoubles (&computeData->d_full, computeData->n);

  InitDoubles (computeData->vec, dist_data.tamBl, DZERO, DZERO); // x = 0
  InitDoubles (full_vec, computeData->n, DZERO, DZERO);          // full_x = 0

  computeData->iter = 0;

#ifdef ONLY_SYM
  ProdSymSparseMatrixVector (computeData->subm, full_vec, computeData->z); // z += A * full_x
//  mkl_dcsrsymv ("U", &n, mat.vval, mat.vptr, mat.vpos, vec, z);          // z = A * full_x
#else
  ProdSparseMatrixVector (computeData->subm, full_vec, computeData->z);    // z += A * full_x
#endif

  rcopy (&(dist_data.tamBl), subsol, &IONE, computeData->res, &IONE);                 // res = b
  raxpy (&(dist_data.tamBl), &DMONE, computeData->z, &IONE, computeData->res, &IONE); // res -= z
  //rcopy (&(computeData.subm.dim1), computeData.res, &IONE, &(computeData.d+computeData.displs_rows[myId]), &IONE); // d_full = res
  MPI_Allgatherv(computeData->res, dist_data.tamBl, MPI_DOUBLE, computeData->d_full,
                 computeData->dist_rows, computeData->displs_rows, MPI_DOUBLE, MPI_COMM_WORLD);
  rcopy (&(dist_data.tamBl), &(computeData->d_full[dist_data.ini]), &IONE, computeData->d, &IONE); // d = d_full[ini] to d_full[ini+tamBl]

  computeData->beta = rdot (&(dist_data.tamBl), computeData->res, &IONE, computeData->res, &IONE); // beta = res' * res
  MPI_Allreduce(MPI_IN_PLACE, &computeData->beta, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  computeData->tol = sqrt (computeData->beta); // tol = sqrt(beta) = norm (res)
}

/*
 * Main compute loop (conjugate gradient iterations)
 */
int compute(Compute_data *computeData, struct Dist_data *dist_data, int sm) {
  int IZERO = 0, IONE = 1;
  double DONE = 1.0, DMONE = -1.0, DZERO = 0.0;
  int state = MALL_NOT_STARTED;
  int ended_loop = 1;
  int cnt = 0;
  int reconfigure = 0, rec_iter = 500;

  computeData->maxiter = 1000;

  while ((computeData->iter < computeData->maxiter) && (computeData->tol > computeData->umbral)) {

    MPI_Allgatherv(computeData->d, dist_data->tamBl, MPI_DOUBLE, computeData->d_full,
                   computeData->dist_rows, computeData->displs_rows, MPI_DOUBLE, dist_data->comm); // d_full = Gather(d)

    // COMPUTATION
#ifdef ONLY_SYM
    ProdSymSparseMatrixVector (computeData->subm, computeData->d_full, computeData->z); // z += A * d_full
#else
    ProdSparseMatrixVector (computeData->subm, computeData->d_full, computeData->z);    // z += A * d_full
#endif
    computeData->rho = rdot (&(dist_data->tamBl), computeData->d, &IONE, computeData->z, &IONE); // rho = (d * z)
    MPI_Allreduce(MPI_IN_PLACE, &computeData->rho, 1, MPI_DOUBLE, MPI_SUM, dist_data->comm);     // Reduce(rho, SUM)

    computeData->rho = computeData->beta / computeData->rho; // rho = beta / aux
    raxpy (&(dist_data->tamBl), &computeData->rho, computeData->d, &IONE, computeData->vec, &IONE); // x += rho * d

    computeData->rho = -computeData->rho;
    raxpy (&(dist_data->tamBl), &computeData->rho, computeData->z, &IONE, computeData->res, &IONE); // res -= rho * z

    computeData->alpha = computeData->beta; // alpha = beta
    computeData->beta = rdot (&(dist_data->tamBl), computeData->res, &IONE, computeData->res, &IONE); // beta = res' * res
    MPI_Allreduce(MPI_IN_PLACE, &computeData->beta, 1, MPI_DOUBLE, MPI_SUM, dist_data->comm); // Reduce(beta, SUM)

    computeData->alpha = computeData->beta / computeData->alpha; // alpha = beta / alpha
    rscal (&(dist_data->tamBl), &computeData->alpha, computeData->d, &IONE);            // d = alpha * d
    raxpy (&(dist_data->tamBl), &DONE, computeData->res, &IONE, computeData->d, &IONE); // d += res

    //MPI_Allgatherv(computeData->d, dist_data->tamBl, MPI_DOUBLE, computeData->d_full,
    //               computeData->dist_rows, computeData->displs_rows, MPI_DOUBLE, dist_data->comm); // d_full = Gather(d)

    computeData->tol = sqrt (computeData->beta); // tol = sqrt(beta) = norm (res)
    computeData->iter++;
    if (computeData->iter == rec_iter) reconfigure = 1;
    if (reconfigure) {
      state = malleability_checkpoint();
      if ((state == MALL_COMPLETED && sm == 0) || state == MALL_ZOMBIE) {
        ended_loop = 0;
        break;
      } else if(state == MALL_COMPLETED) {
        reconfigure = 0;
        free_computeData(computeData, 0);
        update_dist_data(dist_data);
        dist_new(dist_data, computeData);
      }
    }
  }

#ifdef DEBUG
  if(dist_data->myId == ROOT) printf ("Ended loop\n");
#endif
  return ended_loop;
}
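/*
 * Reconfiguration flow inside compute(), summarised from the checks above:
 * once iter reaches rec_iter, malleability_checkpoint() is polled every
 * iteration. MALL_ZOMBIE (or MALL_COMPLETED when sm == 0) makes the process
 * leave the loop with ended_loop = 0; MALL_COMPLETED with sm != 0 means this
 * process continues in the new group, so the local data is freed, the
 * communicator data refreshed and the redistributed data picked up through
 * dist_new() before iterating again.
 */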
void dumb(Compute_data *computeData, struct Dist_data *dist_data) {
  int i;

  if(dist_data->myId == 0) printf("Vptr=");
  fflush(stdout); MPI_Barrier(dist_data->comm);
  for(i=0; i<dist_data->numP; i++) {
    if(dist_data->myId == i) {
      printf("%d, ", computeData->subm.vptr[dist_data->tamBl]); fflush(stdout);
    }
    MPI_Barrier(dist_data->comm);
  }
  if(dist_data->myId == 0) printf("\n");
  fflush(stdout); MPI_Barrier(dist_data->comm);

  if(dist_data->myId == 0) printf("Tol=");
  fflush(stdout); MPI_Barrier(dist_data->comm);
  for(i=0; i<dist_data->numP; i++) {
    if(dist_data->myId == i) {
      printf("%lf, ", computeData->tol); fflush(stdout);
    }
    MPI_Barrier(dist_data->comm);
  }
  if(dist_data->myId == 0) printf("\n");
  fflush(stdout); MPI_Barrier(dist_data->comm);

  if(dist_data->myId == 0) printf("Z[last]=");
  fflush(stdout); MPI_Barrier(dist_data->comm);
  for(i=0; i<dist_data->numP; i++) {
    if(dist_data->myId == i) {
      printf("%lf, ", computeData->z[dist_data->tamBl-1]); fflush(stdout);
    }
    MPI_Barrier(dist_data->comm);
  }
  if(dist_data->myId == 0) printf("\n");
  fflush(stdout); MPI_Barrier(dist_data->comm);

  if(dist_data->myId == 0) printf("D[last]=");
  fflush(stdout); MPI_Barrier(dist_data->comm);
  for(i=0; i<dist_data->numP; i++) {
    if(dist_data->myId == i) {
      printf("%lf, ", computeData->d[dist_data->tamBl-1]); fflush(stdout);
    }
    MPI_Barrier(dist_data->comm);
  }
  if(dist_data->myId == 0) printf("\n");
  fflush(stdout); MPI_Barrier(dist_data->comm);

  if(dist_data->myId == 0) printf("res[last]=");
  fflush(stdout); MPI_Barrier(dist_data->comm);
  for(i=0; i<dist_data->numP; i++) {
    if(dist_data->myId == i) {
      printf("%lf, ", computeData->res[dist_data->tamBl-1]); fflush(stdout);
    }
    MPI_Barrier(dist_data->comm);
  }
  if(dist_data->myId == 0) printf("\n");
  fflush(stdout); MPI_Barrier(dist_data->comm);

  if(dist_data->myId == 0) printf("Vec[last]=");
  fflush(stdout); MPI_Barrier(dist_data->comm);
  for(i=0; i<dist_data->numP; i++) {
    if(dist_data->myId == i) {
      printf("%lf, ", computeData->vec[dist_data->tamBl-1]); fflush(stdout);
    }
    MPI_Barrier(dist_data->comm);
  }
  if(dist_data->myId == 0) printf("\n");
  fflush(stdout); MPI_Barrier(dist_data->comm);
}

void free_computeData(Compute_data *computeData, int terminate) {
  if(computeData->res != NULL) {
    RemoveDoubles (&computeData->res);
  }
  if(computeData->z != NULL) {
    RemoveDoubles (&computeData->z);
  }
  if(computeData->d != NULL) {
    RemoveDoubles (&computeData->d);
  }
  if(computeData->vec != NULL) {
    RemoveDoubles (&computeData->vec);
  }
  if(computeData->d_full != NULL && terminate) {
    RemoveDoubles (&computeData->d_full);
  }
  if(computeData->subm.vptr != NULL) {
    RemoveSparseMatrix2 (&computeData->subm);
  }
  if(computeData->dist_rows != NULL) {
    RemoveInts (&computeData->dist_rows);
  }
  if(computeData->displs_rows != NULL) {
    RemoveInts (&computeData->displs_rows);
  }
  if(computeData->vlen != NULL) {
    RemoveInts (&computeData->vlen);
  }
}

/*
 * _____________________________________________________________________________________
 * ||                                                                                 ||
 * ||                                                                                 ||
 * ||                            DISTRIBUTION FUNCTIONS                               ||
 * ||                                                                                 ||
 * ||                                                                                 ||
 * \_____________________________________________________________________________________/
 */

/*
 * The following functions are all related to the distribution
 * of the data or the processes.
 */
/*
 * ========================================================================================
 * ========================================================================================
 * ========================PARENTS COMMUNICATION FUNCTIONS=================================
 * ========================================================================================
 * ========================================================================================
 */

/*
 * Called by the original (parent) set of processes: configures the
 * malleability module and registers the data that will be redistributed
 * to the child processes.
 */
int dist_old(struct Dist_data *dist_data, Compute_data *computeData, int num_children, int sm, int ss, int rm, int rs, int send_sync) {
  int phy_dist = 2;

  set_malleability_configuration(sm, ss, phy_dist, rm, rs);
  set_children_number(num_children);

  malleability_add_data(&(computeData->n), 1, MAL_INT, MAL_DATA_ALONE, 1, 1);
  malleability_add_data(&(computeData->iter), 1, MAL_INT, MAL_DATA_ALONE, 1, 1);
  malleability_add_data(&(computeData->tol), 1, MAL_DOUBLE, MAL_DATA_ALONE, 1, 1);
  malleability_add_data(&(computeData->beta), 1, MAL_DOUBLE, MAL_DATA_ALONE, 1, 1);
  malleability_add_data(&(computeData->umbral), 1, MAL_DOUBLE, MAL_DATA_ALONE, 1, 1);

  //malleability_add_data(computeData->d_full, computeData->n, MAL_DOUBLE, MAL_DATA_ALONE, 1, 1);
  malleability_add_data(computeData->d, computeData->n, MAL_DOUBLE, MAL_DATA_ALONE, 0, 1);
  malleability_add_data(computeData->vec, computeData->n, MAL_DOUBLE, MAL_DATA_ALONE, 0, 1);
  malleability_add_data(computeData->res, computeData->n, MAL_DOUBLE, MAL_DATA_ALONE, 0, 1);
  malleability_add_data(computeData->z, computeData->n, MAL_DOUBLE, MAL_DATA_ALONE, 0, 1);

  //FIXME The following values could be sent asynchronously
  malleability_add_data(computeData->vlen, computeData->n, MAL_INT, 1+MAL_DATA_INDEPENDENT, 0, send_sync);
  malleability_add_data(computeData->subm.vpos, computeData->n, MAL_INT, 1+MAL_DATA_DEPENDENT, 0, send_sync);
  malleability_add_data(computeData->subm.vval, computeData->n, MAL_DOUBLE, 1+MAL_DATA_DEPENDENT, 0, send_sync);
}
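/*
 * The registration order above is a contract with dist_new(): the five scalars
 * (n, iter, tol, beta, umbral) are retrieved there by fixed indices 0-4, and
 * the remaining arrays are retrieved in the same relative order they were
 * added here (d, vec, res, z, then vlen, subm.vpos, subm.vval). Reordering
 * these calls without updating the entry counters in dist_new() would hand
 * the children the wrong buffers.
 */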
/*
 * ========================================================================================
 * ========================================================================================
 * ========================CHILDREN COMMUNICATION FUNCTIONS================================
 * ========================================================================================
 * ========================================================================================
 */

/*
 * Called by a spawned (child) set of processes.
 *
 * The children first obtain from the parents the initial information
 * needed to know the size of their vectors and matrix, as well as how
 * much data they will receive from each parent.
 *
 * After that, they prepare to receive the data from the parents.
 */
void dist_new(struct Dist_data *dist_data, Compute_data *computeData) {
  int IONE = 1, i, is_synch;
  size_t entry, entries;
  void *value = NULL;
  is_synch = 1;
  entry = 0;

  malleability_get_data(&value, 0, 1, 1);
  computeData->n = *((int *)value);
  malleability_get_data(&value, 1, 1, 1);
  computeData->iter = *((int *)value);
  malleability_get_data(&value, 2, 1, 1);
  computeData->tol = *((double *)value);
  malleability_get_data(&value, 3, 1, 1);
  computeData->beta = *((double *)value);
  malleability_get_data(&value, 4, 1, 1);
  computeData->umbral = *((double *)value);

  //malleability_get_data(&value, 5, 1, 1);
  //computeData->d_full = ((double *)value);
  malleability_get_data(&value, entry++, 0, 1);
  computeData->d = ((double *)value);
  malleability_get_data(&value, entry++, 0, 1);
  computeData->vec = ((double *)value);
  malleability_get_data(&value, entry++, 0, 1);
  computeData->res = ((double *)value);
  malleability_get_data(&value, entry++, 0, 1);
  computeData->z = ((double *)value);

  get_dist(computeData->n, dist_data->myId, dist_data->numP, dist_data);
  get_rows_dist(computeData, dist_data->numP, computeData->n);
  //CreateDoubles(&computeData->d, dist_data->tamBl);
  //rcopy (&(dist_data->tamBl), &(computeData->d_full[dist_data->ini]), &IONE, computeData->d, &IONE); // d = d_full[ini] to d_full[ini+tamBl]
  CreateDoubles(&computeData->d_full, computeData->n);
  rcopy (&(dist_data->tamBl), computeData->d, &IONE, &(computeData->d_full[dist_data->ini]), &IONE); // d_full[ini] to d_full[ini+tamBl] = d

  malleability_get_entries(&entries, 0, 0); //Get if there is any asynch data to recover
  if(entries) { is_synch=0; entry=0; }

  malleability_get_data(&value, entry++, 0, is_synch);
  computeData->vlen = ((int *)value);

  CreateSparseMatrixVptr(&(computeData->subm), dist_data->tamBl, computeData->n, 0);
  computeData->subm.vptr[0] = 0;
  for(i=0; i<dist_data->tamBl; i++) {
    computeData->subm.vptr[i+1] = computeData->vlen[i];
  }
  TransformLengthtoHeader(computeData->subm.vptr, computeData->subm.dim1); // The array is converted from vlen to vptr

  malleability_get_data(&value, entry++, 0, is_synch);
  computeData->subm.vpos = ((int *)value);
  malleability_get_data(&value, entry++, 0, is_synch);
  computeData->subm.vval = ((double *)value);
}

void update_dist_data(struct Dist_data *dist_data) {
  int myId, numP;
  get_malleability_user_comm(&(dist_data->comm));
  MPI_Comm_size(dist_data->comm, &numP);
  MPI_Comm_rank(dist_data->comm, &myId);
  dist_data->myId = myId;
  dist_data->numP = numP;
}

/*
 * ========================================================================================
 * ========================================================================================
 * ================================DISTRIBUTION FUNCTIONS==================================
 * ========================================================================================
 * ========================================================================================
 */

/*
 * For the given process Id and total number of processes, computes the
 * number of rows (tamBl) and the row range [ini, fin) that the process
 * will work with.
 */
void get_dist(int total_r, int id, int numP, struct Dist_data *dist_data) {
  int rem;

  dist_data->tot_r = total_r;
  dist_data->tamBl = total_r / numP;
  rem = total_r % numP;

  if(id < rem) { // First subgroup
    dist_data->ini = id * dist_data->tamBl + id;
    dist_data->fin = (id+1) * dist_data->tamBl + (id+1);
  } else { // Second subgroup
    dist_data->ini = id * dist_data->tamBl + rem;
    dist_data->fin = (id+1) * dist_data->tamBl + rem;
  }

  if(dist_data->fin > total_r) {
    dist_data->fin = total_r;
  }
  if(dist_data->ini > dist_data->fin) {
    dist_data->ini = dist_data->fin;
  }

  dist_data->tamBl = dist_data->fin - dist_data->ini;
}
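/*
 * Worked example (illustrative values only): total_r = 11 and numP = 4 give
 * tamBl = 2 and rem = 3, so
 *
 *   id 0: ini = 0, fin = 3, tamBl = 3   (first subgroup, one extra row)
 *   id 1: ini = 3, fin = 6, tamBl = 3
 *   id 2: ini = 6, fin = 9, tamBl = 3
 *   id 3: ini = 9, fin = 11, tamBl = 2  (second subgroup)
 *
 * i.e. the first rem processes receive one row more than the rest and the
 * ranges cover [0, total_r) without gaps.
 */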
/*
 * For a given process Id, computes how many elements the calling
 * process (myId) has to send to or receive from that process.
 */
void set_counts(int id, int numP, struct Dist_data data_dist, int *sendcounts) {
  struct Dist_data other;
  int biggest_ini, smallest_end, tot_rows;

  get_dist(data_dist.tot_r, id, numP, &other);

  // If the row ranges do not overlap, move on to the next process
  if(data_dist.ini >= other.fin || data_dist.fin <= other.ini) {
    return;
  }

  // Take the largest starting row of the two processes
  if(data_dist.ini > other.ini) {
    biggest_ini = data_dist.ini;
  } else {
    biggest_ini = other.ini;
  }

  // Take the smallest ending row of the two processes
  if(data_dist.fin < other.fin) {
    smallest_end = data_dist.fin;
  } else {
    smallest_end = other.fin;
  }

  sendcounts[id] = smallest_end - biggest_ini; // Number of elements to send to / receive from process Id
}

/*
 * For a process of one group, determines the range of ranks of the
 * other group it has to send data to or receive data from.
 *
 * Returns the first identifier and the last one (exclusive) to
 * communicate with.
 */
void getIds_intercomm(struct Dist_data dist_data, int numP_other, int **idS) {
  int idI, idE;
  int tamOther = dist_data.tot_r / numP_other;
  int remOther = dist_data.tot_r % numP_other;
  int middle = (tamOther + 1) * remOther;

  if(middle > dist_data.ini) { // First subgroup
    idI = dist_data.ini / (tamOther + 1);
  } else { // Second subgroup
    idI = ((dist_data.ini - middle) / tamOther) + remOther;
  }

  if(middle >= dist_data.fin) { // First subgroup
    idE = dist_data.fin / (tamOther + 1);
    idE = (dist_data.fin % (tamOther + 1) > 0 && idE+1 <= numP_other) ? idE+1 : idE;
  } else { // Second subgroup
    idE = ((dist_data.fin - middle) / tamOther) + remOther;
    idE = ((dist_data.fin - middle) % tamOther > 0 && idE+1 <= numP_other) ? idE+1 : idE;
  }

  //free(*idS);
  CreateInts(idS, 2);
  (*idS)[0] = idI;
  (*idS)[1] = idE;
}

void print_global_results() {
  size_t i;
  double sp_time, sy_time, asy_time, mall_time, global_time;

  retrieve_results(&sp_time, &sy_time, &asy_time, &mall_time, &global_time);
  global_time = MPI_Wtime() - global_time;
  printf("T_spawn: %lf", sp_time);
  printf("\nT_SR: %lf", sy_time);
  printf("\nT_AR: %lf", asy_time);
  printf("\nT_Malleability: %lf", mall_time);
  printf("\nT_total: %lf\n", global_time);
}

/*
  double starttime, endtime, total, res;
  MPI_Barrier(MPI_COMM_WORLD);
  starttime = MPI_Wtime();

  endtime = MPI_Wtime();
  total = endtime - starttime;
  MPI_Reduce(&total, &res, 1, MPI_DOUBLE, MPI_MAX, ROOT, MPI_COMM_WORLD);
  if(dist_data.myId == ROOT) {printf("Tiempo BCAST PADRE %f\n", total); fflush(stdout);}
*/
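/*
 * Worked example for getIds_intercomm (illustrative values only): a process
 * owning rows [4, 7) of tot_r = 10 that must exchange data with a group of
 * numP_other = 2 processes gets tamOther = 5, remOther = 0, middle = 0, so
 * idI = 0 and idE = 2. It therefore communicates with ranks 0 and 1 of the
 * other group, since its rows overlap both of their ranges [0, 5) and [5, 10).
 */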