Commit 9135f28c authored by iker_martin's avatar iker_martin

First commit. Added initial code for BiCGStab.

parent fbf4338a
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <math.h>
#include <mkl_blas.h>
#include <mpi.h>
#include <hb_io.h>
#include <vector>
#include "reloj.h"
#include "ScalarVectors.h"
#include "SparseProduct.h"
#include "ToolsMPI.h"
#include "matrix.h"
#include "common.h"
// ================================================================================
#define DIRECT_ERROR 1
#define PRECOND 1
// #define SPMV_OPTIMIZED 1
#ifdef SPMV_OPTIMIZED
#define COLL_P2P_SPMV 0
#endif
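/*
 BiCGStab solves A*x = b, applying a Jacobi (diagonal) preconditioner when PRECOND is set.
 The loop below implements the classical recurrences, with s = A*p_hat and y = A*q_hat:
   alpha = rho / <r0, s>            q  = r - alpha * s
   omega = <q, y> / <y, y>          x += alpha * p_hat + omega * q_hat
   r = q - omega * y                beta = (alpha / omega) * (<r0, r> / rho)
   p = r + beta * (p - omega * s)
 The iteration stops when ||r|| / ||r0|| <= umbral or after maxiter iterations.
 Each dot product is completed with an MPI_Allreduce, since the vectors are distributed
 by blocks of rows (sizes and dspls describe the partition).
*/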
void BiCGStab (SparseMatrix mat, double *x, double *b, int *sizes, int *dspls, int myId) {
int size = mat.dim2, sizeR = mat.dim1;
int IONE = 1;
double DONE = 1.0, DMONE = -1.0, DZERO = 0.0;
int n, n_dist, iter, maxiter, nProcs;
double beta, tol, tol0, alpha, umbral, rho, omega, tmp;
double *s = NULL, *q = NULL, *r = NULL, *p = NULL, *r0 = NULL, *y = NULL, *p_hat = NULL, *q_hat = NULL;
double *aux = NULL;
double t1, t2, t3, t4;
double reduce[2];
#if PRECOND
int i, *posd = NULL;
double *diags = NULL;
#endif
MPI_Comm_size(MPI_COMM_WORLD, &nProcs);
n = size; n_dist = sizeR; maxiter = 16 * size; umbral = 1.0e-8;
CreateDoubles (&s, n_dist);
CreateDoubles (&q, n_dist);
CreateDoubles (&r, n_dist);
CreateDoubles (&r0, n_dist);
CreateDoubles (&p, n_dist);
CreateDoubles (&y, n_dist);
#if DIRECT_ERROR
// init exact solution
double *res_err = NULL, *x_exact = NULL;
CreateDoubles (&x_exact, n_dist);
CreateDoubles (&res_err, n_dist);
InitDoubles (x_exact, n_dist, DONE, DZERO);
#endif // DIRECT_ERROR
#if PRECOND
CreateInts (&posd, n_dist);
CreateDoubles (&p_hat, n_dist);
CreateDoubles (&q_hat, n_dist);
CreateDoubles (&diags, n_dist);
GetDiagonalSparseMatrix2 (mat, dspls[myId], diags, posd);
#pragma omp parallel for
for (i=0; i<n_dist; i++)
diags[i] = DONE / diags[i];
#endif
CreateDoubles (&aux, n);
#ifdef SPMV_OPTIMIZED
int *permP = NULL, *ipermP = NULL;
int *vdspP = NULL, *vdimP = NULL, *vdspR = NULL, *vdimR = NULL;
double *vecP = NULL;
MPI_Datatype *vectDatatypeP = NULL, *vectDatatypeR = NULL;
CreateInts (&ipermP, size);
CreateInts (&vdimP, nProcs); CreateInts (&vdspP, nProcs + 1);
CreateInts (&vdimR, nProcs); CreateInts (&vdspR, nProcs + 1);
vectDatatypeP = (MPI_Datatype *) malloc (nProcs * sizeof (MPI_Datatype));
vectDatatypeR = (MPI_Datatype *) malloc (nProcs * sizeof (MPI_Datatype));
createAlltoallwStruct (COLL_P2P_SPMV, MPI_COMM_WORLD, mat, sizes, dspls, vdimP,
vdspP, &aux, &permP, ipermP, vdimR, vdspR, vectDatatypeP, vectDatatypeR);
// Code required before the loop
PermuteInts (mat.vpos, ipermP, mat.vptr[mat.dim1]);
#endif
iter = 0;
#ifdef SPMV_OPTIMIZED
joinDistributeVectorSPMV (COLL_P2P_SPMV, MPI_COMM_WORLD, x, vecP, vdimP, vdspP,
vdimR, vdspR, vectDatatypeP, vectDatatypeR);
InitDoubles (s, sizeR, DZERO, DZERO);
ProdSparseMatrixVectorByRows (mat, 0, vecP, s); // s = A * x
#else
MPI_Allgatherv (x, sizeR, MPI_DOUBLE, aux, sizes, dspls, MPI_DOUBLE, MPI_COMM_WORLD);
InitDoubles (s, sizeR, DZERO, DZERO);
ProdSparseMatrixVectorByRows (mat, 0, aux, s); // s = A * x
#endif
dcopy (&n_dist, b, &IONE, r, &IONE); // r = b
daxpy (&n_dist, &DMONE, s, &IONE, r, &IONE); // r -= s
dcopy (&n_dist, r, &IONE, p, &IONE); // p = r
dcopy (&n_dist, r, &IONE, r0, &IONE); // r0 = r
// compute tolerance and <r0,r0>
rho = ddot (&n_dist, r, &IONE, r, &IONE);
MPI_Allreduce (MPI_IN_PLACE, &rho, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
tol0 = sqrt (rho);
tol = tol0;
#if DIRECT_ERROR
// compute direct error
double direct_err;
dcopy (&n_dist, x_exact, &IONE, res_err, &IONE); // res_err = x_exact
daxpy (&n_dist, &DMONE, x, &IONE, res_err, &IONE); // res_err -= x
// compute inf norm
direct_err = norm_inf(n_dist, res_err);
MPI_Allreduce(MPI_IN_PLACE, &direct_err, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
// // compute euclidean norm
// direct_err = ddot (&n_dist, res_err, &IONE, res_err, &IONE); // direct_err = res_err' * res_err
// MPI_Allreduce(MPI_IN_PLACE, &direct_err, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
// direct_err = sqrt(direct_err);
#endif // DIRECT_ERROR
MPI_Barrier(MPI_COMM_WORLD);
if (myId == 0)
reloj (&t1, &t2);
while ((iter < maxiter) && (tol > umbral)) {
#if PRECOND
VvecDoubles (DONE, diags, p, DZERO, p_hat, n_dist); // p_hat = D^-1 * p
#else
p_hat = p;
#endif
#ifdef SPMV_OPTIMIZED
joinDistributeVectorSPMV (COLL_P2P_SPMV, MPI_COMM_WORLD, p_hat, vecP, vdimP,
vdspP, vdimR, vdspR, vectDatatypeP, vectDatatypeR);
InitDoubles (s, sizeR, DZERO, DZERO);
ProdSparseMatrixVectorByRows (mat, 0, vecP, s); // s = A * p
#else
MPI_Allgatherv (p_hat, sizeR, MPI_DOUBLE, aux, sizes, dspls, MPI_DOUBLE, MPI_COMM_WORLD);
InitDoubles (s, sizeR, DZERO, DZERO);
ProdSparseMatrixVectorByRows (mat, 0, aux, s); // s = A * p
#endif
if (myId == 0)
#if DIRECT_ERROR
printf ("%d \t %g \t %g \t %g \n", iter, tol, umbral, direct_err);
#else
printf ("%d \t %g \n", iter, tol);
#endif // DIRECT_ERROR
alpha = ddot (&n_dist, r0, &IONE, s, &IONE);
MPI_Allreduce (MPI_IN_PLACE, &alpha, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
alpha = rho / alpha;
dcopy (&n_dist, r, &IONE, q, &IONE); // q = r
tmp = -alpha;
daxpy (&n_dist, &tmp, s, &IONE, q, &IONE); // q = r - alpha * s;
// second spmv
#if PRECOND
VvecDoubles (DONE, diags, q, DZERO, q_hat, n_dist); // q_hat = D^-1 * q
#else
q_hat = q;
#endif
#ifdef SPMV_OPTIMIZED
joinDistributeVectorSPMV (COLL_P2P_SPMV, MPI_COMM_WORLD, q_hat, vecP, vdimP,
vdspP, vdimR, vdspR, vectDatatypeP, vectDatatypeR);
InitDoubles (y, sizeR, DZERO, DZERO);
ProdSparseMatrixVectorByRows (mat, 0, vecP, y); // y = A * q
#else
MPI_Allgatherv (q_hat, sizeR, MPI_DOUBLE, aux, sizes, dspls, MPI_DOUBLE, MPI_COMM_WORLD);
InitDoubles (y, sizeR, DZERO, DZERO);
ProdSparseMatrixVectorByRows (mat, 0, aux, y); // y = A * q
#endif
// omega = <q, y> / <y, y>
reduce[0] = ddot (&n_dist, q, &IONE, y, &IONE);
reduce[1] = ddot (&n_dist, y, &IONE, y, &IONE);
MPI_Allreduce (MPI_IN_PLACE, reduce, 2, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
omega = reduce[0] / reduce[1];
// x+1 = x + alpha * p_hat + omega * q_hat
daxpy (&n_dist, &alpha, p_hat, &IONE, x, &IONE);
daxpy (&n_dist, &omega, q_hat, &IONE, x, &IONE);
// r+1 = q - omega * y
dcopy (&n_dist, q, &IONE, r, &IONE); // r = q
tmp = -omega;
daxpy (&n_dist, &tmp, y, &IONE, r, &IONE); // r = q - omega * y;
// rho = <r0, r+1> and tolerance
reduce[0] = ddot (&n_dist, r0, &IONE, r, &IONE);
reduce[1] = ddot (&n_dist, r, &IONE, r, &IONE);
MPI_Allreduce (MPI_IN_PLACE, reduce, 2, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
tmp = reduce[0];
tol = sqrt (reduce[1]) / tol0;
// beta = (alpha / omega) * <r0, r+1> / <r0, r>
beta = (alpha / omega) * (tmp / rho);
rho = tmp;
// p+1 = r+1 + beta * (p - omega * s)
tmp = -omega;
daxpy (&n_dist, &tmp, s, &IONE, p, &IONE); // p -= omega * s
dscal (&n_dist, &beta, p, &IONE); // p = beta * p
daxpy (&n_dist, &DONE, r, &IONE, p, &IONE); // p += r
#if DIRECT_ERROR
// compute direct error
dcopy (&n_dist, x_exact, &IONE, res_err, &IONE); // res_err = x_exact
daxpy (&n_dist, &DMONE, x, &IONE, res_err, &IONE); // res_err -= x
// compute inf norm
direct_err = norm_inf(n_dist, res_err);
MPI_Allreduce(MPI_IN_PLACE, &direct_err, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
// // compute euclidean norm
// direct_err = ddot (&n_dist, res_err, &IONE, res_err, &IONE);
// MPI_Allreduce(MPI_IN_PLACE, &direct_err, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
// direct_err = sqrt(direct_err);
#endif // DIRECT_ERROR
iter++;
}
MPI_Barrier(MPI_COMM_WORLD);
if (myId == 0)
reloj (&t3, &t4);
#ifdef SPMV_OPTIMIZED
// Code required after the loop
PermuteInts (mat.vpos, permP, mat.vptr[mat.dim1]);
// Freeing memory for Permutation
free (vectDatatypeR); vectDatatypeR = NULL; free (vectDatatypeP); vectDatatypeP = NULL;
RemoveDoubles (&vecP); RemoveInts (&permP);
RemoveInts (&vdspR); RemoveInts (&vdimR); RemoveInts (&vdspP); RemoveInts (&vdimP);
RemoveInts (&ipermP);
#endif
if (myId == 0) {
printf ("Size: %d \n", n);
printf ("Iter: %d \n", iter);
printf ("Tol: %g \n", tol);
printf ("Time_loop: %20.10e\n", (t3-t1));
printf ("Time_iter: %20.10e\n", (t3-t1)/iter);
}
RemoveDoubles (&aux); RemoveDoubles (&s); RemoveDoubles (&q);
RemoveDoubles (&r); RemoveDoubles (&p); RemoveDoubles (&r0); RemoveDoubles (&y);
#if DIRECT_ERROR
RemoveDoubles (&x_exact); RemoveDoubles (&res_err);
#endif
#if PRECOND
RemoveDoubles (&diags); RemoveInts (&posd);
RemoveDoubles (&p_hat); RemoveDoubles (&q_hat);
#endif
}
/*********************************************************************************/
int main (int argc, char **argv) {
int dim;
double *sol1 = NULL, *sol2 = NULL;
int index = 0, indexL = 0;
SparseMatrix mat = {0, 0, NULL, NULL, NULL}, sym = {0, 0, NULL, NULL, NULL};
int root = 0, myId, nProcs;
int dimL, dspL, *vdimL = NULL, *vdspL = NULL;
SparseMatrix matL = {0, 0, NULL, NULL, NULL};
double *sol1L = NULL, *sol2L = NULL;
int mat_from_file, nodes, size_param, stencil_points;
mat_from_file = atoi(argv[2]);
if (argc >= 6) {
// Parameters of the generated Poisson matrix, only read when they are provided
nodes = atoi(argv[3]);
size_param = atoi(argv[4]);
stencil_points = atoi(argv[5]);
}
/***************************************/
MPI_Init (&argc, &argv);
// Definition of the variables nProcs and myId
MPI_Comm_size(MPI_COMM_WORLD, &nProcs);
MPI_Comm_rank(MPI_COMM_WORLD, &myId);
// root = nProcs-1;
root = 0;
/***************************************/
printf ("A\n");
CreateInts (&vdimL, nProcs); CreateInts (&vdspL, nProcs+1); // vdspL also holds the final header entry filled by DistributeMatrix
if(mat_from_file) {
if (myId == root) {
// Creating the matrix
ReadMatrixHB (argv[1], &sym);
TransposeSparseMatrices (sym, 0, &mat, 0);
dim = mat.dim1;
}
// Distributing the matrix
dim = DistributeMatrix (mat, index, &matL, indexL, vdimL, vdspL, root, MPI_COMM_WORLD);
dimL = vdimL[myId]; dspL = vdspL[myId];
printf ("B\n");
}
else {
dim = size_param * size_param * size_param;
int divL, rstL, i;
divL = (dim / nProcs); rstL = (dim % nProcs);
for (i=0; i<nProcs; i++) vdimL[i] = divL + (i < rstL);
vdspL[0] = 0; for (i=1; i<nProcs; i++) vdspL[i] = vdspL[i-1] + vdimL[i-1];
dimL = vdimL[myId]; dspL = vdspL[myId];
int band_width = size_param * (size_param + 1) + 1;
band_width = 100 * nodes;
long nnz_here = ((long) (stencil_points + 2 * band_width)) * dimL;
printf ("dimL: %d, nodes: %d, size_param: %d, band_width: %d, stencil_points: %d, nnz_here: %ld\n",
dimL, nodes, size_param, band_width, stencil_points, nnz_here);
allocate_matrix(dimL, dim, nnz_here, &matL);
generate_Poisson3D_filled(&matL, size_param, stencil_points, band_width, dspL, dimL, dim);
// To generate ill-conditioned matrices
// double factor = 1.0e6;
// ScaleFirstRowCol(matL, dspL, dimL, myId, root, factor);
}
MPI_Barrier(MPI_COMM_WORLD);
// Creating the vectors
CreateDoubles (&sol1, dim);
CreateDoubles (&sol2, dim);
CreateDoubles (&sol1L, dimL);
CreateDoubles (&sol2L, dimL);
InitDoubles (sol2, dim, 0.0, 0.0);
InitDoubles (sol1L, dimL, 0.0, 0.0);
InitDoubles (sol2L, dimL, 0.0, 0.0);
/***************************************/
printf ("C\n");
int IONE = 1;
double beta = 1.0 / sqrt(dim);
if(mat_from_file) {
// compute b = A * x_c, x_c = 1/sqrt(nbrows)
InitDoubles (sol1, dim, 1.0, 0.0);
ProdSparseMatrixVectorByRows (matL, 0, sol1, sol1L); // s = A * x
dscal (&dimL, &beta, sol1L, &IONE); // s = beta * s
} else {
InitDoubles (sol1, dim, 0.0, 0.0);
int k=0;
int *vptrM = matL.vptr;
// b = A * ones: accumulate the sum of each local row in sol1L
for (int i=0; i < matL.dim1; i++) {
for(int j=vptrM[i]; j<vptrM[i+1]; j++) {
sol1L[k] += matL.vval[j];
}
k++;
}
}
printf ("D\n");
MPI_Scatterv (sol2, vdimL, vdspL, MPI_DOUBLE, sol2L, dimL, MPI_DOUBLE, root, MPI_COMM_WORLD);
printf ("E\n");
BiCGStab (matL, sol2L, sol1L, vdimL, vdspL, myId);
printf ("F\n");
// Error computation ||b-Ax||
// if(mat_from_file) {
MPI_Allgatherv (sol2L, dimL, MPI_DOUBLE, sol2, vdimL, vdspL, MPI_DOUBLE, MPI_COMM_WORLD);
InitDoubles (sol2L, dimL, 0, 0);
ProdSparseMatrixVectorByRows (matL, 0, sol2, sol2L);
double DMONE = -1.0;
daxpy (&dimL, &DMONE, sol2L, &IONE, sol1L, &IONE);
beta = ddot (&dimL, sol1L, &IONE, sol1L, &IONE);
MPI_Allreduce (MPI_IN_PLACE, &beta, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
// } else {
// // case with x_exact = {1.0}
// for (int i=0; i<dimL; i++)
// sol2L[i] -= 1.0;
// beta = ddot (&dimL, sol2L, &IONE, sol2L, &IONE);
// }
beta = sqrt(beta);
if (myId == 0)
printf ("Error: %20.10e\n", beta);
/***************************************/
// Freeing memory
RemoveDoubles (&sol1);
RemoveDoubles (&sol2);
RemoveDoubles (&sol1L);
RemoveDoubles (&sol2L);
RemoveInts (&vdspL); RemoveInts (&vdimL);
if (myId == root) {
RemoveSparseMatrix (&mat);
RemoveSparseMatrix (&sym);
}
MPI_Finalize ();
return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <ScalarVectors.h>
/*********************************************************************************/
void CreateInts (int **vint, int dim) {
if ((*vint = (int *) malloc (sizeof(int)*dim)) == NULL)
{ printf ("Memory Error (CreateInts(%d))\n", dim); exit (1); }
}
void RemoveInts (int **vint) {
if (*vint != NULL) { free (*vint); *vint = NULL; }
}
void InitInts (int *vint, int dim, int frst, int incr) {
int i, *p1 = vint, num = frst;
for (i=0; i<dim; i++)
{ *(p1++) = num; num += incr; }
}
void CopyInts (int *src, int *dst, int dim) {
memmove (dst, src, sizeof(int) * dim);
}
void CopyShiftInts (int *src, int *dst, int dim, int shft) {
int i, *p1 = src, *p2 = dst;
if (shft == 0)
CopyInts (src, dst, dim);
else
for (i=0; i<dim; i++)
*(p2++) = *(p1++) + shft;
}
void TransformLengthtoHeader (int *vint, int dim) {
int i, *pi = vint;
for (i=0; i<dim; i++) { *(pi+1) += *pi; pi++; }
}
void TransformHeadertoLength (int *vint, int dim) {
int i, *pi = vint+dim;
for (i=dim; i>0; i--) { *(pi) -= *(pi-1); pi--; }
}
void ComputeHeaderfromLength (int *len, int *head, int dim) {
int i, *pi1 = len, *pi2 = head;
for (i=0; i<dim; i++) { *(pi2+1) = (*pi2) +(*(pi1++)); pi2++; }
}
void ComputeLengthfromHeader (int *head, int *len, int dim) {
int i, *pi1 = head, *pi2 = len;
for (i=0; i<dim; i++) { *(pi2++) = (*(pi1+1)) -(*pi1); pi1++; }
}
int AddInts (int *vint, int dim) {
int i, *pi = vint, aux = 0;
for (i=0; i<dim; i++) {
aux += *pi; pi++;
}
return aux;
}
// The permutation defined by perm is applied on vec, whose size is dim.
void PermuteInts (int *vec, int *perm, int dim) {
int i, *pi = vec;
for (i=0; i<dim; i++) { *pi = perm[*pi]; pi++; }
}
// Compute the inverse of perm, and store it in iperm, whose size is dim.
void ComputeInvPermutation (int *perm, int *iperm, int dim) {
int i, *pi1 = perm;
for (i=0; i<dim; i++) { iperm[*(pi1++)] = i; }
}
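// Example: with perm = {2, 0, 1}, ComputeInvPermutation yields iperm = {1, 2, 0}, and
// PermuteInts applied with perm to vec = {0, 2, 2, 1} relabels it to {2, 1, 1, 0}.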
// Scale by scal the elements of vint, whose size is dim.
void ScaleInts (int *vint, int scal, int dim) {
int i;
int *pi = vint;
for (i=0; i<dim; i++)
*(pi++) *= scal;
}
/*********************************************************************************/
void CreateDoubles (double **vdbl, int dim) {
if ((*vdbl = (double *) malloc (sizeof(double)*dim)) == NULL)
{ printf ("Memory Error (CreateDoubles(%d))\n", dim); exit (1); }
}
void RemoveDoubles (double **vdbl) {
if (*vdbl != NULL) { free (*vdbl); *vdbl = NULL; }
}
void InitDoubles (double *vdbl, int dim, double frst, double incr) {
int i;
double *pd = vdbl, num = frst;
for (i=0; i<dim; i++)
{ *(pd++) = num; num += incr; }
}
void InitRandDoubles (double *vdbl, int dim, double frst, double last) {
int i;
double *pd = vdbl, size = last - frst;
for (i=0; i<dim; i++)
{ *(pd++) = frst + (size * (rand() / (RAND_MAX + 1.0))); }
}
void CopyDoubles (double *src, double *dst, int dim) {
memmove (dst, src, sizeof(double) * dim);
}
void ScaleDoubles (double *vdbl, double scal, int dim) {
int i;
double *pd = vdbl;
for (i=0; i<dim; i++)
*(pd++) *= scal;
}
double DotDoubles (double *vdbl1, double *vdbl2, int dim) {
int i;
double *pd1 = vdbl1, *pd2 = vdbl2, res = 0.0;
for (i=0; i<dim; i++)
res += (*(pd2++)) * (*(pd1++));
return res;
}
void VvecDoubles (double alfa, double *src1, double *src2, double beta, double *dst, int dim) {
int i;
for (i = 0; i < dim; i++) {
//dst[i] = (beta * dst[i]) + (alfa * src1[i] * src2[i]);
double tmp = alfa * src1[i] * src2[i];
dst[i] = fma(beta, dst[i], tmp);
}
}
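// Note: BiCGStab applies the Jacobi preconditioner with this routine, e.g.
// VvecDoubles (1.0, diags, p, 0.0, p_hat, n) computes p_hat = D^-1 * p elementwise.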
/*********************************************************************************/
/*********************************************************************************/
extern void CreateInts (int **vint, int dim);
extern void RemoveInts (int **vint);
extern void InitInts (int *vint, int dim, int frst, int incr);
extern void CopyInts (int *src, int *dst, int dim);
extern void CopyShiftInts (int *src, int *dst, int dim, int shft);
extern void TransformLengthtoHeader (int *vint, int dim);
extern void TransformHeadertoLength (int *vint, int dim);
extern void ComputeHeaderfromLength (int *len, int *head, int dim);
extern void ComputeLengthfromHeader (int *head, int *len, int dim);
extern int AddInts (int *vint, int dim);
// The permutation defined by perm is applied on vec, whose size is dim.
extern void PermuteInts (int *vec, int *perm, int dim);
// Compute the inverse of perm, and store it in iperm, whose size is dim.
extern void ComputeInvPermutation (int *perm, int *iperm, int dim);
// Scale by scal the elements of vint, whose size is dim.
extern void ScaleInts (int *vint, int scal, int dim);
/*********************************************************************************/
extern void CreateDoubles (double **vdbl, int dim);
extern void RemoveDoubles (double **vdbl);
extern void InitDoubles (double *vdbl, int dim, double frst, double incr);
extern void InitRandDoubles (double *vdbl, int dim, double frst, double last);
extern void CopyDoubles (double *src, double *dst, int dim);
extern void ScaleDoubles (double *vdbl, double scal, int dim);
extern double DotDoubles (double *vdbl1, double *vdbl2, int dim);
extern void VvecDoubles (double alfa, double *src1, double *src2, double beta, double *dst, int dim);
/*********************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
/// #include "InputOutput.h"
#include "ScalarVectors.h"
#include "hb_io.h"
#include "SparseProduct.h"
/*********************************************************************************/
// This routine creates a sparseMatrix from the following parameters
// * numR defines the number of rows
// * numC defines the number of columns
// * numE defines the number of nonzero elements
// * msr indicates whether the MSR format is used for the sparse matrix
// If msr is active, numE doesn't include the diagonal elements
// The parameter index indicates if 0-indexing or 1-indexing is used.
void CreateSparseMatrix (ptr_SparseMatrix p_spr, int index, int numR, int numC, int numE, int msr) {
// printf (" index = %d , numR = %d , numC = %d , numE = %d\n", index, numR, numC, numE);
// The scalar components of the structure are initiated
p_spr->dim1 = numR; p_spr->dim2 = numC;
// Only one malloc is made for the vectors of indices
CreateInts (&(p_spr->vptr), numE+numR+1);
// The first component of the vectors depends on the used format
*(p_spr->vptr) = ((msr)? (numR+1): 0) + index;
p_spr->vpos = p_spr->vptr + ((msr)? 0: (numR+1));
// The number of nonzero elements depends on the format used
CreateDoubles (&(p_spr->vval), numE+(numR+1)*msr);
}
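/*
 Illustrative sketch (CSR, 0-indexing): building the 3x3 matrix [4 0 1; 0 3 0; 2 0 5]
   SparseMatrix spr;
   CreateSparseMatrix (&spr, 0, 3, 3, 5, 0);      // a single malloc holds vptr and vpos
   int    vptr[] = {0, 2, 3, 5};                  // row headers
   int    vpos[] = {0, 2, 1, 0, 2};               // column indices
   double vval[] = {4.0, 1.0, 3.0, 2.0, 5.0};     // nonzero values
   CopyInts (vptr, spr.vptr, 4); CopyInts (vpos, spr.vpos, 5); CopyDoubles (vval, spr.vval, 5);
*/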
// This routine frees the memory related to matrix spr
void RemoveSparseMatrix (ptr_SparseMatrix spr) {
// First the scalar components are reset
spr->dim1 = -1; spr->dim2 = -1;
// The vectors are freed (vpos shares the allocation made for vptr, so only vptr and vval are freed)
RemoveInts (&(spr->vptr)); RemoveDoubles (&(spr->vval));
}
/*********************************************************************************/
// This routine creates the sparse matrix dst from the symmetric matrix src.
// The parameters indexS and indexD indicate, respectively, if 0-indexing or 1-indexing is used
// to store the sparse matrices.
void DesymmetrizeSparseMatrices (SparseMatrix src, int indexS, ptr_SparseMatrix dst, int indexD) {
int n = src.dim1, nnz = 0;
int *sizes = NULL;
int *pp1 = NULL, *pp2 = NULL, *pp3 = NULL, *pp4 = NULL, *pp5 = NULL;
int i, j, dim, indexDS = indexD - indexS;
double *pd3 = NULL, *pd4 = NULL;
// The vector sizes is created and initiated
CreateInts (&sizes, n); InitInts (sizes, n, 0, 0);
// This loop counts the number of elements in each row
pp1 = src.vptr; pp3 = src.vpos + *pp1 - indexS;
pp2 = pp1 + 1 ; pp4 = sizes - indexS;
for (i=indexS; i<(n+indexS); i++) {
// The size of the corresponding row is accumulated
dim = (*pp2 - *pp1); pp4[i] += dim;
// Now each component of the row is analyzed
for (j=0; j<dim; j++) {
// Each nondiagonal element defines another element in the graph
if (*pp3 != i) pp4[*pp3]++;
pp3++;
}
pp1 = pp2++;
}
// Compute the number of nonzeros of the new sparse matrix
nnz = AddInts (sizes, n);
// Create the new sparse matrix
CreateSparseMatrix (dst, indexD, n, n, nnz, 0);
// Fill the vector of pointers
CopyInts (sizes, (dst->vptr) + 1, n);
dst->vptr[0] = indexD; TransformLengthtoHeader (dst->vptr, n);
// The vector sizes is initiated with the beginning of each row
CopyInts (dst->vptr, sizes, n);
// This loop fills the contents of vector vpos
pp1 = src.vptr; pp3 = src.vpos + *pp1 - indexS;
pp2 = pp1 + 1 ; pp4 = dst->vpos - indexD; pp5 = sizes - indexS;
pd3 = src.vval + *pp1 - indexS; pd4 = dst->vval - indexD;
for (i=indexS; i<(n+indexS); i++) {
dim = (*pp2 - *pp1);
for (j=0; j<dim; j++) {
// The elements in the i-th row
pp4[pp5[i] ] = *pp3+indexDS;
pd4[pp5[i]++] = *pd3;
if (*pp3 != i) {
// Each nondiagonal element defines another element in the graph
pp4[pp5[*pp3] ] = i+indexDS;
pd4[pp5[*pp3]++] = *pd3;
}
pp3++; pd3++;
}
pp1 = pp2++;
}
// The memory related to the vector sizes is liberated
RemoveInts (&sizes);
}
/*********************************************************************************/
// This routine creates the sparse matrix dst as the transpose of the matrix src.
// The parameters indexS and indexD indicate, respectively, if 0-indexing or 1-indexing is used
// to store the sparse matrices.
void TransposeSparseMatrices (SparseMatrix src, int indexS, ptr_SparseMatrix dst, int indexD) {
int n = src.dim1, nnz = 0;
int *sizes = NULL;
int *pp1 = NULL, *pp2 = NULL, *pp3 = NULL, *pp4 = NULL, *pp5 = NULL;
int i, j, dim, indexDS = indexD - indexS;
double *pd3 = NULL, *pd4 = NULL;
// The vector sizes is created and initiated
CreateInts (&sizes, n); InitInts (sizes, n, 0, 0);
// This loop counts the number of elements in each row
pp1 = src.vptr; pp3 = src.vpos + *pp1 - indexS;
pp2 = pp1 + 1 ; pp4 = sizes - indexS;
for (i=indexS; i<(n+indexS); i++) {
// The size of the corresponding row is accumulated
dim = (*pp2 - *pp1);
// Now each component of the row is analyzed
for (j=0; j<dim; j++) {
pp4[*pp3]++;
pp3++;
}
pp1 = pp2++;
}
// Compute the number of nonzeros of the new sparse matrix
nnz = AddInts (sizes, n);
// Create the new sparse matrix
CreateSparseMatrix (dst, indexD, n, n, nnz, 0);
// Fill the vector of pointers
CopyInts (sizes, (dst->vptr) + 1, n);
dst->vptr[0] = indexD; TransformLengthtoHeader (dst->vptr, n);
// The vector sizes is initiated with the beginning of each row
CopyInts (dst->vptr, sizes, n);
// This loop fills the contents of vector vpos
pp1 = src.vptr; pp3 = src.vpos + *pp1 - indexS;
pp2 = pp1 + 1 ; pp4 = dst->vpos - indexD; pp5 = sizes - indexS;
pd3 = src.vval + *pp1 - indexS; pd4 = dst->vval - indexD;
for (i=indexS; i<(n+indexS); i++) {
dim = (*pp2 - *pp1);
for (j=0; j<dim; j++) {
// The elements in the i-th column
pp4[pp5[*pp3] ] = i+indexDS;
pd4[pp5[*pp3]++] = *pd3;
pp3++; pd3++;
}
pp1 = pp2++;
}
// The memory related to the vector sizes is liberated
RemoveInts (&sizes);
}
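/*
 Note: ReadMatrixHB keeps the Harwell-Boeing data column-wise (colptr/rowind), so main calls
 TransposeSparseMatrices (sym, 0, &mat, 0) to obtain the row-wise (CSR) matrix.
 Worked example: src with vptr = {0, 2, 3}, vpos = {0, 1, 1}, vval = {1, 2, 3} (the matrix
 [1 2; 0 3]) is transposed into dst with vptr = {0, 1, 3}, vpos = {0, 0, 1}, vval = {1, 2, 3}.
*/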
/*********************************************************************************/
int ReadMatrixHB (char *filename, ptr_SparseMatrix p_spr) {
int *colptr = NULL;
double *exact = NULL;
double *guess = NULL;
int indcrd;
char *indfmt = NULL;
FILE *input;
char *key = NULL;
char *mxtype = NULL;
int ncol;
int neltvl;
int nnzero;
int nrhs;
int nrhsix;
int nrow;
int ptrcrd;
char *ptrfmt = NULL;
int rhscrd;
char *rhsfmt = NULL;
int *rhsind = NULL;
int *rhsptr = NULL;
char *rhstyp = NULL;
double *rhsval = NULL;
double *rhsvec = NULL;
int *rowind = NULL;
char *title = NULL;
int totcrd;
int valcrd;
char *valfmt = NULL;
double *values = NULL;
printf ("\nTEST09\n");
printf (" HB_FILE_READ reads all the data in an HB file.\n");
printf (" HB_FILE_MODULE is the module that stores the data.\n");
input = fopen (filename, "r");
if ( !input ) {
printf ("\n TEST09 - Warning!\n Error opening the file %s .\n", filename);
return -1;
}
hb_file_read ( input, &title, &key, &totcrd, &ptrcrd, &indcrd,
&valcrd, &rhscrd, &mxtype, &nrow, &ncol, &nnzero, &neltvl,
&ptrfmt, &indfmt, &valfmt, &rhsfmt, &rhstyp, &nrhs, &nrhsix,
&colptr, &rowind, &values, &rhsval, &rhsptr, &rhsind, &rhsvec,
&guess, &exact );
fclose (input);
// Conversion Fortran to C
CopyShiftInts (colptr, colptr, nrow+1, -1);
CopyShiftInts (rowind, rowind, nnzero, -1);
// Data assignment
p_spr->dim1 = nrow ; p_spr->dim2 = ncol ;
p_spr->vptr = colptr; p_spr->vpos = rowind; p_spr->vval = values;
// Memory liberation
free (exact ); free (guess ); free (indfmt);
free (key ); free (mxtype); free (ptrfmt);
free (rhsfmt); free (rhsind); free (rhsptr);
free (rhstyp); free (rhsval); free (rhsvec);
free (title ); free (valfmt);
return 0;
}
/*********************************************************************************/
// This routine computes the product { res += spr * vec }.
// The parameter index indicates if 0-indexing or 1-indexing is used,
void ProdSparseMatrixVector2 (SparseMatrix spr, int index, double *vec, double *res) {
int i, j;
int *pp1 = spr.vptr, *pp2 = pp1+1, *pi1 = spr.vpos + *pp1 - index;
double aux, *pvec = vec - index, *pd2 = res;
double *pd1 = spr.vval + *pp1 - index;
// If the MSR format is used, first the diagonal has to be processed
if (spr.vptr == spr.vpos)
VvecDoubles (1.0, spr.vval, vec, 1.0, res, spr.dim1);
for (i=0; i<spr.dim1; i++) {
// The dot product between the row i and the vector vec is computed
aux = 0.0;
for (j=*pp1; j<*pp2; j++)
aux += *(pd1++) * pvec[*(pi1++)];
// for (j=spr.vptr[i]; j<spr.vptr[i+1]; j++)
// aux += spr.vval[j] * pvec[spr.vpos[j]];
// Accumulate the obtained value on the result
*(pd2++) += aux; pp1 = pp2++;
}
}
// This routine computes the product { res += spr * vec }.
// The parameter index indicates if 0-indexing or 1-indexing is used,
void ProdSparseMatrixVectorByRows (SparseMatrix spr, int index, double *vec, double *res) {
int i, j, dim = spr.dim1;
int *pp1 = spr.vptr, *pi1 = spr.vpos + *pp1 - index;
double aux, *pvec = vec + *pp1 - index;
double *pd1 = spr.vval + *pp1 - index;
// Process all the rows of the matrix
for (i=0; i<dim; i++) {
// The dot product between the row i and the vector vec is computed
aux = 0.0;
for (j=pp1[i]; j<pp1[i+1]; j++)
aux = fma(pd1[j], pvec[pi1[j]], aux);
// Accumulate the obtained value on the result
res[i] += aux;
}
}
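// Worked example (0-indexing): with vptr = {0, 2, 3, 5}, vpos = {0, 2, 1, 0, 2},
// vval = {4, 1, 3, 2, 5} and vec = {1, 1, 1}, this routine accumulates
// res[0] += 5, res[1] += 3, res[2] += 7, that is, res += spr * vec.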
// This routine computes the product { res += spr * vec }.
// The parameter index indicates if 0-indexing or 1-indexing is used,
void ProdSparseMatrixVectorByRows_OMP (SparseMatrix spr, int index, double *vec, double *res) {
int i, j, dim = spr.dim1;
int *pp1 = spr.vptr, *pi1 = spr.vpos + *pp1 - index;
double aux, *pvec = vec + *pp1 - index;
double *pd1 = spr.vval + *pp1 - index;
// Process all the rows of the matrix
#pragma omp parallel for private(j, aux)
for (i=0; i<dim; i++) {
// The dot product between the row i and the vector vec is computed
aux = 0.0;
for (j=pp1[i]; j<pp1[i+1]; j++)
aux += pd1[j] * pvec[pi1[j]];
// Accumulate the obtained value on the result
res[i] += aux;
}
}
/*void ProdSparseMatrixVectorByRows_OMPTasks (SparseMatrix spr, int index, double *vec, double *res, int bm) {
int i, dim = spr.dim1;
// Process all the rows of the matrix
//#pragma omp taskloop grainsize(bm)
for ( i=0; i<dim; i+=bm) {
int cs = dim - i;
int c = cs < bm ? cs : bm;
// for (i=0; i<dim; i++) {
#pragma omp task depend(inout:res[i:i+c-1]) //shared(c)
{
// printf("Task SPMV ---- i: %d, c: %d \n", i, c);
int *pp1 = spr.vptr, *pi1 = spr.vpos + *pp1 - index;
double aux, *pvec = vec + *pp1 - index;
double *pd1 = spr.vval + *pp1 - index;
// The dot product between the row i and the vector vec is computed
aux = 0.0;
for(int idx=i; idx < i+c; idx++){
// printf("Task SPMV ---- idx: %d\n", idx);
for (int j=pp1[idx]; j<pp1[idx+1]; j++)
aux += pd1[j] * pvec[pi1[j]];
// Accumulate the obtained value on the result
res[idx] += aux;
}
}
}
}
*/
void ProdSparseMatrixVectorByRows_OMPTasks (SparseMatrix spr, int index, double *vec, double *res, int bm) {
int i, j, idx, dim = spr.dim1;
int *pp1 = spr.vptr, *pi1 = spr.vpos + *pp1 - index;
double aux, *pvec = vec + *pp1 - index;
double *pd1 = spr.vval + *pp1 - index;
// Process all the rows of the matrix
#pragma omp taskloop grainsize(bm)
for ( i=0; i<dim; i++ ) {
// The dot product between the row i and the vector vec is computed
aux = 0.0;
for (j=pp1[i]; j<pp1[i+1]; j++)
aux += pd1[j] * pvec[pi1[j]];
// Accumulate the obtained value on the result
res[i] += aux;
}
}
/*********************************************************************************/
// This routine computes the product { res += spr * vec }.
// The parameter index indicates if 0-indexing or 1-indexing is used,
void ProdSparseMatrixVectorByCols (SparseMatrix spr, int index, double *vec, double *res) {
int i, j, dim = spr.dim1;
int *pp1 = spr.vptr, *pi1 = spr.vpos + *pp1 - index;
double aux, *pres = res + *pp1 - index;
double *pd1 = spr.vval + *pp1 - index;
// Process all the columns of the matrix
for (i=0; i<dim; i++) {
// The result is scaled by the column i and the scalar vec[i]
aux = vec[i];
for (j=pp1[i]; j<pp1[i+1]; j++)
pres[pi1[j]] += pd1[j] * aux;
}
}
// This routine computes the product { res += spr * vec }.
// The parameter index indicates if 0-indexing or 1-indexing is used,
void ProdSparseMatrixVectorByCols_OMP (SparseMatrix spr, int index, double *vec, double *res) {
int i, j, dim = spr.dim1;
int *pp1 = spr.vptr, *pi1 = spr.vpos + *pp1 - index;
double aux, *pres = res + *pp1 - index;
double *pd1 = spr.vval + *pp1 - index;
// Process all the columns of the matrix
#pragma omp parallel for private(j, aux)
for (i=0; i<dim; i++) {
// The result is scaled by the column i and the scalar vec[i]
for (j=pp1[i]; j<pp1[i+1]; j++) {
aux = vec[i] * pd1[j];
#pragma omp atomic
pres[pi1[j]] += aux;
}
}
}
/*********************************************************************************/
void GetDiagonalSparseMatrix2 (SparseMatrix spr, int shft, double *diag, int *posd) {
int i, j, dim = (spr.dim1 < spr.dim2) ? spr.dim1 : spr.dim2;
int *pp1 = NULL, *pp2 = NULL, *pi1 = NULL, *pi2 = posd;
double *pd1 = NULL, *pd2 = diag;
if (spr.vptr == spr.vpos)
CopyDoubles (spr.vval, diag, spr.dim1);
else {
pp1 = spr.vptr; pp2 = pp1+1; j = (*pp2-*pp1);
pi1 = spr.vpos+*pp1; pd1 = spr.vval+*pp1;
for (i=0; i<dim; i++) {
while ((j > 0) && (*pi1 < (i+shft))) {
pi1++; pd1++; j--;
}
*(pd2++) = ((j > 0) && (*pi1 == (i+shft))) ? *pd1: 0.0;
//*(pi2++) = ((j > 0) && (*pi1 == (i+shft))) ? *pp2-j: -1;
pi1 += j; pd1 += j; pp1 = (pp2++); j = (*pp2-*pp1);
}
}
}
/*********************************************************************************/
#ifndef SparseProductTip
#define SparseProductTip 1
typedef struct
{
int dim1, dim2;
int *vptr;
int *vpos;
double *vval;
} SparseMatrix, *ptr_SparseMatrix;
/*********************************************************************************/
// This routine creates a sparseMatrix from the following parameters
// * numR defines the number of rows
// * numC defines the number of columns
// * numE defines the number of nonzero elements
// * msr indicates whether the MSR format is used for the sparse matrix
// If msr is active, numE doesn't include the diagonal elements
// The parameter index indicates if 0-indexing or 1-indexing is used.
extern void CreateSparseMatrix (ptr_SparseMatrix p_spr, int index, int numR, int numC, int numE,
int msr);
// This routine liberates the memory related to matrix spr
extern void RemoveSparseMatrix (ptr_SparseMatrix spr);
/*********************************************************************************/
// This routine creates the sparse matrix dst from the symmetric matrix src.
// The parameters indexS and indexD indicate, respectively, if 0-indexing or 1-indexing is used
// to store the sparse matrices.
extern void DesymmetrizeSparseMatrices (SparseMatrix src, int indexS, ptr_SparseMatrix dst,
int indexD);
/*********************************************************************************/
// This routine creates the sparse matrix dst as the transpose of the matrix src.
// The parameters indexS and indexD indicate, respectively, if 0-indexing or 1-indexing is used
// to store the sparse matrices.
void TransposeSparseMatrices (SparseMatrix src, int indexS, ptr_SparseMatrix dst, int indexD);
/*********************************************************************************/
extern int ReadMatrixHB (char *filename, ptr_SparseMatrix p_spr);
/*********************************************************************************/
// This routine computes the product { res += spr * vec }.
// The parameter index indicates if 0-indexing or 1-indexing is used,
extern void ProdSparseMatrixVector2 (SparseMatrix spr, int index, double *vec, double *res);
// This routine computes the product { res += spr * vec }.
// The parameter index indicates if 0-indexing or 1-indexing is used,
extern void ProdSparseMatrixVectorByRows (SparseMatrix spr, int index, double *vec, double *res);
// This routine computes the product { res += spr * vec }.
// The parameter index indicates if 0-indexing or 1-indexing is used,
extern void ProdSparseMatrixVectorByRows_OMP (SparseMatrix spr, int index, double *vec, double *res);
/*********************************************************************************/
// This routine computes the product { res += spr * vec }.
// The parameter index indicates if 0-indexing or 1-indexing is used,
extern void ProdSparseMatrixVectorByCols (SparseMatrix spr, int index, double *vec, double *res);
// This routine computes the product { res += spr * vec }.
// The parameter index indicates if 0-indexing or 1-indexing is used,
extern void ProdSparseMatrixVectorByCols_OMP (SparseMatrix spr, int index, double *vec, double *res);
/*********************************************************************************/
extern void GetDiagonalSparseMatrix2 (SparseMatrix spr, int shft, double *diag, int *posd);
/*********************************************************************************/
#endif
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <mpi.h>
#include <ScalarVectors.h>
#include "ToolsMPI.h"
// #define PRINT_SEND_RESOTRNF_VECTORS 1
/*********************************************************************************/
void Synchronization (MPI_Comm Synch_Comm, char *message) {
int my_id, i ;
MPI_Comm_rank(Synch_Comm, &my_id);
MPI_Barrier (Synch_Comm);
printf ("(%d) %s\n", my_id, message);
if (my_id == 0) printf ("Waiting ... \n");
if (my_id == 0) scanf ("%d", &i);
if (my_id == 0) printf (" ... done\n");
MPI_Barrier (Synch_Comm);
}
/*********************************************************************************/
// Return true if the corresponding asynchronous communication,
// defined by data, has been finalized
int TestSimple (void *data) {
int flag = 0;
ptr_SimpleNode smpnode = (ptr_SimpleNode) data;
// Verify if the communication has finalized
MPI_Test (&(smpnode->req), &flag, &(smpnode->sta));
if (flag) {
// Remove the data included in the simple node
MPI_Wait (&(smpnode->req), &(smpnode->sta));
free (smpnode);
}
// Returns the result
return flag;
}
// Return true if the corresponding asynchronous communication,
// defined by data, has been finalized
int TestPacket (void *data) {
int flag = 0;
ptr_PacketNode pcknode = (ptr_PacketNode) data;
// Verify if the communication has finalized
MPI_Test (&(pcknode->req), &flag, &(pcknode->sta));
if (flag) {
// Remove the data included in the pack
MPI_Wait (&(pcknode->req), &(pcknode->sta));
MPI_Type_free (&(pcknode->pack));
free (pcknode);
}
// Returns the result
return flag;
}
// Detect lost messages whose destination is a process
// of the communicator Err_Comm
void DetectErrorsMPI (MPI_Comm Err_Comm) {
int my_id, flag= 0;
MPI_Status st;
// Definition of the variable my_id
MPI_Comm_rank(Err_Comm, &my_id);
// Test if some message exists
MPI_Iprobe (MPI_ANY_SOURCE, MPI_ANY_TAG, Err_Comm, &flag, &st);
if (flag) {
printf ("%d --> (%d,%d)\n", my_id, st.MPI_SOURCE, st.MPI_TAG);
}
}
/*********************************************************************************/
// Prepare the structures required to send/receive a SparseMatrix structure
// * spr refers to the SparseMatrix from where the data is obtained
// * size is the number of rows to be sent
// * weight is the number of nonzeros to be sent
// * pcknode, where the resulting packet is stored
void MakeSprMatrixPacket (SparseMatrix spr, int size, int weight, ptr_PacketNode pcknode) {
int k;
int *lblq = pcknode->lblq;
MPI_Aint *dspl = pcknode->dspl;
MPI_Datatype *type = pcknode->type;
// Definition of reference pointer
pcknode->ptr = (unsigned char *) spr.vptr;
// Definition of the required vectors to create the packet
type[0] = MPI_INT ; lblq[0] = size+1; dspl[0] = (MPI_Aint) spr.vptr;
type[1] = MPI_INT ; lblq[1] = weight; dspl[1] = (MPI_Aint) spr.vpos;
type[2] = MPI_DOUBLE; lblq[2] = weight; dspl[2] = (MPI_Aint) spr.vval;
type[3] = MPI_UB ; lblq[3] = 1 ; dspl[3] = (MPI_Aint) (spr.vptr+size+1);
for (k=3; k>=0; k--) dspl[k] -= dspl[0];
// Creation of the packet
MPI_Type_create_struct (4, lblq, dspl, type, &(pcknode->pack));
MPI_Type_commit(&(pcknode->pack));
}
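/*
 Note: MPI_UB is deprecated and was removed in MPI-3.0. Since these packets are always sent
 with count 1, a sketch of an equivalent construction (assuming the same three data blocks
 defined above) would be:
   MPI_Datatype tmp;
   MPI_Type_create_struct (3, lblq, dspl, type, &tmp);
   MPI_Type_create_resized (tmp, 0, (MPI_Aint) ((char *) (spr.vptr+size+1) - (char *) spr.vptr),
                            &(pcknode->pack));
   MPI_Type_commit (&(pcknode->pack)); MPI_Type_free (&tmp);
*/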
void MakeSprMatrixSendPacket (SparseMatrix spr, int *vlen, int dimL, int dspL,
ptr_PacketNode pcknode) {
int k, weight, dspZ;
int *lblq = pcknode->lblq;
MPI_Aint *dspl = pcknode->dspl;
MPI_Datatype *type = pcknode->type;
// printf ("dimL = %d , dspL = %d\n", dimL, dspL);
// PrintInts (vlen, spr.dim1);
// PrintInts (spr.vptr, spr.dim1+1);
// Definition of reference pointer
pcknode->ptr = (unsigned char *) (vlen+dspL);
// Definition of the required vectors to create the packet
dspZ = spr.vptr[dspL]; weight = spr.vptr[dspL+dimL] - dspZ;
// printf ("dspZ = %d , weight = %d\n", dspZ, weight);
type[0] = MPI_INT ; lblq[0] = dimL ; dspl[0] = (MPI_Aint) (vlen+dspL );
type[1] = MPI_INT ; lblq[1] = weight; dspl[1] = (MPI_Aint) (spr.vpos+dspZ );
type[2] = MPI_DOUBLE; lblq[2] = weight; dspl[2] = (MPI_Aint) (spr.vval+dspZ );
type[3] = MPI_UB ; lblq[3] = 1 ; dspl[3] = (MPI_Aint) (vlen+dimL+dspL);
for (k=3; k>=0; k--) dspl[k] -= dspl[0];
// Creation of the packet
MPI_Type_create_struct (4, lblq, dspl, type, &(pcknode->pack));
MPI_Type_commit(&(pcknode->pack));
}
void MakeSprMatrixRecvPacket (SparseMatrix sprL, int nnzL, ptr_PacketNode pcknode) {
int k, dimL = sprL.dim1;
int *lblq = pcknode->lblq;
MPI_Aint *dspl = pcknode->dspl;
MPI_Datatype *type = pcknode->type;
// printf ("nnzL = %d\n", nnzL);
// Definition of reference pointer
pcknode->ptr = (unsigned char *) (sprL.vptr+1);
// Definition of the required vectors to create the packet
type[0] = MPI_INT ; lblq[0] = dimL; dspl[0] = (MPI_Aint) (sprL.vptr+1);
type[1] = MPI_INT ; lblq[1] = nnzL; dspl[1] = (MPI_Aint) sprL.vpos;
type[2] = MPI_DOUBLE; lblq[2] = nnzL; dspl[2] = (MPI_Aint) sprL.vval;
type[3] = MPI_UB ; lblq[3] = 1 ; dspl[3] = (MPI_Aint) (sprL.vptr+1+dimL);
for (k=3; k>=0; k--) dspl[k] -= dspl[0];
// Creation of the packet
MPI_Type_create_struct (4, lblq, dspl, type, &(pcknode->pack));
MPI_Type_commit(&(pcknode->pack));
}
// Compute the number of nonzero elements of a PermSprMatrixRecvPacket packet
// * prc_src is the process from which the message is sent
// * dimL is the number of rows to be received
// * comm is the communicator in which the message is sent
int ComputeSprMatrixRecvWeights (int prc_src, int dimL, MPI_Comm comm) {
int tam, tam_int, tam_double, tam_ub;
MPI_Status sta;
// Definition of sizes
MPI_Type_size(MPI_INT, &tam_int);
MPI_Type_size(MPI_DOUBLE, &tam_double);
MPI_Type_size(MPI_UB, &tam_ub);
MPI_Probe (prc_src, Tag_Send_Packet_Matrix_To_Leaf, comm, &sta);
MPI_Get_count (&sta, MPI_BYTE, &tam);
// Return the number of nonzeros included in a packet
return (tam - (dimL*tam_int + tam_ub)) / (tam_int + tam_double);
}
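// Example: with dimL = 100 rows, 4-byte ints and 8-byte doubles (tam_ub = 0), a probed
// message of tam = 6400 bytes yields (6400 - 400) / 12 = 500 nonzeros.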
int DistributeMatrix (SparseMatrix spr, int index, ptr_SparseMatrix sprL, int indexL,
int *vdimL, int *vdspL, int root, MPI_Comm comm) {
int myId, nProcs;
int i, dim = spr.dim1, divL, rstL, dimL, dspL, nnzL;
ptr_PacketNode pcknode;
// Getting the parameters of the communicator
MPI_Comm_rank(comm, &myId); MPI_Comm_size(comm, &nProcs);
// Broadcasting the matrix dimension
MPI_Bcast (&dim, 1, MPI_INT, root, comm);
// Calculating the vectors of sizes (vdimL) and displacements (vdspl)
divL = (dim / nProcs); rstL = (dim % nProcs);
for (i=0; i<nProcs; i++) vdimL[i] = divL + (i < rstL);
vdspL[0] = 0; for (i=0; i<nProcs; i++) vdspL[i+1] = vdspL[i] + vdimL[i];
dimL = vdimL[myId]; dspL = vdspL[myId];
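// Example of the block partition: dim = 10 and nProcs = 3 give vdimL = {4, 3, 3} and
// vdspL = {0, 4, 7, 10}, so process 1 owns the rows [4, 7).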
// Distribution of the matrix, by blocks
if (root == myId) {
int *vlen = NULL;
CreateInts (&vlen, dim); ComputeLengthfromHeader (spr.vptr, vlen, dim);
for (i=0; i<nProcs; i++) {
if (i != myId) {
// Creating the message for each destination
pcknode = (ptr_PacketNode) malloc (sizeof(PacketNode));
MakeSprMatrixSendPacket (spr, vlen, vdimL[i], vdspL[i], pcknode);
MPI_Send (pcknode->ptr, 1, pcknode->pack, i, Tag_Send_Packet_Matrix_To_Leaf, comm);
MPI_Type_free (&(pcknode->pack));
free (pcknode);
}
}
nnzL = spr.vptr[dspL+dimL] - spr.vptr[dspL];
CreateSparseMatrix (sprL, indexL, dimL, dim, nnzL, 0);
CopyInts (vlen+dspL, sprL->vptr+1, dimL);
CopyInts (spr.vpos+spr.vptr[dspL], sprL->vpos, nnzL);
CopyDoubles (spr.vval+spr.vptr[dspL], sprL->vval, nnzL);
RemoveInts (&vlen);
} else {
MPI_Status sta;
// Compute the number of nonzeroes and creating the local matrix
nnzL = ComputeSprMatrixRecvWeights (root, dimL, comm);
CreateSparseMatrix (sprL, indexL, dimL, dim, nnzL, 0);
// Receiving the data on the local matrix
pcknode = (ptr_PacketNode) malloc (sizeof(PacketNode));
MakeSprMatrixRecvPacket (*sprL, nnzL, pcknode);
MPI_Recv (pcknode->ptr, 1, pcknode->pack, root, Tag_Send_Packet_Matrix_To_Leaf,
comm, &sta);
MPI_Type_free (&(pcknode->pack));
free (pcknode);
}
*(sprL->vptr) = indexL; TransformLengthtoHeader (sprL->vptr, dimL);
return dim;
}
/*********************************************************************************/
// vcols is a vector with dimPos elements, including integer values from 0 to dim-1
// The routine creates a bitmap determining which column indices exist in vcols.
// The bitmap is stored in colsJoin, whose size is colsJoin_dim
void joinColumns (int dim, int *vcols, int dimPos, unsigned char **colsJoin,
int *colsJoin_dim) {
int i, div, rem;
int vec_dim = (dim + (sizeof(unsigned char) * 8) - 1) / (sizeof(unsigned char) * 8);
unsigned char *vec = (unsigned char *) malloc(sizeof(unsigned char) * vec_dim);
for (i=0; i<vec_dim; i++) {
vec[i] = 0x0;
}
for (i=0; i<dimPos; i++) {
div = vcols[i] / (sizeof(unsigned char) * 8);
rem = vcols[i] % (sizeof(unsigned char) * 8);
vec[div] |= (1 << rem);
}
*colsJoin = vec;
*colsJoin_dim = vec_dim;
}
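// Example: dim = 20 and vcols = {3, 17, 3, 9} produce colsJoin_dim = 3 and the bitmap
// colsJoin = {0x08, 0x02, 0x02} (bits 3, 9 and 17 set).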
// From colsJoin, this routine fills the vector perm with the column indices set in the bitmap
// Knowing the partition defined by nProcs and vdspL, this routine extends this
// partition to perm through the vdimP and vdspP vectors
int createPerm (unsigned char *colsJoin, int colsJoin_dim, int *perm, int dim,
int *vdspL, int *vdimP, int *vdspP, int nProcs) {
int i, j, prc = 1, k = 0, col = 0;
vdspP[0] = 0;
for (i=0; i<colsJoin_dim; i++) {
if (colsJoin[i] != 0x0) {
unsigned char car = 0x1;
for (j=0; j<8*sizeof(unsigned char); j++) {
if (col == vdspL[prc]) {
vdimP[prc-1] = k - vdspP[prc-1];
vdspP[prc] = k;
prc++;
}
if (colsJoin[i] & car) {
perm[k] = col;
k++;
}
car <<= 1;
col++;
}
} else {
col += 8*sizeof(unsigned char);
while ((prc <= nProcs) && (col >= vdspL[prc])) {
vdimP[prc-1] = k - vdspP[prc-1];
vdspP[prc] = k;
prc++;
}
}
}
while ((prc <= nProcs) && (col >= vdspL[prc])) {
vdimP[prc-1] = k - vdspP[prc-1];
vdspP[prc] = k;
prc++;
}
return k;
}
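// Example: with dim = 8, nProcs = 2, vdspL = {0, 4, 8} and columns {1, 2, 5, 7} set in the
// bitmap, createPerm returns 4 with perm = {1, 2, 5, 7}, vdimP = {2, 2}, vdspP = {0, 2, 4}.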
// Creation of the MPI_Datatype vectors used to reduce the communication volume
// vectDatatypeP includes MPI_DOUBLE for all processes.
// vectDatatypeR includes the permutation required for each process.
void createVectMPITypes (int myId, int nProcs, int *vdimL, int *vdimP,
int *permR, int *vdimR, int *vdspR,
MPI_Datatype *vectDatatypeP, MPI_Datatype *vectDatatypeR) {
int i;
for (i=0; i<nProcs; i++) {
vectDatatypeP[i] = MPI_DOUBLE;
if (i == myId) {
MPI_Type_contiguous(vdimR[i], MPI_DOUBLE, vectDatatypeR+i);
} else {
MPI_Type_create_indexed_block(vdimR[i], 1, permR+vdspR[i], MPI_DOUBLE,
vectDatatypeR+i);
}
MPI_Type_commit(vectDatatypeR+i);
}
}
// Creation of all structures used to reduce the communication volume
// coll_p2p adapts the contents of the structure for collective or p2p operations
// vecP is created to store the elements required to complete the product in the process
// permP is created and contains the permutation to be applied to vcols.
void createAlltoallwStruct (int coll_p2p, MPI_Comm comm, SparseMatrix matL,
int *vdimL, int *vdspL, int *vdimP, int *vdspP,
double **vecP, int **permP, int *ipermP,
int *vdimR, int *vdspR,
MPI_Datatype *vectDatatypeP, MPI_Datatype *vectDatatypeR) {
// Definition of the variables nProcs and myId
int myId, nProcs;
MPI_Comm_size(comm, &nProcs); MPI_Comm_rank(comm, &myId);
// Creation of column bitmap related to myId.
int colsJoin_dim = 0, dim = matL.dim2;
unsigned char *colsJoin = NULL;
joinColumns (dim, matL.vpos, matL.vptr[matL.dim1], &colsJoin, &colsJoin_dim);
// Creation of permutations, getting information from the column bitmap
int permP_dim = 0;
permP_dim = createPerm (colsJoin, colsJoin_dim, ipermP, dim, vdspL,
vdimP, vdspP, nProcs);
free (colsJoin); colsJoin = NULL;
CreateDoubles (vecP, permP_dim);
CreateInts (permP, permP_dim); CopyInts (ipermP, *permP, permP_dim);
InitInts (ipermP, dim, -1, 0);
ComputeInvPermutation (*permP, ipermP, permP_dim);
// Definition of sizes of the sending pattern from myId
MPI_Alltoall (vdimP, 1, MPI_INT, vdimR, 1, MPI_INT, comm);
vdspR[0] = 0; ComputeHeaderfromLength (vdimR, vdspR, nProcs);
// Creation of the sending pattern from myId
int *permR = NULL, permR_dim = 0;
permR_dim = vdspR[nProcs];
CreateInts (&permR, permR_dim);
MPI_Alltoallv (*permP, vdimP, vdspP, MPI_INT,
permR , vdimR, vdspR, MPI_INT, comm);
CopyShiftInts (permR, permR, permR_dim, -vdspL[myId]);
// Definition of the MPI_Datatype vectors required for communication
createVectMPITypes (myId, nProcs, vdimL, vdimP, permR, vdimR, vdspR,
vectDatatypeP, vectDatatypeR);
// Computation of the percentage of communication that is actually performed
int saving = vdspR[nProcs] - vdimL[myId];
MPI_Allreduce (MPI_IN_PLACE, &saving, 1, MPI_INT, MPI_SUM, comm);
if (myId == 0) {
printf ("%d nnzs of %d = %f %% \n", saving, dim * (nProcs - 1),
100.0 * saving / (dim * (nProcs - 1)));
}
// Adaptation of sizes to complete the communication
InitInts (vdimR, nProcs, 1, 0);
InitInts (vdspR, nProcs+1, 0, 0);
// This step is only required for MPI_Alltoallw
if (coll_p2p) {
ScaleInts (vdspP, sizeof(double), nProcs+1);
}
// Freeing temporary structures
RemoveInts (&permR);
}
// Communications to complete an MPI_Alltoallv while reducing the communication volume
// All elements required to compute the SpMV are gathered into vecP from vecL
// coll_p2p marks if collective or p2p operations are used
void joinDistributeVectorSPMV (int coll_p2p, MPI_Comm comm, double *vecL,
double *vecP, int *vdimP, int *vdspP, int *vdimR,
int *vdspR, MPI_Datatype *vectDatatypeP,
MPI_Datatype *vectDatatypeR) {
if (coll_p2p) {
// Communication using a collective operation
MPI_Alltoallw (vecL, vdimR, vdspR, vectDatatypeR,
vecP, vdimP, vdspP, vectDatatypeP, comm);
} else {
// Definition of the variables nProcs, myId and other variables
int i, k = 0, myId, nProcs;
MPI_Comm_size (comm, &nProcs); MPI_Comm_rank(comm, &myId);
// Definition of the vectors for implementing non-blocking communications
MPI_Status vectSta[2*nProcs-2];
MPI_Request vectReq[2*nProcs-2];
// Non-blocking send communications
for (i=0; i<nProcs; i++) {
if (i != myId) {
MPI_Isend (vecL+vdspR[i], vdimR[i], vectDatatypeR[i], i, Tag_NonBlocking_SpMV,
comm, vectReq+k);
k++;
}
}
// Non-blocking receive communications
for (i=0; i<nProcs; i++) {
if (i != myId) {
MPI_Irecv (vecP+vdspP[i], vdimP[i], vectDatatypeP[i], i, Tag_NonBlocking_SpMV,
comm, vectReq+k);
k++;
}
}
// Local copy
memcpy (vecP+vdspP[myId], vecL, vdimP[myId] * sizeof(double));
// Waiting until all communications are complete
MPI_Waitall (2*nProcs-2, vectReq, vectSta);
}
}
/*********************************************************************************/
#ifndef ToolsMPI
#define ToolsMPI 1
// #include <SparseMatricesNew.h>
#include <SparseProduct.h>
/*********************************************************************************/
#define Tag_Demand_Matrix_From_Root 1001
#define Tag_Send_Task_To_Leaf 1002
#define Tag_Receive_Dims_Factor_From_Leaf 1003
#define Tag_End_Distribution_To_Leaf 1004
#define Tag_Send_Dims_Matrix_To_Leaf 1006
#define Tag_Send_Data_Matrix_To_Leaf 1007
#define Tag_Demand_Vector_From_Root 1011
#define Tag_Send_Dims_Vector_To_Father 1015
#define Tag_Send_Data_Vector_To_Father 1016
#define Tag_Send_Task_To_Root 1021
#define Tag_Send_Solution_To_Root 1022
#define Tag_Send_Dims_Vector_To_Children 1025
#define Tag_Send_Data_Vector_To_Children 1026
#define Tag_End_Resolution_To_Leaf 1031
#define Tag_Send_Vector_Up_1 1041
#define Tag_Send_Vector_Up_2 1042
#define Tag_Send_Packet_Matrix_To_Leaf 210
#define Tag_Receive_Data_Factor_From_Leaf 220
#define Tag_Send_Vector_To_Leaf 230
// #define Tag_FactorVector 240
#define Tag_NonBlocking_SpMV 1051
/*********************************************************************************/
// typedef struct SimpleNode {
typedef struct {
MPI_Status sta;
MPI_Request req;
} SimpleNode, *ptr_SimpleNode;
// Return true if the corresponding asynchronous communication,
// defined by data, has been finalized
extern int TestSimple (void *data);
/*********************************************************************************/
// #define MaxPacketSize 10000
#define MaxPacketSize 5000
// typedef struct PacketNode {
typedef struct {
unsigned char *ptr;
int lblq[2*MaxPacketSize+3], vlen[MaxPacketSize];
MPI_Aint dspl[2*MaxPacketSize+3];
MPI_Datatype type[2*MaxPacketSize+3];
MPI_Datatype pack;
MPI_Status sta;
MPI_Request req;
} PacketNode, *ptr_PacketNode;
/*********************************************************************************/
extern void Synchronization (MPI_Comm Synch_Comm, char *message);
/*********************************************************************************/
// Return true if the corresponding asynchronous communication,
// defined by data, has been finalized
extern int TestSimple (void *data);
// Return true if the corresponding asynchronous communication,
// defined by data, has been finalized
extern int TestPacket (void *data);
// Detect lost messages whose destination is a process
// of the communicator Err_Comm
extern void DetectErrorsMPI (MPI_Comm Err_Comm);
/*********************************************************************************/
// Prepare the structures required to send/receive a SparseMatrix structure
// * spr refers to the SparseMatrix from where the data is obtained
// * size is the number of rows to be sent
// * weight is the number of nonzeros to be sent
// * pcknode, where the resulting packet is stored
extern void MakeSprMatrixPacket (SparseMatrix spr, int size, int weight, ptr_PacketNode pcknode);
extern void MakeSprMatrixSendPacket (SparseMatrix spr, int *len, int dimL, int dspL,
ptr_PacketNode pcknode);
extern void MakeSprMatrixRecvPacket (SparseMatrix sprL, int nnzL, ptr_PacketNode pcknode);
// Compute the number of nonzero elements of a PermSprMatrixRecvPacket packet
// * prc_src is the process from which the message is sent
// * sizes is the number of rows to be received
// * comm is the communicator in which the message is sent
extern int ComputeSprMatrixRecvWeights (int prc_src, int sizes, MPI_Comm comm);
extern int DistributeMatrix (SparseMatrix spr, int index, ptr_SparseMatrix sprL, int indexL,
int *vdimL, int *vdspL, int root, MPI_Comm comm);
/*********************************************************************************/
// vcols is a vector with dimPos elements, including integer values from 0 to dim-1
// The routine creates a bitmap determining which column indices exist in vcols.
// The bitmap is stored in colsJoin, whose size is colsJoin_dim
extern void joinColumns (int dim, int *vcols, int dimPos, unsigned char **colsJoin,
int *colsJoin_dim);
// From colsJoin, this routine fills the vector perm with the column indices set in the bitmap
// Knowing the partition defined by nProcs and vdspL, this routine extends this
// partition to perm through the vdimP and vdspP vectors
extern int createPerm (unsigned char *colsJoin, int colsJoin_dim, int *perm, int dim,
int *vdspL, int *vdimP, int *vdspP, int nProcs);
// Creation of the MPI_Datatype vectors used to reduce the communication volume
// vectDatatypeP includes MPI_DOUBLE for all processes.
// vectDatatypeR includes the permutation required for each process.
extern void createVectMPITypes (int myId, int nProcs, int *vdimL, int *vdimP,
int *permR, int *vdimR, int *vdspR,
MPI_Datatype *vectDatatypeP, MPI_Datatype *vectDatatypeR);
// Creation of all structures used to reduce the communication volume
// coll_p2p adapts the contents of the structure for collective or p2p operations
// vecP is created to store the elements required to complete the product in the process
// permP is created and contains the permutation to be applied to vcols.
extern void createAlltoallwStruct (int coll_p2p, MPI_Comm comm, SparseMatrix matL,
int *vdimL, int *vdspL, int *vdimP, int *vdspP,
double **vecP, int **permP, int *ipermP,
int *vdimR, int *vdspR,
MPI_Datatype *vectDatatypeP, MPI_Datatype *vectDatatypeR);
// Communications to complete an MPI_Alltoallv while reducing the communication volume
// All elements required to compute the SpMV are gathered into vecP from vecL
// coll_p2p marks if collective or p2p operations are used
extern void joinDistributeVectorSPMV (int coll_p2p, MPI_Comm comm, double *vecL,
double *vecP, int *vdimP, int *vdspP, int *vdimR,
int *vdspR, MPI_Datatype *vectDatatypeP,
MPI_Datatype *vectDatatypeR);
/*********************************************************************************/
#endif
#ifndef COMMON_H
#define COMMON_H
#include <math.h>
// static, so the header can be included from several translation units without duplicate symbols
static double norm_inf(int n_dist, double *res_err) {
double nrm = 0.0;
for (int i = 0; i < n_dist; i++) {
double tmp = fabs(res_err[i]);
if (nrm < tmp)
nrm = tmp;
}
return nrm;
}
#endif // COMMON_H
int ch_eqi ( char ch1, char ch2 );
int ch_is_digit ( char c );
int ch_is_format_code ( char c );
int ch_to_digit ( char ch );
void hb_exact_read ( FILE *input, int nrow, int nrhs, int rhscrd,
char *rhsfmt, char *rhstyp, double exact[] );
void hb_exact_write ( FILE *output, int nrow, int nrhs, int rhscrd,
char *rhsfmt, char *rhstyp, double exact[] );
void hb_file_read ( FILE *input, char **title, char **key, int *totcrd,
int *ptrcrd, int *indcrd, int *valcrd, int *rhscrd, char **mxtype, int *nrow,
int *ncol, int *nnzero, int *neltvl, char **ptrfmt, char **indfmt, char **valfmt,
char **rhsfmt, char **rhstyp, int *nrhs, int *nrhsix, int **colptr,
int **rowind, double **values, double **rhsval, int **rhsptr, int **rhsind,
double **rhsvec, double **guess, double **exact );
void hb_file_write ( FILE *output, char *title, char *key, int totcrd,
int ptrcrd, int indcrd, int valcrd, int rhscrd, char *mxtype, int nrow,
int ncol, int nnzero, int neltvl, char *ptrfmt, char *indfmt, char *valfmt,
char *rhsfmt, char *rhstyp, int nrhs, int nrhsix, int colptr[],
int rowind[], double values[], double rhsval[], int rhsptr[], int rhsind[],
double rhsvec[], double guess[], double exact[] );
void hb_guess_read ( FILE *input, int nrow, int nrhs, int rhscrd,
char *rhsfmt, char *rhstyp, double guess[] );
void hb_guess_write ( FILE *output, int nrow, int nrhs, int rhscrd,
char *rhsfmt, char *rhstyp, double guess[] );
void hb_header_print ( char *title, char *key, int totcrd, int ptrcrd,
int indcrd, int valcrd, int rhscrd, char *mxtype, int nrow, int ncol,
int nnzero, int neltvl, char *ptrfmt, char *indfmt, char *valfmt,
char *rhsfmt, char *rhstyp, int nrhs, int nrhsix );
void hb_header_read ( FILE *input, char **title, char **key, int *totcrd,
int *ptrcrd, int *indcrd, int *valcrd, int *rhscrd, char **mxtype, int *nrow,
int *ncol, int *nnzero, int *neltvl, char **ptrfmt, char **indfmt, char **valfmt,
char **rhsfmt, char **rhstyp, int *nrhs, int *nrhsix );
void hb_header_write ( FILE *output, char *title, char *key, int totcrd,
int ptrcrd, int indcrd, int valcrd, int rhscrd, char *mxtype, int nrow,
int ncol, int nnzero, int neltvl, char *ptrfmt, char *indfmt, char *valfmt,
char *rhsfmt, char *rhstyp, int nrhs, int nrhsix );
double *hb_matvec_a_mem ( int nrow, int ncol, int nnzero, int nrhs,
int colptr[], int rowind[], double values[], double exact[] );
void hb_rhs_read ( FILE *input, int nrow, int nnzero, int nrhs, int nrhsix,
int rhscrd, char *ptrfmt, char *indfmt, char *rhsfmt, char *mxtype,
char *rhstyp, double rhsval[], int rhsind[], int rhsptr[], double rhsvec[] );
void hb_rhs_write ( FILE *output, int nrow, int nnzero, int nrhs, int nrhsix,
int rhscrd, char *ptrfmt, char *indfmt, char *rhsfmt, char *mxtype,
char *rhstyp, double rhsval[], int rhsind[], int rhsptr[], double rhsvec[] );
void hb_structure_print ( int ncol, char *mxtype, int nnzero, int neltvl,
int colptr[], int rowind[] );
void hb_structure_read ( FILE *input, int ncol, char *mxtype, int nnzero,
int neltvl, int ptrcrd, char *ptrfmt, int indcrd, char *indfmt,
int colptr[], int rowind[] );
void hb_structure_write ( FILE *output, int ncol, char *mxtype,
int nnzero, int neltvl, char *ptrfmt, char *indfmt, int colptr[],
int rowind[] );
int *hb_ua_colind ( int ncol, int colptr[], int nnzero );
void hb_values_print ( int ncol, int colptr[], char *mxtype, int nnzero,
int neltvl, double values[] );
void hb_values_read ( FILE *input, int valcrd, char *mxtype, int nnzero,
int neltvl, char *valfmt, double values[] );
void hb_values_write ( FILE *output, int valcrd, char *mxtype,
int nnzero, int neltvl, char *valfmt, double values[] );
double *hb_vecmat_a_mem ( int nrow, int ncol, int nnzero, int nrhs,
int colptr[], int rowind[], double values[], double exact[] );
int i4_max ( int i1, int i2 );
int i4_min ( int i1, int i2 );
void i4vec_print ( int n, int a[], char *title );
void i4vec_print_part ( int n, int a[], int max_print, char *title );
void r8mat_print ( int m, int n, double a[], char *title );
void r8mat_print_some ( int m, int n, double a[], int ilo, int jlo, int ihi,
int jhi, char *title );
void r8vec_print ( int n, double a[], char *title );
void r8vec_print_part ( int n, double a[], int max_print, char *title );
int s_len_trim ( char *s );
char *s_substring ( char *s, int a, int b );
void s_to_format ( char *s, int *r, char *code, int *w, int *m );
void s_trim ( char *s );
void timestamp ( );
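/* Hedged usage sketch (added for illustration): read a Harwell-Boeing file with
   hb_file_read, using exactly the signature declared above, and report the matrix
   size. The file name is supplied by the caller and freeing the arrays allocated
   by the reader is omitted for brevity. */
#include <stdio.h>
#include <hb_io.h>

int read_hb_matrix (const char *filename)
{
    FILE *input = fopen (filename, "r");
    if (input == NULL) {
        fprintf (stderr, "cannot open %s\n", filename);
        return -1;
    }
    char *title = NULL, *key = NULL, *mxtype = NULL;
    char *ptrfmt = NULL, *indfmt = NULL, *valfmt = NULL, *rhsfmt = NULL, *rhstyp = NULL;
    int totcrd, ptrcrd, indcrd, valcrd, rhscrd;
    int nrow, ncol, nnzero, neltvl, nrhs, nrhsix;
    int *colptr = NULL, *rowind = NULL, *rhsptr = NULL, *rhsind = NULL;
    double *values = NULL, *rhsval = NULL, *rhsvec = NULL, *guess = NULL, *exact = NULL;

    hb_file_read (input, &title, &key, &totcrd, &ptrcrd, &indcrd, &valcrd, &rhscrd,
                  &mxtype, &nrow, &ncol, &nnzero, &neltvl, &ptrfmt, &indfmt, &valfmt,
                  &rhsfmt, &rhstyp, &nrhs, &nrhsix, &colptr, &rowind, &values,
                  &rhsval, &rhsptr, &rhsind, &rhsvec, &guess, &exact);

    printf ("%s: %d x %d matrix, %d nonzeros, type %s\n", filename, nrow, ncol, nnzero, mxtype);
    fclose (input);
    return 0;
}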
## ============================================================
## INTEL COMPILERS
## ============================================================
#
#CC = mpicc
#CFLAGS = -Wall -g -openmp -I. -I${MKLROOT}/include
#CFLAGS = -Wall -openmp -I. -I${MKLROOT}/include
#CLFLAGS = -Wall -fopenmp -I. -I${MKLROOT}/include
#
#CLINKER = mpicc
#FLINKER = mpif77
#LDFLAGS = -openmp
#LIBLIST = -L. -lhbio -lclock -lsparsenew -lvector -lm -lc
#LIBLIST = -L. -lhbio -lclock -lvector -lm -lc
#LIBLIST = -L. -lhbio -lclock -lm -lc
#LIBLIST = -L. -lsparse -lvector -lclock -lm -lc
#
##LIBMKL = -L$(MKLROOT)/lib/intel64 $(MKL_FMULTIS_INTEL)
#LIBMKL = -L${MKLROOT}/lib/intel64 -lmkl_intel_lp64 -lmkl_core -lmkl_sequential -lpthread
# ============================================================
# GNU COMPILERS
# ============================================================
CC = mpicxx
#CFLAGS = -std=c++11 -mavx -fabi-version=0 -Wall -fopenmp -I. -I${MKLROOT}/include -I${HOME}/libs
CFLAGS = -std=c++11 -Wall -fopenmp -I. -I${MKLROOT}/include -I${HOME}/libs
CLFLAGS = -Wall -fopenmp -I. -I${MKLROOT}/include
CLINKER = mpicxx
LDFLAGS = -fopenmp
#LIBLIST = -L. -lhbio -lclock -lsparsenew -lvector -lm -lc
#LIBLIST = -L. -lhbio -lclock -lvector -lm -lc
#LIBLIST = -L. -lhbio -lclock -lm -lc
LIBLIST = -L. -lsparse -lvector -lclock -lm -lc
LIBMKL = -L${MKLROOT}/lib/intel64 -lmkl_intel_lp64 -lmkl_core -lmkl_sequential -lpthread
# ============================================================
AR = ar
ARFLAGS = ru
RL = ranlib
# ============================================================
OBJS_CLOCK = reloj.o
OBJS_VECTOR = ScalarVectors.o
OBJS_SPARSE = hb_io.o SparseProduct.o
OBJS = $(OBJS_CLOCK) $(OBJS_VECTOR) $(OBJS_SPARSE)
# ============================================================
default: libclock.a libvector.a libsparse.a BiCGStab
libshared.a : $(OBJS)
$(AR) $(ARFLAGS) $@ $?
$(RL) $(RLFLAGS) $@
libclock.a : $(OBJS_CLOCK)
$(AR) $(ARFLAGS) $@ $?
$(RL) $(RLFLAGS) $@
libvector.a : $(OBJS_VECTOR)
$(AR) $(ARFLAGS) $@ $?
$(RL) $(RLFLAGS) $@
libsparse.a : $(OBJS_SPARSE)
$(AR) $(ARFLAGS) $@ $?
$(RL) $(RLFLAGS) $@
BiCGStab: BiCGStab.o ToolsMPI.o matrix.o
$(CLINKER) $(LDFLAGS) -o BiCGStab BiCGStab.o ToolsMPI.o matrix.o $(LIBMKL) $(LIBLIST)
# ============================================================
.c.o:
echo compiling
$(CC) $(CFLAGS) -c $*.c
clean:
rm -f *.o *.a BiCGStab
# ============================================================
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
//#include "global.h"
//#include "debug.h"
#include "matrix.h"
//#include "cg_aux_conLabel.h"
// finite-difference discretization of a 3D Poisson equation with a 7-, 19- or 27-point stencil
void generate_Poisson3D(ptr_SparseMatrix A, const int p, const int stencil_points, int dspL, int dimL, int dim)
{
int p2 = p * p, i, j=0; //, pos=0;
int pos = 0;
int *vptr = A->vptr;
const int *stenc_c;
const double *stenc_v;
const int stenc_c7[] = { -p2, -p, -1, 0, 1, p, p2};
const double stenc_v7[] = { -1.0, -1.0, -1.0, 6.0, -1.0, -1.0, -1.0};
const double r = 1.0;
const int stenc_c19[] =
{
-p2-p, -p2-1, -p2+0, -p2+1, -p2+p,
-p-1, -p, -p+1, -1, 0, 1, p-1, p, p+1,
p2-p, p2-1, p2+0, p2+1, p2+p
};
const double stenc_v19[] =
{
-(1+r), -(1+r), -(8*r-4), -(1+r), -(1+r),
-2, -(6-2*r), -2, -(6-2*r), -(-32-16*r), -(6-2*r), -2, -(6-2*r), -2,
-(1+r), -(1+r), -(8*r-4), -(1+r), -(1+r)
};
const int stenc_c27[] =
{
-p2-p-1, -p2-p, -p2-p+1, -p2-1, -p2+0, -p2+1, -p2+p-1, -p2+p, -p2+p+1,
-p-1, -p, -p+1, -1, 0, 1, p-1, p, p+1,
p2-p-1, p2-p, p2-p+1, p2-1, p2+0, p2+1, p2+p-1, p2+p, p2+p+1
};
const double stenc_v27[] =
{
-(2+r), -(8-10*r), -(2+r), -(8-10*r), -(100*r-40), -(8-10*r), -(2+r), -(8-10*r), -(2+r),
-(20-2*r), -(80-20*r), -(20-2*r), -(80-20*r), -(-400-200*r), -(80-20*r), -(20-2*r), -(80-20*r), -(20-2*r),
-(2+r), -(8-10*r), -(2+r), -(8-10*r), -(100*r-40), -(8-10*r), -(2+r), -(8-10*r), -(2+r)
};
if( stencil_points == 7 )
{
stenc_c = stenc_c7;
stenc_v = stenc_v7;
}
else if( stencil_points == 19 )
{
stenc_c = stenc_c19;
stenc_v = stenc_v19;
}
else if( stencil_points == 27 )
{
stenc_c = stenc_c27;
stenc_v = stenc_v27;
}
else
// this should be impossible, but silences compiler warnings
return;
// to compute the nnz, note that a stencil point at distance |d| from the diagonal
// is excluded from the matrix on exactly d rows; every other stencil point appears on every row
// only the local part of the matrix is generated here
//for(j=start_row; j<end_row; j++)
printf("Generate matrix ---- dim: %d, dimL: %d, dspL: %d\n", dim, dimL, dspL);
for(j=0; j<dimL; j++) //M
{
vptr[j] = pos;
// printf("A->vptr[%d]: %d \n", j, A->vptr[j]);
for(i=0; i<stencil_points; i++){
int val = j + dspL + stenc_c[i];
//long val = j + dspL + stenc_c[i];
// printf("j: %d dspL: %d, stenc_c[%d]: %d, dim: %d \n", j, dspL, i, stenc_c[i], dim);
if( val >= 0 && val < dim )
{
A->vpos[pos] = val;
A->vval[pos] = stenc_v[i];
// printf("j: %d, stenc_c[%d]: %d, A->vpos[%d]: %d, A->vval[%d]: %f\n", j, i, stenc_c[i], pos, A->vpos[pos], pos, A->vval[pos]);
pos++;
}
}
}
// point to just beyond last element
//MA->vptr[j] = pos;
vptr[j] = pos;
//A->elemc = pos;
// A->vptr[j-start_row] = pos;
}
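/* Hedged sketch (added for illustration) of the nnz count described in the comment
   inside generate_Poisson3D: over the whole dim x dim matrix, a stencil offset at
   distance |d| from the diagonal is dropped on exactly d rows, so the global number
   of nonzeros is stencil_points*dim minus the sum of the absolute offsets. For the
   local block, stencil_points * dimL is a safe upper bound to pass to allocate_matrix. */
#include <stdlib.h>

int poisson3D_global_nnz (int dim, int stencil_points, const int *stenc_c)
{
    int nnz = stencil_points * dim;
    for (int i = 0; i < stencil_points; i++)
        nnz -= abs (stenc_c[i]);   /* an offset at distance |d| misses d rows */
    return nnz;
}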
// finite-difference discretization of a 3D Poisson equation with a 7-, 19- or 27-point stencil
void generate_Poisson3D_filled(ptr_SparseMatrix A, const int p, const int stencil_points, int band_width, int dspL, int dimL, int dim)
{
int p2 = p * p, i, j=0; //, pos=0;
int pos = 0;
int *vptr = A->vptr;
const double value = 0.1;
const int *stenc_c;
const double *stenc_v;
double eps = 0.0001;
const int stenc_c7[] = { -p2, -p, -1, 0, 1, p, p2};
const double stenc_v7[] = { -1.0, -1.0, -1.0, 6.0, -(1.0-eps), -(1.0-eps), -(1.0-eps)};
const double r = 1.0;
const int stenc_c19[] =
{
-p2-p, -p2-1, -p2+0, -p2+1, -p2+p,
-p-1, -p, -p+1, -1, 0, 1, p-1, p, p+1,
p2-p, p2-1, p2+0, p2+1, p2+p
};
const double stenc_v19[] =
{
-(1+r), -(1+r), -(8*r-4), -(1+r), -(1+r),
-2, -(6-2*r), -2, -(6-2*r), -(-32-16*r), -(6-2*r), -2, -(6-2*r), -2,
-(1+r), -(1+r), -(8*r-4), -(1+r), -(1+r)
};
const int stenc_c27[] =
{
-p2-p-1, -p2-p, -p2-p+1, -p2-1, -p2+0, -p2+1, -p2+p-1, -p2+p, -p2+p+1,
-p-1, -p, -p+1, -1, 0, 1, p-1, p, p+1,
p2-p-1, p2-p, p2-p+1, p2-1, p2+0, p2+1, p2+p-1, p2+p, p2+p+1
};
const double stenc_v27[] =
{
-(2+r), -(8-10*r), -(2+r), -(8-10*r), -(100*r-40), -(8-10*r+eps), -(2+r-eps), -(8-10*r+eps), -(2+r-eps),
-(20-2*r), -(80-20*r), -(20-2*r), -(80-20*r), -(-400-200*r), -(80-20*r+eps), -(20-2*r+eps), -(80-20*r+eps), -(20-2*r+eps),
-(2+r), -(8-10*r), -(2+r), -(8-10*r), -(100*r-40-eps), -(8-10*r+eps), -(2+r-eps), -(8-10*r+eps), -(2+r-eps)
};
if( stencil_points == 7 )
{
stenc_c = stenc_c7;
stenc_v = stenc_v7;
}
else if( stencil_points == 19 )
{
stenc_c = stenc_c19;
stenc_v = stenc_v19;
}
else if( stencil_points == 27 )
{
stenc_c = stenc_c27;
stenc_v = stenc_v27;
}
else
// this should be impossible, but silences compiler warnings
return;
// to compute the nnz, note that a stencil point at distance |d| from the diagonal
// is excluded from the matrix on exactly d rows; every other stencil point appears on every row
// only the local part of the matrix is generated here
//for(j=start_row; j<end_row; j++)
printf("Generate matrix ---- dim: %d, dimL: %d, dspL: %d, band_width: %d \n", dim, dimL, dspL, band_width);
for(j=0; j<dimL; j++) //M
{
int iii;
//long iii;
int jjj = j + dspL;
int prv = 0;
vptr[j] = pos;
//printf("j: %d, dspL: %d, pos: %d, dim: %d, band_width: %d\n", j, dspL, pos, dim, band_width);
//printf(stderr, "A->vptr[%d]: %ld \n", j, vptr[j]);
for(i=0; i<stencil_points; i++){
int val = j + dspL + stenc_c[i];
//long val = j + dspL + stenc_c[i];
//printf("j: %d, i: %d, val: %d, dspL: %d, pos: %d, stenc_c[%d]: %d, dim: %d, band_width: %d\n", j, i, val, dspL, pos, i, stenc_c[i], dim, band_width);
//printf("j: %d, val: %d, dspL: %d, pos: %ld , stenc_c[%d]: %d, dim: %d, band_width: %d \n", j, val, dspL, pos, i, stenc_c[i], dim, band_width);
// if( j + dspL + stenc_c[i] >= 0 && j + dspL + stenc_c[i] < dim )
if( val >= 0 && val < dim )
{
// Check whether val falls inside the band
// if( val >= (j-band_width) && val <= (j+band_width) ) {
if( val >= (jjj-band_width) && val <= (jjj+band_width) ) {
// Add the band elements that precede val
// int kk1 = ((j-band_width) < 0)? 0: (j-band_width);
int kk1 = ((jjj-band_width) < 0)? 0: (jjj-band_width);
if (prv != 0) kk1 = prv;
//printf("Entra en if kk1: %d, val: %d j: %d, band_width: %d, j-band_width: %d, j+band_width: %d \n", kk1, val, j, band_width, j-band_width, j+band_width);
for (iii=kk1; iii<val; iii++) {
A->vpos[pos] = iii;
A->vval[pos] = value;
pos++;
}
prv = val + 1;
// Choose the correct value for val
// if ( val == j ) {
if ( val == jjj ) {
//printf("Diagonal val: %d, pos: %ld \n", val, pos);
A->vpos[pos] = val;
A->vval[pos] = stenc_v[i] + band_width * value;
//A->vval[pos] = stenc_v[i] + (2 * band_width * value) / 20;
pos++;
} else {
//printf("No Diagonal val: %d, j: %ld \n", val, pos);
A->vpos[pos] = val;
A->vval[pos] = stenc_v[i] + value;
pos++;
}
// } else if (val < (j-band_width)) {
} else if (val < (jjj-band_width)) {
// Choose the correct value for val
A->vpos[pos] = val;
A->vval[pos] = stenc_v[i];
pos++;
// prv = val + 1;
} else {
// Add the band elements that precede val
// int kk2 = ((j+band_width) >= dim)? dim-1: (j+band_width);
int kk2 = ((jjj+band_width) >= dim)? dim-1: (jjj+band_width);
// if (prv == 0) prv = j+1;
if (prv == 0) prv = jjj+1;
//printf("No entra en if prv: %d, kk2: %d, val: %d, j: %d, band_width: %d, j-band_width: %d, j+band_width: %d \n", prv, kk2, val, j, band_width, j-band_width, j+band_width);
for (iii=prv; iii<=kk2; iii++) {
A->vpos[pos] = iii;
A->vval[pos] = value;
pos++;
}
prv = kk2 + 1;
// Choose the correct value for val
A->vpos[pos] = val;
A->vval[pos] = stenc_v[i];
pos++;
}
/*
// A->vpos[pos] = j + stenc_c[i] + dspL;
A->vpos[pos] = val;
A->vval[pos] = stenc_v[i];
// printf("j: %d, stenc_c[%d]: %d, A->vpos[%d]: %d, A->vval[%d]: %f\n", j, i, stenc_c[i], pos, A->vpos[pos], pos, A->vval[pos]);
pos++;
*/
}
}
// if (prv <= (j+band_width)) {
if (prv <= (jjj+band_width)) {
// int kk2 = ((j+band_width) >= dim)? dim-1: (j+band_width);
int kk2 = ((jjj+band_width) >= dim)? dim-1: (jjj+band_width);
// if (prv == 0) prv = j+1;
if (prv == 0) prv = jjj+1;
//printf("No entra en if prv: %d, kk2: %d, j: %d, band_width: %d, j-band_width: %d, j+band_width: %d \n", prv, kk2, j, band_width, j-band_width, j+band_width);
for (iii=prv; iii<=kk2; iii++) {
A->vpos[pos] = iii;
A->vval[pos] = value;
pos++;
}
}
}
// point to just beyond last element
//A->vptr[j] = pos;
vptr[j] = pos;
//A->elemc = pos;
// A->vptr[j-start_row] = pos;
printf("FIN Generate matrix ---- dim: %d, dimL: %d, dspL: %d, band_width: %d \n", dim, dimL, dspL, band_width);
}
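/* Hedged note (added for illustration): generate_Poisson3D_filled writes, for every
   local row, the band entries in [row-band_width, row+band_width] plus the stencil
   entries that fall outside the band, so the bound below is always sufficient for
   allocate_matrix; the exact count is smaller because indices outside [0, dim) are
   skipped and in-band stencil entries are not duplicated. */
static inline int poisson3D_filled_nnz_bound (int dimL, int band_width, int stencil_points)
{
    return dimL * (2 * band_width + 1 + stencil_points);
}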
void generate_Poisson3D_perm(ptr_SparseMatrix A, const int p, const int stencil_points, int init, int step, int dimL, int dim)
{
int p2 = p * p, i, j=0; //, pos=0;
int pos = 0;
int *vptr = A->vptr;
const int *stenc_c;
const double *stenc_v;
const int stenc_c7[] = { -p2, -p, -1, 0, 1, p, p2};
//const double stenc_v7[] = { 1.0, 1.0, 1.0,-6.0, 1.0, 1.0, 1.0};
const double stenc_v7[] = { -1.0, -1.0, -1.0, 6.0, -1.0, -1.0, -1.0};
const double r = 1.0;
const int stenc_c19[] =
{
-p2-p, -p2-1, -p2+0, -p2+1, -p2+p,
-p-1, -p, -p+1, -1, 0, 1, p-1, p, p+1,
p2-p, p2-1, p2+0, p2+1, p2+p
};
//const double stenc_v19[] =
// {
// 1+r, 1+r, 8*r-4, 1+r, 1+r,
// 2, 6-2*r, 2, 6-2*r, -32-16*r, 6-2*r, 2, 6-2*r, 2,
// 1+r, 1+r, 8*r-4, 1+r, 1+r
// };
const double stenc_v19[] =
{
-(1+r), -(1+r), -(8*r-4), -(1+r), -(1+r),
-2, -(6-2*r), -2, -(6-2*r), -(-32-16*r), -(6-2*r), -2, -(6-2*r), -2,
-(1+r), -(1+r), -(8*r-4), -(1+r), -(1+r)
};
const int stenc_c27[] =
{
-p2-p-1, -p2-p, -p2-p+1, -p2-1, -p2+0, -p2+1, -p2+p-1, -p2+p, -p2+p+1,
-p-1, -p, -p+1, -1, 0, 1, p-1, p, p+1,
p2-p-1, p2-p, p2-p+1, p2-1, p2+0, p2+1, p2+p-1, p2+p, p2+p+1
};
//const double stenc_v27[] =
// {
// 2+r, 8-10*r, 2+r, 8-10*r, 100*r-40, 8-10*r, 2+r, 8-10*r, 2+r,
// 20-2*r, 80-20*r, 20-2*r, 80-20*r, -400-200*r, 80-20*r, 20-2*r, 80-20*r, 20-2*r,
// 2+r, 8-10*r, 2+r, 8-10*r, 100*r-40, 8-10*r, 2+r, 8-10*r, 2+r
// };
const double stenc_v27[] =
{
-(2+r), -(8-10*r), -(2+r), -(8-10*r), -(100*r-40), -(8-10*r), -(2+r), -(8-10*r), -(2+r),
-(20-2*r), -(80-20*r), -(20-2*r), -(80-20*r), -(-400-200*r), -(80-20*r), -(20-2*r), -(80-20*r), -(20-2*r),
-(2+r), -(8-10*r), -(2+r), -(8-10*r), -(100*r-40), -(8-10*r), -(2+r), -(8-10*r), -(2+r)
};
if( stencil_points == 7 )
{
stenc_c = stenc_c7;
stenc_v = stenc_v7;
}
else if( stencil_points == 19 )
{
stenc_c = stenc_c19;
stenc_v = stenc_v19;
}
else if( stencil_points == 27 )
{
stenc_c = stenc_c27;
stenc_v = stenc_v27;
}
else
// this should be impossible, but silences compiler warnings
return;
// to compute the nnz, note that a stencil point at distance |d| from the diagonal
// is excluded from the matrix on exactly d rows; every other stencil point appears on every row
/*A->nnz = stencil_points * p3 ;
for(i=0; i<stencil_points; i++)
A->nnz -= abs(stenc_c[i]);*/ //M
//A->nnz = A->nnz / nProcs; //M
// only the local part of the matrix is generated here
//for(j=start_row; j<end_row; j++)
int dimB = (step == 1) ? 0: (dim / step);
int resB = (step == 1) ? 0: (dim % step);
int row=init;
// printf ("init = %d , step = %d , dimB = %d , dim = %d\n", init, step, dimB, dim);
for(j=0; j<dimL; j++) //M
// for(j=init; j<dim; j+=step) //M
{
//A->vptr[j-start_row] = pos;
//MA->vptr[j] = pos;
vptr[j] = pos;
// printf("A->vptr[%d]: %d \n", j, A->vptr[j]);
// printf ("(%d) row = %d , pos = %d\n", init, row-1, pos);
for(i=0; i<stencil_points; i++){
int val = row + stenc_c[i];
int val1 = (val / step);
int val2 = (val % step);
// int k = (val2 * dimB + val1) ;
int k = (val2 * dimB + val1 + ((val2 < resB)? val2: resB)) ;
//long k = (val2 * dimB + val1 + ((val2 < resB)? val2: resB)) ;
// printf("j: %d dspL: %d, stenc_c[%d]: %d, dim: %d \n", j, dspL, i, stenc_c[i], dim);
// if( k >= 0 && k < dim )
if( val >= 0 && val < dim )
{
A->vpos[pos] = k;
A->vval[pos] = stenc_v[i];
// printf("j: %d, stenc_c[%d]: %d, A->vpos[%d]: %d, A->vval[%d]: %f\n", j, i, stenc_c[i], pos, A->vpos[pos], pos, A->vval[pos]);
pos++;
}
}
row += step;
}
// point to just beyond last element
//M A->vptr[j] = pos;
vptr[j] = pos;
//A->elemc = pos;
// A->vptr[j-start_row] = pos;
}
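/* Hedged sketch (added for illustration) of the index mapping used in
   generate_Poisson3D_perm: a global row index val that is dealt out cyclically with
   stride step (block q owns rows q, q+step, q+2*step, ...) is renumbered into a block
   layout in which block q starts at q*dimB + min(q, resB). The small dim and step
   below are illustrative values only. */
#include <stdio.h>

int main (void)
{
    int dim = 10, step = 3;
    int dimB = (step == 1) ? 0 : (dim / step);   /* base block size              */
    int resB = (step == 1) ? 0 : (dim % step);   /* the first resB blocks get +1 */
    for (int val = 0; val < dim; val++) {
        int val1 = val / step;                   /* position inside the block    */
        int val2 = val % step;                   /* owning block (residue)       */
        int k = val2 * dimB + val1 + ((val2 < resB) ? val2 : resB);
        printf ("global row %d -> permuted index %d (block %d)\n", val, k, val2);
    }
    return 0;
}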
void allocate_matrix(const int m, const int n, const int nnz, ptr_SparseMatrix A)
{
A->dim1 = m;
A->dim2 = n;
//A->elemc = nnz;
//long *vptr = A->vptr;
//A->vptr = (int*)calloc((n+1), sizeof(int));
A->vptr = (int*)calloc((n+1), sizeof(int));
//vptr = (long*)calloc((n+1), sizeof(long));
A->vpos = (int*)calloc(nnz, sizeof(int));
//A->vpos = (long *)calloc(nnz, sizeof(long));
A->vval = (double*)calloc(nnz, sizeof(double));
/* if( ! A->vval )
{
fprintf(stderr, "Allocating vval failed !\n");
exit(2);
}
if (! A->vpos )
{
fprintf(stderr, "Allocating vpos failed !\n");
exit(2);
}
if (! A->vptr )
{
fprintf(stderr, "Allocating vptr failed !\n");
exit(2);
}*/
if( ! A->vval || ! A->vpos || ! A->vptr )
{
fprintf(stderr, "Allocating sparse matrix of size %d rows and %d non-zeros failed !\n", n, nnz);
exit(2);
}
fprintf(stderr, "Matrix allocated\n");
}
void ScaleFirstRowCol(SparseMatrix A, int despL, int dimL, int myId, int root, double factor){
// To generate ill-conditioned matrices
int i;
if (myId == root) {
for (i=A.vptr[0]; i< A.vptr[1]; i++)
A.vval[i] *= factor;
}
if (despL == 0) {
i = 0;
while((i < dimL) && (A.vpos[A.vptr[i]] == 0)) {
A.vval[A.vptr[i]] *= factor;
i++;
}
}
}
/*
void deallocate_matrix(Matrix *A)
{
free(A->r);
free(A->c);
if( A->v )
free(A->v);
}*/
#ifndef MATRIX_H_INCLUDED
#define MATRIX_H_INCLUDED
#include <stdio.h>
#include <SparseProduct.h>
typedef struct Matrix
{
int n, m, *c, *r;
long nnz;
double *v;
} Matrix;
typedef enum
{
FROM_FILE = 0,
POISSON3D
} matrix_type;
void generate_Poisson3D(ptr_SparseMatrix A, const int p, const int stencil_points, int dspL, int dimL, int dim);
// memory utility functions
void allocate_matrix(const int m, const int n, const int nnz, ptr_SparseMatrix A);
void generate_Poisson3D_filled(ptr_SparseMatrix A, const int p, const int stencil_points, int band_width, int dspL, int dimL, int dim);
void generate_Poisson3D_perm(ptr_SparseMatrix A, const int p, const int stencil_points, int init, int step, int dimL, int dim);
void ScaleFirstRowCol(SparseMatrix A, int despL, int dimL, int myId, int root, double factor);
#endif // MATRIX_H_INCLUDED
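/* Hedged usage sketch (added for illustration) of the routines declared above:
   allocate the local block with a safe nnz upper bound, fill it with the 7-point
   Poisson stencil and, optionally, scale the first row/column to worsen the
   conditioning. dimL, dspL and myId are assumed to describe the caller's row
   distribution; p is the grid size per dimension, so dim = p*p*p; the scaling
   factor is illustrative. ptr_SparseMatrix is assumed to be a pointer to
   SparseMatrix, as its use in matrix.c suggests. */
#include "matrix.h"

void build_local_poisson (ptr_SparseMatrix A, int p, int dimL, int dspL, int myId)
{
    int stencil_points = 7;
    int dim = p * p * p;                         /* global dimension            */
    int nnz_bound = stencil_points * dimL;       /* safe local upper bound      */

    allocate_matrix (dimL, dim, nnz_bound, A);   /* dim1 = rows, dim2 = columns */
    generate_Poisson3D (A, p, stencil_points, dspL, dimL, dim);
    ScaleFirstRowCol (*A, dspL, dimL, myId, 0, 1.0e3);
}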
#include <sys/time.h>
#include <sys/types.h>
#include <sys/times.h>
#include <unistd.h>
static double timetick;
static double tstart = 0.0;
static double ucpustart = 0.0;
static int first = 1;
void reloj (double *elapsed, double *ucpu)
{
struct tms cpu;
struct timeval tp;
// struct timezone tzp;
if(first) {
/* Initialize clock */
timetick = 1.0 / (double)(sysconf(_SC_CLK_TCK));
first = 0;
gettimeofday(&tp, NULL); // gettimeofday(&tp, &tzp);
tstart = (double)tp.tv_sec + (double)tp.tv_usec * 1.0e-6;
/* Initialize CPU time */
times(&cpu);
ucpustart = (double)(cpu.tms_utime + cpu.tms_cutime) * timetick;
/* Return values */
*elapsed = 0.0e0;
*ucpu = 0.0e0;
}
else {
/* Get clock time */
gettimeofday(&tp, NULL); // gettimeofday(&tp, &tzp);
*elapsed = (double)tp.tv_sec + (double)tp.tv_usec * 1.0e-6 - tstart;
/* Get CPU time */
times(&cpu);
*ucpu = (double)(cpu.tms_utime + cpu.tms_cutime) * timetick - ucpustart;
}
return;
}
#include <sys/time.h>
#include <sys/types.h>
#include <sys/times.h>
#include <unistd.h>
extern void reloj (double *elapsed, double *ucpu);
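/* Hedged usage sketch (added for illustration, assuming this header is reloj.h):
   the first call to reloj initializes the clocks and returns zeros; a later call
   returns the elapsed wall-clock time and the user CPU time, in seconds, since
   that first call. The loop is only a placeholder workload. */
#include <stdio.h>
#include "reloj.h"

int main (void)
{
    double elapsed, ucpu;
    reloj (&elapsed, &ucpu);                 /* first call: initialize, returns 0.0 */

    double acc = 0.0;
    for (long i = 0; i < 100000000L; i++)    /* placeholder work to be timed        */
        acc += 1.0e-8;

    reloj (&elapsed, &ucpu);                 /* second call: measured times         */
    printf ("elapsed = %f s, user cpu = %f s (acc = %f)\n", elapsed, ucpu, acc);
    return 0;
}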