/**
 * Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */
/**
 * NOTE(review): this header was inherited from the CUDA matrixMul sample and
 * does not describe this file, which implements GPU micro-benchmark kernels
 * (register / shared-memory / global-memory stress loops). Original sample
 * text kept below for reference:
 *
 * Matrix multiplication: C = A * B.
 * Host code.
 *
 * This sample implements matrix multiplication which makes use of shared
 * memory to ensure data reuse; the matrix multiplication is done using a
 * tiling approach. It has been written for clarity of exposition to
 * illustrate various CUDA programming principles, not with the goal of
 * providing the most performant generic kernel for matrix multiplication.
 * See also:
 * V. Volkov and J. Demmel, "Benchmarking GPUs to tune dense linear algebra,"
 * in Proc. 2008 ACM/IEEE Conf. on Supercomputing (SC '08),
 * Piscataway, NJ: IEEE Press, 2008, pp. Art. 31:1-11.
*/

// System includes
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "jetson_nano.h"

// CUDA runtime
#include <cuda_runtime.h>

// Helper functions and utilities to work with CUDA.
// NOTE(review): the angle-bracket include names were lost when this file was
// mangled; these are the standard CUDA-sample helper headers that provide
// checkCudaErrors() and the checkCmdLineFlag()/getCmdLineArgument*() family.
// Confirm against the project's build files.
#include <helper_functions.h>
#include <helper_cuda.h>
#include <helper_string.h>
#include "input_device.h"

#define FREQ 921600000                  // nominal GPU clock frequency (Hz)
#define T 1                             // default run time scale (seconds)
#define BITSNOSIGNIFICATIVOS 16         // low clock bits discarded as noise
#define CYCLES (T*(FREQ) >> BITSNOSIGNIFICATIVOS)
#define QUATUMINTERACIONES 1000         // iterations between clock polls
#define SIZEROW 1                       // btype elements per thread in 'global'

typedef int btype;
typedef btype *btypePtr;

// Truncated 64-bit device cycle counter: clock64() with the non-significant
// low bits shifted away so elapsed-cycle comparisons fit in 32 bits.
#define myclock() (int) (clock64() >> BITSNOSIGNIFICATIVOS)

/**
 * Micro-kernel that performs the computation using only registers.
 * Runs a fixed number of iterations 'nit'.
 *
 * Launch: 1-D grid, 1-D blocks; vadd must hold gridDim.x*blockDim.x chars.
 * Each thread writes 1 into its slot of vadd iff the arithmetic chain stayed
 * consistent (it is algebraically invariant: (id*id + id - id)/id == id).
 */
__global__ void microKernel_reg_iter(unsigned int nit, char *vadd)
{
    btype regin, regout, local;
    btype id = (blockIdx.x*blockDim.x + threadIdx.x+1);  // 1-based thread id

    regin = id;
    local = id;
#pragma unroll 2
    for (unsigned int op = 0; op < nit; ++op) {
        regout = regin*local + id;
        local = (regout-local)/regin;
    }
    vadd[(int) id - 1] = (local == id);
}

/**
 * Register micro-kernel, time-bounded version: iterates in bursts of
 * QUATUMINTERACIONES operations until 'cycles' truncated clock units elapse.
 */
__global__ void microKernel_reg_time(unsigned int cycles, char *vadd)
{
    // BUG FIX: 'fin' and 'ahora' existed only in commented-out declarations,
    // so this kernel did not compile. Declared as in the sibling *_time
    // kernels below.
    unsigned int fin, ahora;
    btype regin, regout, local;
    btype id = (blockIdx.x*blockDim.x + threadIdx.x+1);  // 1-based thread id

    ahora = myclock();
    regin = id;
    local = id;
    fin = ahora + cycles;
    while (ahora < fin) {
        ahora = myclock();
#pragma unroll 2
        for (unsigned int op = 0; op < QUATUMINTERACIONES; ++op) {
            regout = regin*local + id;
            local = (regout-local)/regin;
        }
    }
    vadd[(int) id - 1] = (local == id);
}

/**
 * Micro-kernel that performs the computation through global memory.
 * 'global' is volatile so every access really goes through the memory
 * system (and its caches) instead of being register-allocated.
 */
__global__ void microKernel_global_iter(int nit, char *vadd,
                                        volatile btype *global)
{
    btype regin, regout;
    btype id = (blockIdx.x*blockDim.x + threadIdx.x+1);  // 1-based thread id
    // BUG FIX: the original used SIZEROW*id with the 1-based id, so the last
    // thread indexed one element past the SIZEROW*vsize allocation.
    int idInt = SIZEROW*((int) id - 1);

    regin = id;
    global[idInt] = id;
#pragma unroll 2
    for (int op = 0; op < nit; ++op) {
        regout = regin*global[idInt] + id;
        global[idInt] = (regout-global[idInt])/regin;
    }
    vadd[(int) id - 1] = (global[idInt] == id);
}

/**
 * Global-memory micro-kernel, time-bounded version.
 */
__global__ void microKernel_global_time(unsigned int cycles, char *vadd,
                                        volatile btype *global)
{
    unsigned int fin, ahora;
    btype regin, regout;
    btype id = (blockIdx.x*blockDim.x + threadIdx.x+1);  // 1-based thread id
    // BUG FIX: 0-based index, see microKernel_global_iter.
    volatile int idInt = SIZEROW*((int) id - 1);

    ahora = myclock();
    regin = id;
    fin = ahora + cycles;
    global[idInt] = id;
    while (ahora < fin) {
        ahora = myclock();
#pragma unroll 2
        for (unsigned int op = 0; op < QUATUMINTERACIONES; ++op) {
            regout = regin*global[idInt] + id;
            global[idInt] = (regout-global[idInt])/regin;
        }
    }
    vadd[(int) id - 1] = (global[idInt] == id);
}

/**
 * Micro-kernel that performs the computation using shared memory.
 * Requires blockDim.x*sizeof(btype) bytes of dynamic shared memory
 * (each thread only touches its own slot, so no __syncthreads() is needed).
 */
__global__ void microKernel_shared_iter(unsigned int nit, char *vadd)
{
    btype regin, regout;
    volatile btype id = (btype) (blockIdx.x*blockDim.x + threadIdx.x + 1);
    volatile extern __shared__ btype sh[];

    regin = id;
    sh[threadIdx.x] = id;
#pragma unroll 2
    for (unsigned int op = 0; op < nit; ++op) {
        regout = regin*sh[threadIdx.x] + id;
        sh[threadIdx.x] = (regout-sh[threadIdx.x])/regin;
    }
    vadd[(int) id - 1] = (sh[threadIdx.x] == id);
}

/**
 * Shared-memory micro-kernel, time-bounded version.
 * Requires blockDim.x*sizeof(btype) bytes of dynamic shared memory.
 */
__global__ void microKernel_shared_time(unsigned int cycles, char *vadd)
{
    unsigned int fin, ahora;
    btype regin, regout;
    volatile btype id = (btype) (blockIdx.x*blockDim.x + threadIdx.x + 1);
    volatile extern __shared__ btype sh[];

    ahora = myclock();
    regin = id;
    sh[threadIdx.x] = id;
    fin = ahora + cycles;
    while (ahora < fin) {
        ahora = myclock();
#pragma unroll 2
        for (int op = 0; op < QUATUMINTERACIONES; ++op) {
            regout = regin*sh[threadIdx.x] + id;
            sh[threadIdx.x] = (regout-sh[threadIdx.x])/regin;
        }
    }
    vadd[(int) id - 1] = (sh[threadIdx.x] == id);
}

/**
 * Host-side result check: every kernel thread must have written 1 (true)
 * into its slot of h_vadd. Returns true iff all vsize entries are 1.
 */
bool check_error(char *h_vadd, int vsize)
{
    int sum = 0;
    for (int i = 0; i < vsize; i++)
        sum += h_vadd[i];
    return (sum == vsize);
}

/**
 * Allocate the buffers, launch the micro-kernel selected by 'bench'
 * ("shm", "glb" or "reg"), time it with CUDA events and verify the result.
 *
 * @param bench       benchmark name: "shm", "glb" or "reg"
 * @param grid        number of blocks (1-D)
 * @param blk         threads per block (1-D)
 * @param nitocycles  iteration count (time==0) or truncated cycles (time!=0)
 * @param time        non-zero selects the time-bounded kernel variant
 * @return 1 when every thread produced the expected value, 0 otherwise
 */
int launch_kernel(char *bench, int grid, int blk, unsigned int nitocycles, int time)
{
    char *h_vadd;
    char *d_vadd;
    btypePtr d_global = NULL;   // only allocated for the "glb" benchmark
    int vsize = grid*blk;

    // Allocate CUDA events that we'll use for timing
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    h_vadd = (char *) malloc(vsize*sizeof(char));
    checkCudaErrors(cudaMalloc(&d_vadd, vsize*sizeof(char)));
    checkCudaErrors(cudaDeviceSynchronize());

    // Record the start event
    checkCudaErrors(cudaEventRecord(start));

    // Execute the kernel.
    // BUG FIX: the launch configurations had been mangled to "<<>>"; the
    // shared-memory kernels additionally need blk*sizeof(btype) bytes of
    // dynamic shared memory for their 'extern __shared__' array.
    if (!strcmp(bench, "shm")) {
        printf("shm");
        if (time) {
            printf("time \n");
            microKernel_shared_time<<<grid, blk, blk*sizeof(btype)>>>(nitocycles, d_vadd);
        } else {
            printf("iterations\n");
            microKernel_shared_iter<<<grid, blk, blk*sizeof(btype)>>>(nitocycles, d_vadd);
        }
    } else if (!strcmp(bench, "glb")) {
        printf("glb");
        checkCudaErrors(cudaMalloc(&d_global, SIZEROW*vsize*sizeof(btype)));
        if (time) {
            printf("time\n");
            microKernel_global_time<<<grid, blk>>>(nitocycles, d_vadd, d_global);
        } else {
            printf("iterations\n");
            microKernel_global_iter<<<grid, blk>>>(nitocycles, d_vadd, d_global);
        }
    } else if (!strcmp(bench, "reg")) {
        printf("reg");
        if (time) {
            printf("time\n");
            microKernel_reg_time<<<grid, blk>>>(nitocycles, d_vadd);
        } else {
            printf("iterations\n");
            microKernel_reg_iter<<<grid, blk>>>(nitocycles, d_vadd);
        }
    } else {
        // Previously an unknown name silently launched nothing and then
        // "checked" an uninitialized buffer.
        fprintf(stderr, "Unknown benchmark '%s'\n", bench);
    }
    checkCudaErrors(cudaGetLastError());   // catch bad launch configurations

    // Record the stop event
    checkCudaErrors(cudaDeviceSynchronize());
    checkCudaErrors(cudaEventRecord(stop));

    // Wait for the stop event to complete
    checkCudaErrors(cudaEventSynchronize(stop));

    float msecTotal = 0.0f;
    checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));

    // Compute and print the performance
    printf("Elapsed time= %.2f\n", msecTotal);

    checkCudaErrors(cudaMemcpy(h_vadd, d_vadd, vsize*sizeof(char),
                               cudaMemcpyDeviceToHost));

    printf("Checking computed result for correctness:\n ");
    bool correct = check_error(h_vadd, vsize);

    // Clean up memory
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    checkCudaErrors(cudaFree(d_vadd));
    if (d_global != NULL) {
        checkCudaErrors(cudaFree(d_global));
    }
    free(h_vadd);

    return correct;
}

/**
 * Program main. Command line:
 *   -bench=shm|glb|reg  -grid=G  -blk=B  and either -nit=N or -time=S
 */
int main(int argc, char **argv)
{
    unsigned int grid = 0, blk = 0, nitocycles = 0;
    long int frec;
    char *bench = NULL;   // set by getCmdLineArgumentString below
    bool time = false;
    unsigned long int long_nitocycles;

    if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
        checkCmdLineFlag(argc, (const char **)argv, "?")) {
        printf("Usage -bench=bench_name ('shm', 'glb', 'reg')\n");
        printf("      -grid=grid_size (Grid size)\n");
        printf("      -blk=block_size (Thread block size)\n");
        printf("      -nit=number_its (number of iterations)\n");
        printf("      -time=time (time to run the microbenchark)\n");
        exit(EXIT_SUCCESS);
    }

    frec = frec_now();  // current GPU frequency, used to turn seconds into cycles
    // BUG FIX: frec is a (signed) long int, so %ld rather than %lu.
    printf("GPU frequency: %ld \n", frec);

    // BUG FIX below: the original printed "FAIL: ..." for missing required
    // arguments but then kept running with uninitialized values; now each
    // missing argument terminates the program.
    if (checkCmdLineFlag(argc, (const char **)argv, "bench")) {
        getCmdLineArgumentString(argc, (const char **)argv, "bench", &bench);
    } else {
        printf("FAIL: bench\n");
        exit(EXIT_FAILURE);
    }

    // Grid size
    if (checkCmdLineFlag(argc, (const char **)argv, "grid")) {
        grid = getCmdLineArgumentInt(argc, (const char **)argv, "grid");
    } else {
        printf("FAIL: grid\n");
        exit(EXIT_FAILURE);
    }

    // Thread block size
    if (checkCmdLineFlag(argc, (const char **)argv, "blk")) {
        blk = getCmdLineArgumentInt(argc, (const char **)argv, "blk");
    } else {
        printf("FAIL: blk\n");
        exit(EXIT_FAILURE);
    }

    time = false;
    // Kernel time: convert the requested seconds into truncated clock units.
    if (checkCmdLineFlag(argc, (const char **)argv, "time")) {
        long_nitocycles = (unsigned long int)
            (frec * getCmdLineArgumentFloat(argc, (const char **)argv, "time"));
        nitocycles = (unsigned int) (long_nitocycles >> BITSNOSIGNIFICATIVOS);
        time = true;
    } else
    // Number of iterations
    if (checkCmdLineFlag(argc, (const char **)argv, "nit")) {
        nitocycles = getCmdLineArgumentInt(argc, (const char **)argv, "nit");
    } else {
        printf("FAIL:nit and/or time\n");
        exit(EXIT_FAILURE);
    }

    printf("microKernel=%s, grid: %u, blk: %u, nit o cycles: %u\n",
           bench, grid, blk, nitocycles);

    int kernel_result = launch_kernel(bench, grid, blk, nitocycles, time);
    printf("Launch result: %d\n", kernel_result);
    exit(!kernel_result);   // process exit code 0 on success
}