/**
 * Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */
/**
 * NOTE(review): this header was inherited from the CUDA matrixMul sample and
 * does not describe this file, which implements GPU micro-benchmark kernels
 * (register / shared-memory / global-memory stress loops). Original sample
 * text kept below for reference:
 *
 * Matrix multiplication: C = A * B.
 * Host code.
 *
 * This sample implements matrix multiplication which makes use of shared
 * memory to ensure data reuse; the matrix multiplication is done using a
 * tiling approach. It has been written for clarity of exposition to
 * illustrate various CUDA programming principles, not with the goal of
 * providing the most performant generic kernel for matrix multiplication.
 * See also:
 * V. Volkov and J. Demmel, "Benchmarking GPUs to tune dense linear algebra,"
 * in Proc. 2008 ACM/IEEE Conf. on Supercomputing (SC '08),
 * Piscataway, NJ: IEEE Press, 2008, pp. Art. 31:1-11.
*/

// System includes
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "jetson_nano.h"

// CUDA runtime
#include <cuda_runtime.h>

// Helper functions and utilities to work with CUDA.
// NOTE(review): the angle-bracket include names were lost when this file was
// mangled; these are the standard CUDA-sample helper headers that provide
// checkCudaErrors() and the checkCmdLineFlag()/getCmdLineArgument*() family.
// Confirm against the project's build files.
#include <helper_functions.h>
#include <helper_cuda.h>
#include <helper_string.h>
#include "input_device.h"

#define FREQ 921600000                  // nominal GPU clock frequency (Hz)
#define T 1                             // default run time scale (seconds)
#define BITSNOSIGNIFICATIVOS 16         // low clock bits discarded as noise
#define CYCLES (T*(FREQ) >> BITSNOSIGNIFICATIVOS)
#define QUATUMINTERACIONES 1000         // iterations between clock polls
#define SIZEROW 1                       // btype elements per thread in 'global'

typedef int btype;
typedef btype *btypePtr;

// Truncated 64-bit device cycle counter: clock64() with the non-significant
// low bits shifted away so elapsed-cycle comparisons fit in 32 bits.
#define myclock() (int) (clock64() >> BITSNOSIGNIFICATIVOS)

/**
 * Micro-kernel that performs the computation using only registers.
 * Runs a fixed number of iterations 'nit'.
 *
 * Launch: 1-D grid, 1-D blocks; vadd must hold gridDim.x*blockDim.x chars.
 * Each thread writes 1 into its slot of vadd iff the arithmetic chain stayed
 * consistent (it is algebraically invariant: (id*id + id - id)/id == id).
 */
__global__ void microKernel_reg_iter(unsigned int nit, char *vadd)
{
    btype regin, regout, local;
    btype id = (blockIdx.x*blockDim.x + threadIdx.x+1);  // 1-based thread id

    regin = id;
    local = id;
#pragma unroll 2
    for (unsigned int op = 0; op < nit; ++op) {
        regout = regin*local + id;
        local = (regout-local)/regin;
    }
    vadd[(int) id - 1] = (local == id);
}

/**
 * Register micro-kernel, time-bounded version: iterates in bursts of
 * QUATUMINTERACIONES operations until 'cycles' truncated clock units elapse.
 */
__global__ void microKernel_reg_time(unsigned int cycles, char *vadd)
{
    // BUG FIX: 'fin' and 'ahora' existed only in commented-out declarations,
    // so this kernel did not compile. Declared as in the sibling *_time
    // kernels below.
    unsigned int fin, ahora;
    btype regin, regout, local;
    btype id = (blockIdx.x*blockDim.x + threadIdx.x+1);  // 1-based thread id

    ahora = myclock();
    regin = id;
    local = id;
    fin = ahora + cycles;
    while (ahora < fin) {
        ahora = myclock();
#pragma unroll 2
        for (unsigned int op = 0; op < QUATUMINTERACIONES; ++op) {
            regout = regin*local + id;
            local = (regout-local)/regin;
        }
    }
    vadd[(int) id - 1] = (local == id);
}

/**
 * Micro-kernel that performs the computation through global memory.
 * 'global' is volatile so every access really goes through the memory
 * system (and its caches) instead of being register-allocated.
 */
__global__ void microKernel_global_iter(int nit, char *vadd,
                                        volatile btype *global)
{
    btype regin, regout;
    btype id = (blockIdx.x*blockDim.x + threadIdx.x+1);  // 1-based thread id
    // BUG FIX: the original used SIZEROW*id with the 1-based id, so the last
    // thread indexed one element past the SIZEROW*vsize allocation.
    int idInt = SIZEROW*((int) id - 1);

    regin = id;
    global[idInt] = id;
#pragma unroll 2
    for (int op = 0; op < nit; ++op) {
        regout = regin*global[idInt] + id;
        global[idInt] = (regout-global[idInt])/regin;
    }
    vadd[(int) id - 1] = (global[idInt] == id);
}

/**
 * Global-memory micro-kernel, time-bounded version.
 */
__global__ void microKernel_global_time(unsigned int cycles, char *vadd,
                                        volatile btype *global)
{
    unsigned int fin, ahora;
    btype regin, regout;
    btype id = (blockIdx.x*blockDim.x + threadIdx.x+1);  // 1-based thread id
    // BUG FIX: 0-based index, see microKernel_global_iter.
    volatile int idInt = SIZEROW*((int) id - 1);

    ahora = myclock();
    regin = id;
    fin = ahora + cycles;
    global[idInt] = id;
    while (ahora < fin) {
        ahora = myclock();
#pragma unroll 2
        for (unsigned int op = 0; op < QUATUMINTERACIONES; ++op) {
            regout = regin*global[idInt] + id;
            global[idInt] = (regout-global[idInt])/regin;
        }
    }
    vadd[(int) id - 1] = (global[idInt] == id);
}

/**
 * Micro-kernel that performs the computation using shared memory.
 * Requires blockDim.x*sizeof(btype) bytes of dynamic shared memory
 * (each thread only touches its own slot, so no __syncthreads() is needed).
 */
__global__ void microKernel_shared_iter(unsigned int nit, char *vadd)
{
    btype regin, regout;
    volatile btype id = (btype) (blockIdx.x*blockDim.x + threadIdx.x + 1);
    volatile extern __shared__ btype sh[];

    regin = id;
    sh[threadIdx.x] = id;
#pragma unroll 2
    for (unsigned int op = 0; op < nit; ++op) {
        regout = regin*sh[threadIdx.x] + id;
        sh[threadIdx.x] = (regout-sh[threadIdx.x])/regin;
    }
    vadd[(int) id - 1] = (sh[threadIdx.x] == id);
}

/**
 * Shared-memory micro-kernel, time-bounded version.
 * Requires blockDim.x*sizeof(btype) bytes of dynamic shared memory.
 */
__global__ void microKernel_shared_time(unsigned int cycles, char *vadd)
{
    unsigned int fin, ahora;
    btype regin, regout;
    volatile btype id = (btype) (blockIdx.x*blockDim.x + threadIdx.x + 1);
    volatile extern __shared__ btype sh[];

    ahora = myclock();
    regin = id;
    sh[threadIdx.x] = id;
    fin = ahora + cycles;
    while (ahora < fin) {
        ahora = myclock();
#pragma unroll 2
        for (int op = 0; op < QUATUMINTERACIONES; ++op) {
            regout = regin*sh[threadIdx.x] + id;
            sh[threadIdx.x] = (regout-sh[threadIdx.x])/regin;
        }
    }
    vadd[(int) id - 1] = (sh[threadIdx.x] == id);
}

/**
 * Host-side result check: every kernel thread must have written 1 (true)
 * into its slot of h_vadd. Returns true iff all vsize entries are 1.
 */
bool check_error(char *h_vadd, int vsize)
{
    int sum = 0;
    for (int i = 0; i < vsize; i++)
        sum += h_vadd[i];
    return (sum == vsize);
}

/**
 * Allocate the buffers, launch the micro-kernel selected by 'bench'
 * ("shm", "glb" or "reg"), time it with CUDA events and verify the result.
 *
 * @param bench       benchmark name: "shm", "glb" or "reg"
 * @param grid        number of blocks (1-D)
 * @param blk         threads per block (1-D)
 * @param nitocycles  iteration count (time==0) or truncated cycles (time!=0)
 * @param time        non-zero selects the time-bounded kernel variant
 * @return 1 when every thread produced the expected value, 0 otherwise
 */
int launch_kernel(char *bench, int grid, int blk, unsigned int nitocycles, int time)
{
    char *h_vadd;
    char *d_vadd;
    btypePtr d_global = NULL;   // only allocated for the "glb" benchmark
    int vsize = grid*blk;

    // Allocate CUDA events that we'll use for timing
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    h_vadd = (char *) malloc(vsize*sizeof(char));
    checkCudaErrors(cudaMalloc(&d_vadd, vsize*sizeof(char)));
    checkCudaErrors(cudaDeviceSynchronize());

    // Record the start event
    checkCudaErrors(cudaEventRecord(start));

    // Execute the kernel.
    // BUG FIX: the launch configurations had been mangled to "<<>>"; the
    // shared-memory kernels additionally need blk*sizeof(btype) bytes of
    // dynamic shared memory for their 'extern __shared__' array.
    if (!strcmp(bench, "shm")) {
        printf("shm");
        if (time) {
            printf("time \n");
            microKernel_shared_time<<<grid, blk, blk*sizeof(btype)>>>(nitocycles, d_vadd);
        } else {
            printf("iterations\n");
            microKernel_shared_iter<<<grid, blk, blk*sizeof(btype)>>>(nitocycles, d_vadd);
        }
    } else if (!strcmp(bench, "glb")) {
        printf("glb");
        checkCudaErrors(cudaMalloc(&d_global, SIZEROW*vsize*sizeof(btype)));
        if (time) {
            printf("time\n");
            microKernel_global_time<<<grid, blk>>>(nitocycles, d_vadd, d_global);
        } else {
            printf("iterations\n");
            microKernel_global_iter<<<grid, blk>>>(nitocycles, d_vadd, d_global);
        }
    } else if (!strcmp(bench, "reg")) {
        printf("reg");
        if (time) {
            printf("time\n");
            microKernel_reg_time<<<grid, blk>>>(nitocycles, d_vadd);
        } else {
            printf("iterations\n");
            microKernel_reg_iter<<<grid, blk>>>(nitocycles, d_vadd);
        }
    } else {
        // Previously an unknown name silently launched nothing and then
        // "checked" an uninitialized buffer.
        fprintf(stderr, "Unknown benchmark '%s'\n", bench);
    }
    checkCudaErrors(cudaGetLastError());   // catch bad launch configurations

    // Record the stop event
    checkCudaErrors(cudaDeviceSynchronize());
    checkCudaErrors(cudaEventRecord(stop));

    // Wait for the stop event to complete
    checkCudaErrors(cudaEventSynchronize(stop));

    float msecTotal = 0.0f;
    checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));

    // Compute and print the performance
    printf("Elapsed time= %.2f\n", msecTotal);

    checkCudaErrors(cudaMemcpy(h_vadd, d_vadd, vsize*sizeof(char),
                               cudaMemcpyDeviceToHost));

    printf("Checking computed result for correctness:\n ");
    bool correct = check_error(h_vadd, vsize);

    // Clean up memory
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    checkCudaErrors(cudaFree(d_vadd));
    if (d_global != NULL) {
        checkCudaErrors(cudaFree(d_global));
    }
    free(h_vadd);

    return correct;
}

/**
 * Program main. Command line:
 *   -bench=shm|glb|reg  -grid=G  -blk=B  and either -nit=N or -time=S
 */
int main(int argc, char **argv)
{
    unsigned int grid = 0, blk = 0, nitocycles = 0;
    long int frec;
    char *bench = NULL;   // set by getCmdLineArgumentString below
    bool time = false;
    unsigned long int long_nitocycles;

    if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
        checkCmdLineFlag(argc, (const char **)argv, "?")) {
        printf("Usage -bench=bench_name ('shm', 'glb', 'reg')\n");
        printf("      -grid=grid_size (Grid size)\n");
        printf("      -blk=block_size (Thread block size)\n");
        printf("      -nit=number_its (number of iterations)\n");
        printf("      -time=time (time to run the microbenchark)\n");
        exit(EXIT_SUCCESS);
    }

    frec = frec_now();  // current GPU frequency, used to turn seconds into cycles
    // BUG FIX: frec is a (signed) long int, so %ld rather than %lu.
    printf("GPU frequency: %ld \n", frec);

    // BUG FIX below: the original printed "FAIL: ..." for missing required
    // arguments but then kept running with uninitialized values; now each
    // missing argument terminates the program.
    if (checkCmdLineFlag(argc, (const char **)argv, "bench")) {
        getCmdLineArgumentString(argc, (const char **)argv, "bench", &bench);
    } else {
        printf("FAIL: bench\n");
        exit(EXIT_FAILURE);
    }

    // Grid size
    if (checkCmdLineFlag(argc, (const char **)argv, "grid")) {
        grid = getCmdLineArgumentInt(argc, (const char **)argv, "grid");
    } else {
        printf("FAIL: grid\n");
        exit(EXIT_FAILURE);
    }

    // Thread block size
    if (checkCmdLineFlag(argc, (const char **)argv, "blk")) {
        blk = getCmdLineArgumentInt(argc, (const char **)argv, "blk");
    } else {
        printf("FAIL: blk\n");
        exit(EXIT_FAILURE);
    }

    time = false;
    // Kernel time: convert the requested seconds into truncated clock units.
    if (checkCmdLineFlag(argc, (const char **)argv, "time")) {
        long_nitocycles = (unsigned long int)
            (frec * getCmdLineArgumentFloat(argc, (const char **)argv, "time"));
        nitocycles = (unsigned int) (long_nitocycles >> BITSNOSIGNIFICATIVOS);
        time = true;
    } else
    // Number of iterations
    if (checkCmdLineFlag(argc, (const char **)argv, "nit")) {
        nitocycles = getCmdLineArgumentInt(argc, (const char **)argv, "nit");
    } else {
        printf("FAIL:nit and/or time\n");
        exit(EXIT_FAILURE);
    }

    printf("microKernel=%s, grid: %u, blk: %u, nit o cycles: %u\n",
           bench, grid, blk, nitocycles);

    int kernel_result = launch_kernel(bench, grid, blk, nitocycles, time);
    printf("Launch result: %d\n", kernel_result);
    exit(!kernel_result);   // process exit code 0 on success
}