micro.cu 10.6 KB
Newer Older
German Leon's avatar
German Leon committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
/**
 * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

/**
 * Matrix multiplication: C = A * B.
 * Host code.
 *
 * This sample implements matrix multiplication which makes use of shared memory
 * to ensure data reuse, the matrix multiplication is done using tiling approach.
 * It has been written for clarity of exposition to illustrate various CUDA programming
 * principles, not with the goal of providing the most performant generic kernel for matrix multiplication.
 * See also:
 * V. Volkov and J. Demmel, "Benchmarking GPUs to tune dense linear algebra,"
 * in Proc. 2008 ACM/IEEE Conf. on Supercomputing (SC '08),
 * Piscataway, NJ: IEEE Press, 2008, pp. Art. 31:1-11.
 */

// System includes
#include <stdio.h>
#include <stdbool.h>
#include <assert.h>
#include "jetson_nano.h"
// CUDA runtime
#include <cuda_runtime.h>

// Helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h>
#include <cuda_fp16.h>

#include "input_device.h"
#define FREQ 921600000
#define T 1
#define BITSNOSIGNIFICATIVOS 16
#define CYCLES (T*(FREQ) >> BITSNOSIGNIFICATIVOS)
#define QUATUMINTERACIONES 1000
#define SIZEROW 1
typedef int  btype;
typedef btype *btypePtr;

#define myclock() (int) (clock64() >> BITSNOSIGNIFICATIVOS)

/**
 * Micro Kernel that performs the computation using only registers 
 */
__global__ void microKernel_reg_iter (unsigned int nit, char *vadd) {

    btype regin, regout, local;
    btype id = (blockIdx.x*blockDim.x + threadIdx.x+1);

    regin = id;
    local = id;
#pragma unroll 2 
    for (int op = 0; op < nit; ++op) {
      regout = regin*local + id;
      local = (regout-local)/regin;
    }
    vadd[(int) id - 1] = (local == id);

}

/**
 * Micro Kernel that performs the computation using only registers 
 */
__global__ void microKernel_reg_time (unsigned int cycles, char *vadd) {
    //long long 
   unsigned int fin,ahora;
    //clock_t start,ahora;
    btype regin, regout, local;
    btype id = (blockIdx.x*blockDim.x + threadIdx.x+1);

    ahora=myclock();
    regin = id;
    local = id;
    //fin=ahora+CYCLES;
    fin=ahora+cycles;

    while (ahora < fin  )  
     {   
     ahora=myclock();
     #pragma unroll 2
     for (unsigned int op=0; op< QUATUMINTERACIONES;++op){
      regout = regin*local + id;
      
      local = (regout-local)/regin; 
         }  
    } 
  
    vadd[(int) id - 1] = (local == id);

}
/**
 * Micro Kernel that performs the computation using global memory (and cache)
 */
__global__ void microKernel_global_iter(int nit, char *vadd, volatile btype *global) {
    btype regin, regout;
    btype id = (blockIdx.x*blockDim.x + threadIdx.x+1);
    int idInt = SIZEROW*(int) id;

    regin = id;
    global[idInt] = id;
#pragma unroll 2 
    for (int op = 0; op < nit; ++op) {
      regout = regin*global[idInt] + id;
      global[idInt] = (regout-global[idInt])/regin;
    }
    vadd[(int) id - 1] = ( global[idInt] == id );
}

__global__ void microKernel_global_time(unsigned int cycles, char *vadd, volatile btype *global) {
    unsigned  int fin,ahora;
    btype regin, regout;
    btype id = (blockIdx.x*blockDim.x + threadIdx.x+1);
    volatile int idInt = SIZEROW*(int) id;

     ahora=myclock();
    regin = id;
    fin=ahora+cycles;
    global[idInt] = id;
     while (ahora < fin  )  
     {   
     ahora=myclock();
     #pragma unroll 2 
    for (unsigned  int op = 0; op < QUATUMINTERACIONES; ++op) {
      regout = regin*global[idInt] + id;
      global[idInt] = (regout-global[idInt])/regin;
    }
    }
    vadd[(int) id - 1] = ( global[idInt] == id );
}
/**
 * Micro Kernel that performs the computation using shared memory
 */
__global__ void microKernel_shared_iter(unsigned int nit, char *vadd) {
    
  
    btype regin, regout;
    volatile btype id = (btype) (blockIdx.x*blockDim.x + threadIdx.x + 1);

    volatile extern __shared__ btype sh[];

    regin = id;
    sh[threadIdx.x] = id;
#pragma unroll 2 
    for (unsigned int op = 0; op < nit; ++op) {
      regout = regin*sh[threadIdx.x] + id;
      sh[threadIdx.x] = (regout-sh[threadIdx.x])/regin;
    }
    vadd[(int) id - 1 ] = (sh[threadIdx.x] == id);
}

__global__ void microKernel_shared_time (unsigned int cycles, char *vadd) {
    
    unsigned int fin,ahora;
    btype regin, regout;
    volatile btype id = (btype) (blockIdx.x*blockDim.x + threadIdx.x + 1);

    volatile extern __shared__ btype sh[];
    ahora=myclock();
    regin = id;
    sh[threadIdx.x] = id;
    //fin=ahora+CYCLES;
    fin=ahora+cycles;

    while (ahora < fin  )  
     {   
     ahora=myclock();
     #pragma unroll 2 
     for (int op = 0; op < QUATUMINTERACIONES; ++op) {
      regout = regin*sh[threadIdx.x] + id;
      sh[threadIdx.x] = (regout-sh[threadIdx.x])/regin;
      }
     } 
    vadd[(int) id - 1 ] = (sh[threadIdx.x] == id);
}

bool check_error(char *h_vadd, int vsize) {
    int sum = 0;
    for (int i = 0; i < vsize; i++) 
        sum += h_vadd[i];
    return (sum == vsize);
}


/**
 * Run microKernel
 */
int launch_kernel(char *bench, int grid, int blk, unsigned int nitocycles,int time) {
    char *h_vadd;
    char *d_vadd;
    btypePtr d_global;
    int vsize = grid*blk;
   

    // Allocate CUDA events that we'll use for timing
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    h_vadd = (char *) malloc(vsize*sizeof(char));

    checkCudaErrors(cudaMalloc(&d_vadd, vsize*sizeof(char)));
    checkCudaErrors(cudaDeviceSynchronize());
 
    // Record the start event
    checkCudaErrors(cudaEventRecord(start));

    // Execute the kernel
  
    if (!strcmp(bench, "shm") ) {
        printf("shm");
        if(time) {
            printf("time \n");
            microKernel_shared_time <<<grid, blk, blk*sizeof(btype)>>>(nitocycles, d_vadd);
        }
        else {
            printf("iterations\n");
            microKernel_shared_iter <<<grid, blk, blk*sizeof(btype)>>>(nitocycles, d_vadd);
        } 
    } else if (!strcmp(bench, "glb") ) {
        printf("glb");
        checkCudaErrors(cudaMalloc(&d_global, SIZEROW*vsize*sizeof(btype)));
        if(time) {
            printf("time\n");
            microKernel_global_time <<<grid, blk, blk*sizeof(btype)>>>(nitocycles, d_vadd, d_global);
        }                                                
        else {
            printf("iterations\n");
            microKernel_global_iter <<<grid, blk, blk*sizeof(btype)>>>(nitocycles, d_vadd, d_global);
        }
    } else if (!strcmp(bench, "reg") ) {
        printf("reg");
        if(time) {
            printf("time\n");
            microKernel_reg_time <<<grid, blk, blk*sizeof(btype)>>>(nitocycles, d_vadd);
        }
        else {
            printf("iterations\n");  
            microKernel_reg_iter <<<grid, blk, blk*sizeof(btype)>>>(nitocycles, d_vadd);
       } 
    } 


    // Record the stop event
    checkCudaErrors(cudaDeviceSynchronize());

    checkCudaErrors(cudaEventRecord(stop));

    // Wait for the stop event to complete
    checkCudaErrors(cudaEventSynchronize(stop));

    float msecTotal = 0.0f;
    checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));

    // Compute and print the performance
    printf( "Elapsed time= %.2f\n", msecTotal);

    //checkCudaErrors(cudaDeviceSynchronize());

    checkCudaErrors( cudaMemcpy(h_vadd, d_vadd, vsize*sizeof(char), cudaMemcpyDeviceToHost) );
    
    printf("Checking computed result for correctness:\n ");
    bool correct = check_error(h_vadd, vsize);

    // Clean up memory
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    checkCudaErrors(cudaFree(d_vadd));
    if (!strcmp(bench, "glb") ) {
        checkCudaErrors(cudaFree(d_global));
    }
    free(h_vadd);

    return correct;

    /*
    if (correct) {
        return EXIT_SUCCESS;
    } else {
        return EXIT_FAILURE;
    }
    */
}



/**
 * Program main
 */
 
int a;
long int b;
long long int c;
char *buffer,*buffer2;

int main(int argc, char **argv) {
    unsigned int grid, blk, nitocycles;
    long int frec;
    char *bench = (char *) malloc(4);
    bool time;
    unsigned long int long_nitocycles;

    if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
            checkCmdLineFlag(argc, (const char **)argv, "?")) {
        printf("Usage -bench=bench_name ('shm', 'glb', 'reg')\n");
        printf("      -grid=grid_size (Grid size)\n");
        printf("      -blk=block_size (Thread block size)\n");
        printf("      -nit=number_its (number of iterations)\n");
        printf("      -time=time (time to run the microbenchark)\n");

        exit(EXIT_SUCCESS);
    }

/*
   if (checkCmdLineFlag(argc, (const char **)argv, "nit")) {
        nitocycles = getCmdLineArgumentInt(argc, (const char **)argv, "nit");}
if (checkCmdLineFlag(argc, (const char **)argv, "nit")) {
       getCmdLineArgumentString(argc, (const char **)argv, "nit",&buffer);}
printf ("Valor entero %d y cadena %s, long convertido de string %lu", nitocycles,buffer,strtol(buffer,&buffer2,10));
*/

    frec=frec_now(); // Get current frequency to compute time from cycles
    printf("GPU frequency: %lu \n", frec);
    if (checkCmdLineFlag(argc, (const char **)argv, "bench")) {
        getCmdLineArgumentString(argc, (const char **)argv, "bench", &bench);
    }
    else
      printf ("FAIL: bench\n");

    // Grid size
    if (checkCmdLineFlag(argc, (const char **)argv, "grid")) {
        grid = getCmdLineArgumentInt(argc, (const char **)argv, "grid");
    }

    // Thread block size 
    if (checkCmdLineFlag(argc, (const char **)argv, "blk")) {
        blk = getCmdLineArgumentInt(argc, (const char **)argv, "blk");
    }
    else
      printf ("FAIL: blk\n");

    time=false;
    // Kernel time
    if (checkCmdLineFlag(argc, (const char **)argv, "time")) {
        long_nitocycles = ((long int) (frec * getCmdLineArgumentFloat(argc, (const char **)argv, "time")));
        nitocycles=(unsigned int) (long_nitocycles >> BITSNOSIGNIFICATIVOS);
        time=true;
    }
    else // Number of iterations
        if (checkCmdLineFlag(argc, (const char **)argv, "nit")) {
            nitocycles = getCmdLineArgumentInt(argc, (const char **)argv, "nit");
        }
        else
            printf ("FAIL:nit and/or time\n");

    printf("microKernel=%s, grid: %u, blk: %u, nit o cycles: %u\n", bench, grid, blk, nitocycles);

    int kernel_result = launch_kernel(bench, grid, blk, nitocycles,time);

    printf("Launch result: %d\n", kernel_result);

    exit(!kernel_result);
}