#include #include #include #include #include #include #ifdef RD_WG_SIZE_0_0 #define BLOCK_SIZE RD_WG_SIZE_0_0 #elif defined(RD_WG_SIZE_0) #define BLOCK_SIZE RD_WG_SIZE_0 #elif defined(RD_WG_SIZE) #define BLOCK_SIZE RD_WG_SIZE #else #define BLOCK_SIZE 16 #endif #define STR_SIZE 256 /* maximum power density possible (say 300W for a 10mm x 10mm chip) */ #define MAX_PD (3.0e6) /* required precision in degrees */ #define PRECISION 0.001 #define SPEC_HEAT_SI 1.75e6 #define K_SI 100 /* capacitance fitting factor */ #define FACTOR_CHIP 0.5 /* chip parameters */ float t_chip = 0.0005; float chip_height = 0.016; float chip_width = 0.016; /* ambient temperature, assuming no package at all */ float amb_temp = 80.0; void run(int argc, char** argv); /* define timer macros */ #define pin_stats_reset() startCycle() #define pin_stats_pause(cycles) stopCycle(cycles) #define pin_stats_dump(cycles) printf("timer: %Lu\n", cycles) double mysecond() { struct timeval tp; struct timezone tzp; int i = gettimeofday(&tp, &tzp); return ((double) tp.tv_sec + (double) tp.tv_usec * 1.e-6); } void fatal(char *s) { fprintf(stderr, "error: %s\n", s); } void writeoutput(float *vect, int grid_rows, int grid_cols, char *file){ int i,j, index=0; FILE *fp; char str[STR_SIZE]; if( (fp = fopen(file, "w" )) == 0 ) printf( "The file was not opened\n" ); for (i=0; i < grid_rows; i++) for (j=0; j < grid_cols; j++) { sprintf(str, "%d\t%g\n", index, vect[i*grid_cols+j]); fputs(str,fp); index++; } fclose(fp); } void readinput(float *vect, int grid_rows, int grid_cols, char *file){ int i,j; FILE *fp; char str[STR_SIZE]; float val; if( (fp = fopen(file, "r" )) ==0 ) printf( "The file was not opened\n" ); for (i=0; i <= grid_rows-1; i++) for (j=0; j <= grid_cols-1; j++) { fgets(str, STR_SIZE, fp); if (feof(fp)) fatal("not enough lines in file"); //if ((sscanf(str, "%d%f", &index, &val) != 2) || (index != ((i-1)*(grid_cols-2)+j-1))) if ((sscanf(str, "%f", &val) != 1)) fatal("invalid file format"); vect[i*grid_cols+j] = val; } fclose(fp); } #define IN_RANGE(x, min, max) ((x)>=(min) && (x)<=(max)) #define CLAMP_RANGE(x, min, max) x = (x<(min)) ? min : ((x>(max)) ? max : x ) #define MIN(a, b) ((a)<=(b) ? (a) : (b)) __global__ void calculate_temp(int iteration, //number of iteration float *power, //power input float *temp_src, //temperature input/output float *temp_dst, //temperature input/output int grid_cols, //Col of grid int grid_rows, //Row of grid int border_cols, // border offset int border_rows, // border offset float Cap, //Capacitance float Rx, float Ry, float Rz, float step, float time_elapsed){ __shared__ float temp_on_cuda[BLOCK_SIZE][BLOCK_SIZE]; __shared__ float power_on_cuda[BLOCK_SIZE][BLOCK_SIZE]; __shared__ float temp_t[BLOCK_SIZE][BLOCK_SIZE]; // saving temparary temperature result float amb_temp = 80.0; float step_div_Cap; float Rx_1,Ry_1,Rz_1; int bx = blockIdx.x; int by = blockIdx.y; int tx=threadIdx.x; int ty=threadIdx.y; step_div_Cap=step/Cap; Rx_1=1/Rx; Ry_1=1/Ry; Rz_1=1/Rz; // each block finally computes result for a small block // after N iterations. // it is the non-overlapping small blocks that cover // all the input data // calculate the small block size int small_block_rows = BLOCK_SIZE-iteration*2;//EXPAND_RATE int small_block_cols = BLOCK_SIZE-iteration*2;//EXPAND_RATE // calculate the boundary for the block according to // the boundary of its small block int blkY = small_block_rows*by-border_rows; int blkX = small_block_cols*bx-border_cols; int blkYmax = blkY+BLOCK_SIZE-1; int blkXmax = blkX+BLOCK_SIZE-1; // calculate the global thread coordination int yidx = blkY+ty; int xidx = blkX+tx; // load data if it is within the valid input range int loadYidx=yidx, loadXidx=xidx; int index = grid_cols*loadYidx+loadXidx; if(IN_RANGE(loadYidx, 0, grid_rows-1) && IN_RANGE(loadXidx, 0, grid_cols-1)){ temp_on_cuda[ty][tx] = temp_src[index]; // Load the temperature data from global memory to shared memory power_on_cuda[ty][tx] = power[index];// Load the power data from global memory to shared memory } __syncthreads(); // effective range within this block that falls within // the valid range of the input data // used to rule out computation outside the boundary. int validYmin = (blkY < 0) ? -blkY : 0; int validYmax = (blkYmax > grid_rows-1) ? BLOCK_SIZE-1-(blkYmax-grid_rows+1) : BLOCK_SIZE-1; int validXmin = (blkX < 0) ? -blkX : 0; int validXmax = (blkXmax > grid_cols-1) ? BLOCK_SIZE-1-(blkXmax-grid_cols+1) : BLOCK_SIZE-1; int N = ty-1; int S = ty+1; int W = tx-1; int E = tx+1; N = (N < validYmin) ? validYmin : N; S = (S > validYmax) ? validYmax : S; W = (W < validXmin) ? validXmin : W; E = (E > validXmax) ? validXmax : E; bool computed; for (int i=0; i>>(MIN(num_iterations, total_iterations-t), MatrixPower,MatrixTemp[src],MatrixTemp[dst],\ col,row,borderCols, borderRows, Cap,Rx,Ry,Rz,step,time_elapsed); } cudaDeviceSynchronize(); return dst; } void usage(int argc, char **argv) { fprintf(stderr, "Usage: %s \n", argv[0]); fprintf(stderr, "\t - number of rows/cols in the grid (positive integer)\n"); fprintf(stderr, "\t - pyramid heigh(positive integer)\n"); fprintf(stderr, "\t - number of iterations\n"); fprintf(stderr, "\t - name of the file containing the initial temperature values of each cell\n"); fprintf(stderr, "\t - name of the file containing the dissipated power values of each cell\n"); fprintf(stderr, "\t - name of the output file\n"); exit(1); } int main(int argc, char** argv) { printf("WG size of kernel = %d X %d\n", BLOCK_SIZE, BLOCK_SIZE); run(argc,argv); return EXIT_SUCCESS; } void run(int argc, char** argv) { int size; int grid_rows,grid_cols; float *FilesavingTemp,*FilesavingPower,*MatrixOut,*MatrixOut_cp; char *tfile, *pfile, *ofile; double timer = mysecond(); int total_iterations = 60; int pyramid_height = 1; // number of iterations if (argc != 7) usage(argc, argv); if((grid_rows = atoi(argv[1]))<=0|| (grid_cols = atoi(argv[1]))<=0|| (pyramid_height = atoi(argv[2]))<=0|| (total_iterations = atoi(argv[3]))<=0) usage(argc, argv); tfile=argv[4]; pfile=argv[5]; ofile=argv[6]; int cmp=strcmp(ofile,"."); size=grid_rows*grid_cols; /* --------------- pyramid parameters --------------- */ # define EXPAND_RATE 2// add one iteration will extend the pyramid base by 2 per each borderline int borderCols = (pyramid_height)*EXPAND_RATE/2; int borderRows = (pyramid_height)*EXPAND_RATE/2; int smallBlockCol = BLOCK_SIZE-(pyramid_height)*EXPAND_RATE; int smallBlockRow = BLOCK_SIZE-(pyramid_height)*EXPAND_RATE; int blockCols = grid_cols/smallBlockCol+((grid_cols%smallBlockCol==0)?0:1); int blockRows = grid_rows/smallBlockRow+((grid_rows%smallBlockRow==0)?0:1); FilesavingTemp = (float *) malloc(size*sizeof(float)); FilesavingPower = (float *) malloc(size*sizeof(float)); MatrixOut = (float *) calloc (size, sizeof(float)); if(cmp)MatrixOut_cp = (float *) calloc (size, sizeof(float)); else MatrixOut_cp=MatrixOut; if( !FilesavingPower || !FilesavingTemp || !MatrixOut || !MatrixOut_cp) fatal("unable to allocate memory"); printf("pyramidHeight: %d\ngridSize: [%d, %d]\nborder:[%d, %d]\nblockGrid:[%d, %d]\ntargetBlock:[%d, %d]\n",\ pyramid_height, grid_cols, grid_rows, borderCols, borderRows, blockCols, blockRows, smallBlockCol, smallBlockRow); readinput(FilesavingTemp, grid_rows, grid_cols, tfile); readinput(FilesavingPower, grid_rows, grid_cols, pfile); float *MatrixTemp[2], *MatrixPower; cudaMalloc((void**)&MatrixTemp[0], sizeof(float)*size); cudaMalloc((void**)&MatrixTemp[1], sizeof(float)*size); cudaMemcpy(MatrixTemp[0], FilesavingTemp, sizeof(float)*size, cudaMemcpyHostToDevice); cudaMalloc((void**)&MatrixPower, sizeof(float)*size); cudaMemcpy(MatrixPower, FilesavingPower, sizeof(float)*size, cudaMemcpyHostToDevice); printf("Start computing the transient temperature\n"); printf("BEFORE START KERNEL %lf\n", mysecond() - timer); double t1 = mysecond(); int ret = compute_tran_temp(MatrixPower,MatrixTemp,grid_cols,grid_rows, \ total_iterations,pyramid_height, blockCols, blockRows, borderCols, borderRows); double exec_time = mysecond() - t1; printf("KERNEL EXECUTION TIME %lf\n", exec_time); printf("Ending simulation\n"); cudaMemcpy(MatrixOut, MatrixTemp[ret], sizeof(float)*size, cudaMemcpyDeviceToHost); if (cmp) { /* cudaFree(MatrixPower); cudaFree(MatrixTemp[0]); cudaFree(MatrixTemp[1]); cudaMalloc((void**)&MatrixTemp[0], sizeof(float)*size); cudaMalloc((void**)&MatrixTemp[1], sizeof(float)*size); cudaMemcpy(MatrixTemp[0], FilesavingTemp, sizeof(float)*size, cudaMemcpyHostToDevice); cudaMemcpy(MatrixPower, FilesavingPower, sizeof(float)*size, cudaMemcpyHostToDevice); printf("BEFORE START KERNEL FOR COMPARASION %lf\n", mysecond() - timer); ret = compute_tran_temp(MatrixPower,MatrixTemp,grid_cols,grid_rows, \ total_iterations,pyramid_height, blockCols, blockRows, borderCols, borderRows); printf("KERNEL EXECUTION TIME %lf\n", exec_time); printf("Ending comparation\n"); cudaMemcpy(MatrixOut_cp, MatrixTemp[ret], sizeof(float)*size, ); int correcto=1; for (int i=0;i