Commit 1697f52c authored by German Leon

adding rodinia benchmark

parent 20d55452
include ../../common/make.config
CC := $(CUDA_DIR)/bin/nvcc
INCLUDE := $(CUDA_DIR)/include
SRC = hotspot.cu
EXE = hotspot
release: $(SRC)
	$(CC) $(KERNEL_DIM) -g -G $(SRC) -o $(EXE) -I$(INCLUDE) -L$(CUDA_LIB_DIR)
enum: $(SRC)
	$(CC) $(KERNEL_DIM) -deviceemu $(SRC) -o $(EXE) -I$(INCLUDE) -L$(CUDA_LIB_DIR)
debug: $(SRC)
	$(CC) $(KERNEL_DIM) -g -G $(SRC) -o $(EXE) -I$(INCLUDE) -L$(CUDA_LIB_DIR)
debugenum: $(SRC)
	$(CC) $(KERNEL_DIM) -g -deviceemu $(SRC) -o $(EXE) -I$(INCLUDE) -L$(CUDA_LIB_DIR)
clean: $(SRC)
	rm -f $(EXE) $(EXE).linkinfo result.txt
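For reference, a typical fault-free build and run of this benchmark looks like the sketch below (illustrative only: the block-size define follows the RD_WG_SIZE_0_0 convention in hotspot.cu, and the arguments and data paths are the ones used in the fault-injection config that follows).
# build with debug symbols (the release target passes -g -G) and a 16x16 block size
make clean
make release KERNEL_DIM="-DRD_WG_SIZE_0_0=16"
# run once with the same arguments as benchmarkArgs in the injector config
./hotspot 1024 2 10000 /home/badia/rodinia_3.1/data/hotspot/temp_1024 /home/badia/rodinia_3.1/data/hotspot/power_1024 output.out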
[DEFAULT]
debug =True
# Name of the gdb executable
gdbExecName = /usr/local/cuda-10.1.243/bin/cuda-gdb
# Which fault model to use, 0 -> single; 1 -> double;
# 2 -> random; 3 -> zeros; 4 -> least significant 16 bits (LSB);
# 5 -> least significant 8 bits (LSB)
# If you want multiple fault models, place them separated by ','
# faultModel = 0,2,3
faultModel = 0
# Injection site
# Can be:
# RF -> Register File
# INST_OUT -> Instruction Output (NOT IMPLEMENTED YET)
# INST_composed -> Instruction Address (NOT IMPLEMENTED YET)
injectionSite = RF
# Max time factor to finish the app; this will be multiplied by the application running time.
# For example, if your app spends 2 s and maxWaitTimes is 5, the maximum running time before it is
# considered a crash is 10 s
maxWaitTimes = 5
# binary file of the application
# Must be full path
benchmarkBinary = /home/badia/rodinia_3.1/cuda/hotspot/hotspot
#benchmarkBinary_noverificar = /home/badia/rodinia_3.1/cuda/lavaMD/lavaMD
# Commands to set the session inside GDB environment
benchmarkArgs = 1024 2 10000 /home/badia/rodinia_3.1/data/hotspot/temp_1024 /home/badia/rodinia_3.1//data/hotspot/power_1024 output.out
#benchmarkArgs = 16384 2 2 /home/badia/rodinia_3.1/data/hotspot/temp_16384 /home/badia/rodinia_3.1/data/hotspot/power_16384 output.out
benchmarkArgs_noverificar = 1024 2 10000 /home/badia/rodinia_3.1/data/hotspot/temp_1024 /home/badia/rodinia_3.1/data/hotspot/power_1024 output.out
#benchmarkArgs_noverificar = 16384 2 2 /home/badia/rodinia_3.1/data/hotspot/temp_16384 /home/badia/rodinia_3.1/data/hotspot/power_16384 .
# CSV output file. It will be overwritten at each injection
csvFile = results/hotspot_RF.csv
# You should create a script in the benchmark source folder to compare GOLD_OUTPUT against INJ_OUTPUT
goldenCheckScript = ../rodinia_3.1/cuda/hotspot/sdc_check.sh
# Number of signals that will be sent to the application
seqSignals = 2
# Initial sleep time in seconds before starting to send signals
# Generally the memory setup time
initSleep = 0.56
kernels= compute_temp
section_begin=hotspot.cu:247
section_end=hotspot.cu:257
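The kernels, section_begin and section_end entries above delimit the code region the injector targets. As a rough manual illustration only (plain cuda-gdb/gdb options, not the injector's actual command sequence), the same region can be reached by hand like this:
# hand-driven cuda-gdb session over the instrumented region (illustrative sketch)
/usr/local/cuda-10.1.243/bin/cuda-gdb \
  -ex 'break hotspot.cu:247' \
  -ex 'break hotspot.cu:257' \
  -ex 'run 1024 2 10000 /home/badia/rodinia_3.1/data/hotspot/temp_1024 /home/badia/rodinia_3.1/data/hotspot/power_1024 output.out' \
  /home/badia/rodinia_3.1/cuda/hotspot/hotspot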
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <assert.h>
#include <sys/time.h>
#include <string.h>
#ifdef RD_WG_SIZE_0_0
#define BLOCK_SIZE RD_WG_SIZE_0_0
#elif defined(RD_WG_SIZE_0)
#define BLOCK_SIZE RD_WG_SIZE_0
#elif defined(RD_WG_SIZE)
#define BLOCK_SIZE RD_WG_SIZE
#else
#define BLOCK_SIZE 16
#endif
#define STR_SIZE 256
/* maximum power density possible (say 300W for a 10mm x 10mm chip) */
#define MAX_PD (3.0e6)
/* required precision in degrees */
#define PRECISION 0.001
#define SPEC_HEAT_SI 1.75e6
#define K_SI 100
/* capacitance fitting factor */
#define FACTOR_CHIP 0.5
/* chip parameters */
float t_chip = 0.0005;
float chip_height = 0.016;
float chip_width = 0.016;
/* ambient temperature, assuming no package at all */
float amb_temp = 80.0;
void run(int argc, char** argv);
/* define timer macros */
#define pin_stats_reset() startCycle()
#define pin_stats_pause(cycles) stopCycle(cycles)
#define pin_stats_dump(cycles) printf("timer: %Lu\n", cycles)
double mysecond() {
struct timeval tp;
struct timezone tzp;
int i = gettimeofday(&tp, &tzp);
return ((double) tp.tv_sec + (double) tp.tv_usec * 1.e-6);
}
void
fatal(char *s)
{
fprintf(stderr, "error: %s\n", s);
}
void writeoutput(float *vect, int grid_rows, int grid_cols, char *file){
int i,j, index=0;
FILE *fp;
char str[STR_SIZE];
if( (fp = fopen(file, "w" )) == 0 )
printf( "The file was not opened\n" );
for (i=0; i < grid_rows; i++)
for (j=0; j < grid_cols; j++)
{
sprintf(str, "%d\t%g\n", index, vect[i*grid_cols+j]);
fputs(str,fp);
index++;
}
fclose(fp);
}
void readinput(float *vect, int grid_rows, int grid_cols, char *file){
int i,j;
FILE *fp;
char str[STR_SIZE];
float val;
if( (fp = fopen(file, "r" )) ==0 )
printf( "The file was not opened\n" );
for (i=0; i <= grid_rows-1; i++)
for (j=0; j <= grid_cols-1; j++)
{
fgets(str, STR_SIZE, fp);
if (feof(fp))
fatal("not enough lines in file");
//if ((sscanf(str, "%d%f", &index, &val) != 2) || (index != ((i-1)*(grid_cols-2)+j-1)))
if ((sscanf(str, "%f", &val) != 1))
fatal("invalid file format");
vect[i*grid_cols+j] = val;
}
fclose(fp);
}
#define IN_RANGE(x, min, max) ((x)>=(min) && (x)<=(max))
#define CLAMP_RANGE(x, min, max) x = (x<(min)) ? min : ((x>(max)) ? max : x )
#define MIN(a, b) ((a)<=(b) ? (a) : (b))
__global__ void calculate_temp(int iteration, //number of iteration
float *power, //power input
float *temp_src, //temperature input/output
float *temp_dst, //temperature input/output
int grid_cols, //Col of grid
int grid_rows, //Row of grid
int border_cols, // border offset
int border_rows, // border offset
float Cap, //Capacitance
float Rx,
float Ry,
float Rz,
float step,
float time_elapsed){
__shared__ float temp_on_cuda[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float power_on_cuda[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float temp_t[BLOCK_SIZE][BLOCK_SIZE]; // holds the temporary temperature result
float amb_temp = 80.0;
float step_div_Cap;
float Rx_1,Ry_1,Rz_1;
int bx = blockIdx.x;
int by = blockIdx.y;
int tx=threadIdx.x;
int ty=threadIdx.y;
step_div_Cap=step/Cap;
Rx_1=1/Rx;
Ry_1=1/Ry;
Rz_1=1/Rz;
// each block finally computes result for a small block
// after N iterations.
// it is the non-overlapping small blocks that cover
// all the input data
// calculate the small block size
int small_block_rows = BLOCK_SIZE-iteration*2;//EXPAND_RATE
int small_block_cols = BLOCK_SIZE-iteration*2;//EXPAND_RATE
// calculate the boundary for the block according to
// the boundary of its small block
int blkY = small_block_rows*by-border_rows;
int blkX = small_block_cols*bx-border_cols;
int blkYmax = blkY+BLOCK_SIZE-1;
int blkXmax = blkX+BLOCK_SIZE-1;
// calculate the global thread coordinates
int yidx = blkY+ty;
int xidx = blkX+tx;
// load data if it is within the valid input range
int loadYidx=yidx, loadXidx=xidx;
int index = grid_cols*loadYidx+loadXidx;
if(IN_RANGE(loadYidx, 0, grid_rows-1) && IN_RANGE(loadXidx, 0, grid_cols-1)){
temp_on_cuda[ty][tx] = temp_src[index]; // Load the temperature data from global memory to shared memory
power_on_cuda[ty][tx] = power[index];// Load the power data from global memory to shared memory
}
__syncthreads();
// effective range within this block that falls within
// the valid range of the input data
// used to rule out computation outside the boundary.
int validYmin = (blkY < 0) ? -blkY : 0;
int validYmax = (blkYmax > grid_rows-1) ? BLOCK_SIZE-1-(blkYmax-grid_rows+1) : BLOCK_SIZE-1;
int validXmin = (blkX < 0) ? -blkX : 0;
int validXmax = (blkXmax > grid_cols-1) ? BLOCK_SIZE-1-(blkXmax-grid_cols+1) : BLOCK_SIZE-1;
int N = ty-1;
int S = ty+1;
int W = tx-1;
int E = tx+1;
N = (N < validYmin) ? validYmin : N;
S = (S > validYmax) ? validYmax : S;
W = (W < validXmin) ? validXmin : W;
E = (E > validXmax) ? validXmax : E;
bool computed;
for (int i=0; i<iteration ; i++){
computed = false;
if( IN_RANGE(tx, i+1, BLOCK_SIZE-i-2) && \
IN_RANGE(ty, i+1, BLOCK_SIZE-i-2) && \
IN_RANGE(tx, validXmin, validXmax) && \
IN_RANGE(ty, validYmin, validYmax) ) {
computed = true;
temp_t[ty][tx] = temp_on_cuda[ty][tx] + step_div_Cap * (power_on_cuda[ty][tx] +
(temp_on_cuda[S][tx] + temp_on_cuda[N][tx] - 2.0*temp_on_cuda[ty][tx]) * Ry_1 +
(temp_on_cuda[ty][E] + temp_on_cuda[ty][W] - 2.0*temp_on_cuda[ty][tx]) * Rx_1 +
(amb_temp - temp_on_cuda[ty][tx]) * Rz_1);
}
__syncthreads();
if(i==iteration-1)
break;
if(computed) //Assign the computation range
temp_on_cuda[ty][tx]= temp_t[ty][tx];
__syncthreads();
}
// update the global memory
// after the last iteration, only threads coordinated within the
// small block perform the calculation and switch on ``computed''
if (computed){
temp_dst[index]= temp_t[ty][tx];
}
}
/*
compute N time steps
*/
int compute_tran_temp(float *MatrixPower,float *MatrixTemp[2], int col, int row, \
int total_iterations, int num_iterations, int blockCols, int blockRows, int borderCols, int borderRows)
{
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(blockCols, blockRows);
float grid_height = chip_height / row;
float grid_width = chip_width / col;
float Cap = FACTOR_CHIP * SPEC_HEAT_SI * t_chip * grid_width * grid_height;
float Rx = grid_width / (2.0 * K_SI * t_chip * grid_height);
float Ry = grid_height / (2.0 * K_SI * t_chip * grid_width);
float Rz = t_chip / (K_SI * grid_height * grid_width);
float max_slope = MAX_PD / (FACTOR_CHIP * t_chip * SPEC_HEAT_SI);
float step = PRECISION / max_slope;
float t;
float time_elapsed;
time_elapsed=0.001;
int src = 1, dst = 0;
for (t = 0; t < total_iterations; t+=num_iterations) {
int temp = src;
src = dst;
dst = temp;
calculate_temp<<<dimGrid, dimBlock>>>(MIN(num_iterations, total_iterations-t), MatrixPower,MatrixTemp[src],MatrixTemp[dst],\
col,row,borderCols, borderRows, Cap,Rx,Ry,Rz,step,time_elapsed);
}
cudaDeviceSynchronize();
return dst;
}
void usage(int argc, char **argv)
{
fprintf(stderr, "Usage: %s <grid_rows/grid_cols> <pyramid_height> <sim_time> <temp_file> <power_file> <output_file>\n", argv[0]);
fprintf(stderr, "\t<grid_rows/grid_cols> - number of rows/cols in the grid (positive integer)\n");
fprintf(stderr, "\t<pyramid_height> - pyramid heigh(positive integer)\n");
fprintf(stderr, "\t<sim_time> - number of iterations\n");
fprintf(stderr, "\t<temp_file> - name of the file containing the initial temperature values of each cell\n");
fprintf(stderr, "\t<power_file> - name of the file containing the dissipated power values of each cell\n");
fprintf(stderr, "\t<output_file> - name of the output file\n");
exit(1);
}
int main(int argc, char** argv)
{
printf("WG size of kernel = %d X %d\n", BLOCK_SIZE, BLOCK_SIZE);
run(argc,argv);
return EXIT_SUCCESS;
}
void run(int argc, char** argv)
{
int size;
int grid_rows,grid_cols;
float *FilesavingTemp,*FilesavingPower,*MatrixOut,*MatrixOut_cp;
char *tfile, *pfile, *ofile;
double timer = mysecond();
int total_iterations = 60;
int pyramid_height = 1; // number of iterations
if (argc != 7)
usage(argc, argv);
if((grid_rows = atoi(argv[1]))<=0||
(grid_cols = atoi(argv[1]))<=0||
(pyramid_height = atoi(argv[2]))<=0||
(total_iterations = atoi(argv[3]))<=0)
usage(argc, argv);
tfile=argv[4];
pfile=argv[5];
ofile=argv[6];
int cmp=strcmp(ofile,".");
size=grid_rows*grid_cols;
/* --------------- pyramid parameters --------------- */
# define EXPAND_RATE 2 // adding one iteration extends the pyramid base by 2 per borderline
int borderCols = (pyramid_height)*EXPAND_RATE/2;
int borderRows = (pyramid_height)*EXPAND_RATE/2;
int smallBlockCol = BLOCK_SIZE-(pyramid_height)*EXPAND_RATE;
int smallBlockRow = BLOCK_SIZE-(pyramid_height)*EXPAND_RATE;
int blockCols = grid_cols/smallBlockCol+((grid_cols%smallBlockCol==0)?0:1);
int blockRows = grid_rows/smallBlockRow+((grid_rows%smallBlockRow==0)?0:1);
FilesavingTemp = (float *) malloc(size*sizeof(float));
FilesavingPower = (float *) malloc(size*sizeof(float));
MatrixOut = (float *) calloc (size, sizeof(float));
if(cmp)MatrixOut_cp = (float *) calloc (size, sizeof(float));
else MatrixOut_cp=MatrixOut;
if( !FilesavingPower || !FilesavingTemp || !MatrixOut || !MatrixOut_cp)
fatal("unable to allocate memory");
printf("pyramidHeight: %d\ngridSize: [%d, %d]\nborder:[%d, %d]\nblockGrid:[%d, %d]\ntargetBlock:[%d, %d]\n",\
pyramid_height, grid_cols, grid_rows, borderCols, borderRows, blockCols, blockRows, smallBlockCol, smallBlockRow);
readinput(FilesavingTemp, grid_rows, grid_cols, tfile);
readinput(FilesavingPower, grid_rows, grid_cols, pfile);
float *MatrixTemp[2], *MatrixPower;
cudaMalloc((void**)&MatrixTemp[0], sizeof(float)*size);
cudaMalloc((void**)&MatrixTemp[1], sizeof(float)*size);
cudaMemcpy(MatrixTemp[0], FilesavingTemp, sizeof(float)*size, cudaMemcpyHostToDevice);
cudaMalloc((void**)&MatrixPower, sizeof(float)*size);
cudaMemcpy(MatrixPower, FilesavingPower, sizeof(float)*size, cudaMemcpyHostToDevice);
printf("Start computing the transient temperature\n");
printf("BEFORE START KERNEL %lf\n", mysecond() - timer);
double t1 = mysecond();
int ret = compute_tran_temp(MatrixPower,MatrixTemp,grid_cols,grid_rows, \
total_iterations,pyramid_height, blockCols, blockRows, borderCols, borderRows);
double exec_time = mysecond() - t1;
printf("KERNEL EXECUTION TIME %lf\n", exec_time);
printf("Ending simulation\n");
cudaMemcpy(MatrixOut, MatrixTemp[ret], sizeof(float)*size, cudaMemcpyDeviceToHost);
if (cmp) {
/* cudaFree(MatrixPower);
cudaFree(MatrixTemp[0]);
cudaFree(MatrixTemp[1]);
cudaMalloc((void**)&MatrixTemp[0], sizeof(float)*size);
cudaMalloc((void**)&MatrixTemp[1], sizeof(float)*size);
cudaMemcpy(MatrixTemp[0], FilesavingTemp, sizeof(float)*size, cudaMemcpyHostToDevice);
cudaMemcpy(MatrixPower, FilesavingPower, sizeof(float)*size, cudaMemcpyHostToDevice);
printf("BEFORE START KERNEL FOR COMPARASION %lf\n", mysecond() - timer);
ret = compute_tran_temp(MatrixPower,MatrixTemp,grid_cols,grid_rows, \
total_iterations,pyramid_height, blockCols, blockRows, borderCols, borderRows);
printf("KERNEL EXECUTION TIME %lf\n", exec_time);
printf("Ending comparation\n");
cudaMemcpy(MatrixOut_cp, MatrixTemp[ret], sizeof(float)*size, );
int correcto=1;
for (int i=0;i<sizeof(float)*size;i++)
if (MatrixOut[i]!=MatrixOut_cp[i]) { correcto=0;printf("%f-%d-%f\n",MatrixOut[i],i,MatrixOut_cp[i]);}
else {printf("\n\n\n##########+%d+\n\n\n",i);}
printf("%s\n", correcto? "Result = PASS" : "Result = FAIL");
printf("Ending comparation\n");*/
writeoutput(MatrixOut,grid_rows, grid_cols, ofile);
}
cudaFree(MatrixPower);
cudaFree(MatrixTemp[0]);
cudaFree(MatrixTemp[1]);
free(MatrixOut);
}
for i in HALF SINGLE DOUBLE;
do
#make PRECISION=$i DEBUG=1 clean
make PRECISION=$i DEBUG=1 generate
#make PRECISION=$i DEBUG=1 test
done;
exit 0
# cuda/lud_cuda -i ../../data/lud/2048.dat
cuda/lud_cuda -s 8192 -v
#!/usr/bin/sh
# SDC checking diff
# Must compare all things here
# Any particular output comparison must be made here
# For the run to be classified as an SDC or CRASH,
# the DIFF_LOG and DIFF_ERR_LOG files must not be empty
# INJ_OUTPUT_PATH, INJ_ERR_PATH, GOLD_OUTPUT_PATH, GOLD_ERR_PATH
# are environment variables defined by the fault_injector.py
# diff stdout
diff -B ${INJ_OUTPUT_PATH} ${GOLD_OUTPUT_PATH} > ${DIFF_LOG}
# Special comparison like the following one can be done in this script
grep -q "Result = FAIL" ${INJ_OUTPUT_PATH} >> ${DIFF_LOG}
# diff stderr
diff -B ${INJ_ERR_PATH} ${GOLD_ERR_PATH} > ${DIFF_ERR_LOG}
# Must exit 0
exit 0
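The INJ_*, GOLD_* and DIFF_* variables are exported by fault_injector.py; to exercise the script by hand you can supply them yourself. A minimal sketch with placeholder file names:
# stand-alone invocation of sdc_check.sh with hand-supplied paths (illustrative)
export INJ_OUTPUT_PATH=./inj_stdout.txt INJ_ERR_PATH=./inj_stderr.txt
export GOLD_OUTPUT_PATH=./gold_stdout.txt GOLD_ERR_PATH=./gold_stderr.txt
export DIFF_LOG=./diff.log DIFF_ERR_LOG=./diff_err.log
sh sdc_check.sh
# the run is treated as an SDC/CRASH when the diff logs end up non-empty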
include ../../common/make.config
CC := $(CUDA_DIR)/bin/nvcc
INCLUDE := $(CUDA_DIR)/include
SRC = needle.cu
EXE = needle
release: $(SRC)
	$(CC) ${KERNEL_DIM} $(SRC) -o $(EXE)_MUDO -I$(INCLUDE) -L$(CUDA_LIB_DIR)
	$(CC) ${KERNEL_DIM} $(SRC) -o $(EXE) -I$(INCLUDE) -L$(CUDA_LIB_DIR) -D TRACEBACK
enum: $(SRC)
	$(CC) ${KERNEL_DIM} -deviceemu $(SRC) -o $(EXE) -I$(INCLUDE) -L$(CUDA_LIB_DIR)
debug: $(SRC)
	$(CC) ${KERNEL_DIM} -g -G $(SRC) -o $(EXE)_MUDO -I$(INCLUDE) -L$(CUDA_LIB_DIR)
	$(CC) ${KERNEL_DIM} -g -G $(SRC) -o $(EXE) -I$(INCLUDE) -L$(CUDA_LIB_DIR) -D TRACEBACK
debugenum: $(SRC)
	$(CC) ${KERNEL_DIM} -g -deviceemu $(SRC) -o $(EXE) -I$(INCLUDE) -L$(CUDA_LIB_DIR)
clean: $(SRC)
	rm -f $(EXE) $(EXE)_MUDO $(EXE).linkinfo result.txt
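This Makefile builds two binaries from the same source: needle (compiled with -D TRACEBACK, so it writes result.txt, which sdc_check.sh compares against the gold file) and needle_MUDO (no traceback). A typical debug build and smoke run, assuming the RD_WG_SIZE_0_0 block-size convention from needle_kernel.cu, might be:
# build both nw binaries with an explicit block size of 16, then run each once
make clean
make debug KERNEL_DIM="-DRD_WG_SIZE_0_0=16"
./needle 32000 1        # TRACEBACK build, writes result.txt
./needle_MUDO 32000 1   # no traceback output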
#define LIMIT -999
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include "needle.h"
#include <cuda.h>
#include <sys/time.h>
// includes, kernels
#include "needle_kernel.cu"
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest( int argc, char** argv);
int blosum62[24][24] = {
{ 4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0, -2, -1, 0, -4},
{-1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3, -1, 0, -1, -4},
{-2, 0, 6, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3, 3, 0, -1, -4},
{-2, -2, 1, 6, -3, 0, 2, -1, -1, -3, -4, -1, -3, -3, -1, 0, -1, -4, -3, -3, 4, 1, -1, -4},
{ 0, -3, -3, -3, 9, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1, -3, -3, -2, -4},
{-1, 1, 0, 0, -3, 5, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2, -1, -2, 0, 3, -1, -4},
{-1, 0, 0, 2, -4, 2, 5, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3, -2, -2, 1, 4, -1, -4},
{ 0, -2, 0, -1, -3, -2, -2, 6, -2, -4, -4, -2, -3, -3, -2, 0, -2, -2, -3, -3, -1, -2, -1, -4},
{-2, 0, 1, -1, -3, 0, 0, -2, 8, -3, -3, -1, -2, -1, -2, -1, -2, -2, 2, -3, 0, 0, -1, -4},
{-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3, -3, -3, -1, -4},
{-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2, -1, 1, -4, -3, -1, -4},
{-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5, -1, -3, -1, 0, -1, -3, -2, -2, 0, 1, -1, -4},
{-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 5, 0, -2, -1, -1, -1, -1, 1, -3, -1, -1, -4},
{-2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 6, -4, -2, -2, 1, 3, -1, -3, -3, -1, -4},
{-1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4, 7, -1, -1, -4, -3, -2, -2, -1, -2, -4},
{ 1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4, 1, -3, -2, -2, 0, 0, 0, -4},
{ 0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 5, -2, -2, 0, -1, -1, 0, -4},
{-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 11, 2, -3, -4, -3, -2, -4},
{-2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2, 7, -1, -3, -2, -1, -4},
{ 0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4, -3, -2, -1, -4},
{-2, -1, 3, 4, -3, 0, 1, -1, 0, -3, -4, 0, -3, -3, -2, 0, -1, -4, -3, -3, 4, 1, -1, -4},
{-1, 0, 0, 1, -3, 3, 4, -2, 0, -3, -3, 1, -1, -3, -1, 0, -1, -3, -2, -2, 1, 4, -1, -4},
{ 0, -1, -1, -1, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, 0, 0, -2, -1, -1, -1, -1, -1, -4},
{-4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, 1}
};
double gettime() {
struct timeval t;
gettimeofday(&t,NULL);
return t.tv_sec+t.tv_usec*1e-6;
}
double mysecond() {
struct timeval tp;
struct timezone tzp;
int i = gettimeofday(&tp, &tzp);
return ((double) tp.tv_sec + (double) tp.tv_usec * 1.e-6);
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
double timer;
int
main( int argc, char** argv)
{
timer = mysecond();
printf("WG size of kernel = %d \n", BLOCK_SIZE);
runTest( argc, argv);
return EXIT_SUCCESS;
}
void usage(int argc, char **argv)
{
fprintf(stderr, "Usage: %s <max_rows/max_cols> <penalty> \n", argv[0]);
fprintf(stderr, "\t<dimension> - x and y dimensions\n");
fprintf(stderr, "\t<penalty> - penalty(positive integer)\n");
exit(1);
}
void needle_cuda_main( int* referrence_cuda,
int* matrix_cuda,
int max_cols,
int penalty)
{
printf("BEFORE START KERNEL %lf\n", mysecond() - timer);
double t1 = mysecond();
dim3 dimGrid;
dim3 dimBlock(BLOCK_SIZE, 1);
int block_width = ( max_cols - 1 )/BLOCK_SIZE;
printf("Processing top-left matrix\n");
//process top-left matrix
for( int i = 1 ; i <= block_width ; i++){
dimGrid.x = i;
dimGrid.y = 1;
needle_cuda_shared_1<<<dimGrid, dimBlock>>>(referrence_cuda, matrix_cuda
,max_cols, penalty, i, block_width);
}
printf("Processing bottom-right matrix\n");
//process bottom-right matrix
for( int i = block_width - 1 ; i >= 1 ; i--){
dimGrid.x = i;
dimGrid.y = 1;
needle_cuda_shared_2<<<dimGrid, dimBlock>>>(referrence_cuda, matrix_cuda
,max_cols, penalty, i, block_width);
}
cudaDeviceSynchronize();
double exec_time = mysecond() - t1;
printf("KERNEL EXECUTION TIME %lf\n", exec_time);
}
void runTest( int argc, char** argv)
{
int max_rows, max_cols, penalty;
int *input_itemsets, *output_itemsets, *referrence;
int *matrix_cuda, *referrence_cuda;
int size;
// the lengths of the two sequences should be divisible by 16.
// And at the current stage max_rows needs to equal max_cols
if (argc == 3)
{
max_rows = atoi(argv[1]);
max_cols = atoi(argv[1]);
penalty = atoi(argv[2]);
}
else{
usage(argc, argv);
}
if(atoi(argv[1])%16!=0){
fprintf(stderr,"The dimension values must be a multiple of 16\n");
exit(1);
}
max_rows = max_rows + 1;
max_cols = max_cols + 1;
referrence = (int *)malloc( max_rows * max_cols * sizeof(int) );
input_itemsets = (int *)malloc( max_rows * max_cols * sizeof(int) );
output_itemsets = (int *)malloc( max_rows * max_cols * sizeof(int) );
if (!input_itemsets)
fprintf(stderr, "error: can not allocate memory");
srand ( 7 );
for (int i = 0 ; i < max_cols; i++){
for (int j = 0 ; j < max_rows; j++){
input_itemsets[i*max_cols+j] = 0;
}
}
printf("Start Needleman-Wunsch\n");
for( int i=1; i< max_rows ; i++){ //please define your own sequence.
input_itemsets[i*max_cols] = rand() % 10 + 1;
}
for( int j=1; j< max_cols ; j++){ //please define your own sequence.
input_itemsets[j] = rand() % 10 + 1;
}
for (int i = 1 ; i < max_cols; i++){
for (int j = 1 ; j < max_rows; j++){
referrence[i*max_cols+j] = blosum62[input_itemsets[i*max_cols]][input_itemsets[j]];
}
}
for( int i = 1; i< max_rows ; i++)
input_itemsets[i*max_cols] = -i * penalty;
for( int j = 1; j< max_cols ; j++)
input_itemsets[j] = -j * penalty;
size = max_cols * max_rows;
cudaMalloc((void**)& referrence_cuda, sizeof(int)*size);
cudaMalloc((void**)& matrix_cuda, sizeof(int)*size);
cudaMemcpy(referrence_cuda, referrence, sizeof(int) * size, cudaMemcpyHostToDevice);
cudaMemcpy(matrix_cuda, input_itemsets, sizeof(int) * size, cudaMemcpyHostToDevice);
needle_cuda_main(referrence_cuda, matrix_cuda
,max_cols, penalty);
cudaMemcpy(output_itemsets, matrix_cuda, sizeof(int) * size, cudaMemcpyDeviceToHost);
//#define TRACEBACK
#ifdef TRACEBACK
FILE *fpo = fopen("result.txt","w");
fprintf(fpo, "print traceback value GPU:\n");
for (int i = max_rows - 2, j = max_rows - 2; i>=0, j>=0;){
int nw, n, w, traceback;
if ( i == max_rows - 2 && j == max_rows - 2 )
fprintf(fpo, "%d ", output_itemsets[ i * max_cols + j]); //print the first element
if ( i == 0 && j == 0 )
break;
if ( i > 0 && j > 0 ){
nw = output_itemsets[(i - 1) * max_cols + j - 1];
w = output_itemsets[ i * max_cols + j - 1 ];
n = output_itemsets[(i - 1) * max_cols + j];
}
else if ( i == 0 ){
nw = n = LIMIT;
w = output_itemsets[ i * max_cols + j - 1 ];
}
else if ( j == 0 ){
nw = w = LIMIT;
n = output_itemsets[(i - 1) * max_cols + j];
}
else{
}
//traceback = maximum(nw, w, n);
int new_nw, new_w, new_n;
new_nw = nw + referrence[i * max_cols + j];
new_w = w - penalty;
new_n = n - penalty;
traceback = maximum(new_nw, new_w, new_n);
if(traceback == new_nw)
traceback = nw;
if(traceback == new_w)
traceback = w;
if(traceback == new_n)
traceback = n;
fprintf(fpo, "%d ", traceback);
if(traceback == nw )
{i--; j--; continue;}
else if(traceback == w )
{j--; continue;}
else if(traceback == n )
{i--; continue;}
else
;
}
fclose(fpo);
#endif
cudaFree(referrence_cuda);
cudaFree(matrix_cuda);
free(referrence);
free(input_itemsets);
free(output_itemsets);
}
#ifdef RD_WG_SIZE_0_0
#define BLOCK_SIZE RD_WG_SIZE_0_0
#elif defined(RD_WG_SIZE_0)
#define BLOCK_SIZE RD_WG_SIZE_0
#elif defined(RD_WG_SIZE)
#define BLOCK_SIZE RD_WG_SIZE
#else
#define BLOCK_SIZE 16
#endif
//#define TRACE
#include "needle.h"
#include <stdio.h>
#define SDATA( index) CUT_BANK_CHECKER(sdata, index)
__device__ __host__ int
maximum( int a,
int b,
int c){
int k;
if( a <= b )
k = b;
else
k = a;
if( k <=c )
return(c);
else
return(k);
}
__global__ void
needle_cuda_shared_1( int* referrence,
int* matrix_cuda,
int cols,
int penalty,
int i,
int block_width)
{
int bx = blockIdx.x;
int tx = threadIdx.x;
int b_index_x = bx;
int b_index_y = i - 1 - bx;
int index = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + tx + ( cols + 1 );
int index_n = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + tx + ( 1 );
int index_w = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + ( cols );
int index_nw = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x;
__shared__ int temp[BLOCK_SIZE+1][BLOCK_SIZE+1];
__shared__ int ref[BLOCK_SIZE][BLOCK_SIZE];
if (tx == 0)
temp[tx][0] = matrix_cuda[index_nw];
for ( int ty = 0 ; ty < BLOCK_SIZE ; ty++)
ref[ty][tx] = referrence[index + cols * ty];
__syncthreads();
temp[tx + 1][0] = matrix_cuda[index_w + cols * tx];
__syncthreads();
temp[0][tx + 1] = matrix_cuda[index_n];
__syncthreads();
for( int m = 0 ; m < BLOCK_SIZE ; m++){
if ( tx <= m ){
int t_index_x = tx + 1;
int t_index_y = m - tx + 1;
temp[t_index_y][t_index_x] = maximum( temp[t_index_y-1][t_index_x-1] + ref[t_index_y-1][t_index_x-1],
temp[t_index_y][t_index_x-1] - penalty,
temp[t_index_y-1][t_index_x] - penalty);
}
__syncthreads();
}
for( int m = BLOCK_SIZE - 2 ; m >=0 ; m--){
if ( tx <= m){
int t_index_x = tx + BLOCK_SIZE - m ;
int t_index_y = BLOCK_SIZE - tx;
temp[t_index_y][t_index_x] = maximum( temp[t_index_y-1][t_index_x-1] + ref[t_index_y-1][t_index_x-1],
temp[t_index_y][t_index_x-1] - penalty,
temp[t_index_y-1][t_index_x] - penalty);
}
__syncthreads();
}
for ( int ty = 0 ; ty < BLOCK_SIZE ; ty++)
matrix_cuda[index + ty * cols] = temp[ty+1][tx+1];
}
__global__ void
needle_cuda_shared_2( int* referrence,
int* matrix_cuda,
int cols,
int penalty,
int i,
int block_width)
{
int bx = blockIdx.x;
int tx = threadIdx.x;
int b_index_x = bx + block_width - i ;
int b_index_y = block_width - bx -1;
int index = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + tx + ( cols + 1 );
int index_n = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + tx + ( 1 );
int index_w = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + ( cols );
int index_nw = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x;
__shared__ int temp[BLOCK_SIZE+1][BLOCK_SIZE+1];
__shared__ int ref[BLOCK_SIZE][BLOCK_SIZE];
for ( int ty = 0 ; ty < BLOCK_SIZE ; ty++)
ref[ty][tx] = referrence[index + cols * ty];
__syncthreads();
if (tx == 0)
temp[tx][0] = matrix_cuda[index_nw];
temp[tx + 1][0] = matrix_cuda[index_w + cols * tx];
__syncthreads();
temp[0][tx + 1] = matrix_cuda[index_n];
__syncthreads();
for( int m = 0 ; m < BLOCK_SIZE ; m++){
if ( tx <= m ){
int t_index_x = tx + 1;
int t_index_y = m - tx + 1;
temp[t_index_y][t_index_x] = maximum( temp[t_index_y-1][t_index_x-1] + ref[t_index_y-1][t_index_x-1],
temp[t_index_y][t_index_x-1] - penalty,
temp[t_index_y-1][t_index_x] - penalty);
}
__syncthreads();
}
for( int m = BLOCK_SIZE - 2 ; m >=0 ; m--){
if ( tx <= m){
int t_index_x = tx + BLOCK_SIZE - m ;
int t_index_y = BLOCK_SIZE - tx;
temp[t_index_y][t_index_x] = maximum( temp[t_index_y-1][t_index_x-1] + ref[t_index_y-1][t_index_x-1],
temp[t_index_y][t_index_x-1] - penalty,
temp[t_index_y-1][t_index_x] - penalty);
}
__syncthreads();
}
for ( int ty = 0 ; ty < BLOCK_SIZE ; ty++)
matrix_cuda[index + ty * cols] = temp[ty+1][tx+1];
}
[DEFAULT]
debug =True
# Name of the gdb executable
gdbExecName = /usr/local/cuda-10.1.243/bin/cuda-gdb
# Which fault model to use, 0 -> single; 1 -> double;
# 2 -> random; 3 -> zeros; 4 -> least significant 16 bits (LSB);
# 5 -> least significant 8 bits (LSB)
# If you want multiple fault models, place them separated by ','
# faultModel = 0,2,3
faultModel = 0
# Injection site
# Can be:
# RF -> Register File
# INST_OUT -> Instruction Output (NOT IMPLEMENTED YET)
# INST_composed -> Instruction Address (NOT IMPLEMENTED YET)
injectionSite = INST_OUT
# Max time factor to finish the app; this will be multiplied by the application running time.
# For example, if your app spends 2 s and maxWaitTimes is 5, the maximum running time before it is
# considered a crash is 10 s
maxWaitTimes = 5
# binary file of the application
# Must be full path
benchmarkBinary = /home/badia/rodinia_3.1/cuda/nw/needle
benchmarkBinary_noverificar = /home/badia/rodinia_3.1/cuda/nw/needle_MUDO
#benchmarkBinary_noverificar = /home/badia/rodinia_3.1/cuda/lavaMD/lavaMD
# Commands to set the session inside GDB environment
benchmarkArgs = 32000 1
#benchmarkArgs_noverificar = -s 2048
# CSV output file. It will be overwritten at each injection
csvFile = /home/badia/rodinia_3.1/cuda/nw/fi_nw_single_bit.csv
# You should create a script in the benchmark source folder to compare GOLD_OUTPUT against INJ_OUTPUT
goldenCheckScript = ../rodinia_3.1/cuda/nw/sdc_check.sh
# Number of signals that will be sent to the application
seqSignals = 2
# Initial sleep time in seconds before starting to send signals
# Generally the memory setup time
initSleep = 9.46
kernels = needle_cuda_shared_1,needle_cuda_shared_2
section_begin=needle.cu:85
section_end=needle.cu:107
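initSleep should cover the host-side setup that happens before the first kernel launch; the benchmark itself prints that interval as "BEFORE START KERNEL". One way to pick the value (a suggestion, not part of the injector) is:
# measure the setup time on a fault-free run and use it as a basis for initSleep
./needle 32000 1 | grep "BEFORE START KERNEL"
# the printed number of seconds is a reasonable starting point for initSleep above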
./needle 32000 1
#!/usr/bin/sh
# SDC checking diff
# Must compare all things here
# Any particular output comparison must be made here
# For the run to be classified as an SDC or CRASH,
# the DIFF_LOG and DIFF_ERR_LOG files must not be empty
# INJ_OUTPUT_PATH, INJ_ERR_PATH, GOLD_OUTPUT_PATH, GOLD_ERR_PATH
# are environment variables defined by the fault_injector.py
# diff stdout
diff -B ${INJ_OUTPUT_PATH} ${GOLD_OUTPUT_PATH} > ${DIFF_LOG}
# Special comparison like the following one can be done in this script
grep -q "Result = FAIL" ${INJ_OUTPUT_PATH} >> ${DIFF_LOG}
# diff stderr
diff -B ${INJ_ERR_PATH} ${GOLD_ERR_PATH} > ${DIFF_ERR_LOG}
echo $PWD
ls -l result*
diff ./result.txt ~/rodinia_3.1/cuda/nw/gold/result.1.txt >> ${INJ_OUTPUT_PATH}
diff ./result.txt ~/rodinia_3.1/cuda/nw/gold/result.1.txt #>> ${INJ_OUTPUT_PATH}
if [ $? -ne 0 ]
then
echo "Result = FAIL" >> ${INJ_OUTPUT_PATH}
else
echo "Result = PASS" >> ${INJ_OUTPUT_PATH}
fi
rm result.txt
# Must exit 0
exit 0
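The gold file referenced above (~/rodinia_3.1/cuda/nw/gold/result.1.txt) has to exist before any injection run. A minimal way to produce it is one fault-free execution of the TRACEBACK binary (a sketch, assuming the paths used in this configuration):
# generate the golden traceback output expected by sdc_check.sh
cd ~/rodinia_3.1/cuda/nw
./needle 32000 1
mkdir -p gold
cp result.txt gold/result.1.txt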