Commit 1697f52c authored by German Leon

adding rodinia benchmark

parent 20d55452
include ../../common/make.config
CC := $(CUDA_DIR)/bin/nvcc
INCLUDE := $(CUDA_DIR)/include
SRC = hotspot.cu
EXE = hotspot
release: $(SRC)
	$(CC) $(KERNEL_DIM) -g -G $(SRC) -o $(EXE) -I$(INCLUDE) -L$(CUDA_LIB_DIR)
enum: $(SRC)
	$(CC) $(KERNEL_DIM) -deviceemu $(SRC) -o $(EXE) -I$(INCLUDE) -L$(CUDA_LIB_DIR)
debug: $(SRC)
	$(CC) $(KERNEL_DIM) -g -G $(SRC) -o $(EXE) -I$(INCLUDE) -L$(CUDA_LIB_DIR)
debugenum: $(SRC)
	$(CC) $(KERNEL_DIM) -g -deviceemu $(SRC) -o $(EXE) -I$(INCLUDE) -L$(CUDA_LIB_DIR)
clean: $(SRC)
	rm -f $(EXE) $(EXE).linkinfo result.txt
[DEFAULT]
debug =True
# Name of the gdb executable
gdbExecName = /usr/local/cuda-10.1.243/bin/cuda-gdb
# Which fault model to use: 0 -> single; 1 -> double;
# 2 -> random; 3 -> zeros; 4 -> 16 least significant bits (LSB);
# 5 -> 8 least significant bits (LSB)
# If you want multiple fault models, place them separated by ','
# faultModel = 0,2,3
faultModel = 0
# Injection site
# Can be:
# RF -> Register File
# INST_OUT -> Instruction Output (NOT IMPLEMENTED YET)
# INST_composed -> Instruction Address (NOT IMPLEMENTED YET)
injectionSite = RF
# Max time factor to finish the app; this will be multiplied by the application running time.
# For example, if your app spends 2s and maxWaitTimes is 5, the max running time before it is
# considered a crash is 10s
maxWaitTimes = 5
# binary file of the application
# Must be full path
benchmarkBinary = /home/badia/rodinia_3.1/cuda/hotspot/hotspot
#benchmarkBinary_noverificar = /home/badia/rodinia_3.1/cuda/lavaMD/lavaMD
# Commands to set the session inside GDB environment
benchmarkArgs = 1024 2 10000 /home/badia/rodinia_3.1/data/hotspot/temp_1024 /home/badia/rodinia_3.1/data/hotspot/power_1024 output.out
#benchmarkArgs = 16384 2 2 /home/badia/rodinia_3.1/data/hotspot/temp_16384 /home/badia/rodinia_3.1/data/hotspot/power_16384 output.out
benchmarkArgs_noverificar = 1024 2 10000 /home/badia/rodinia_3.1/data/hotspot/temp_1024 /home/badia/rodinia_3.1/data/hotspot/power_1024 output.out
#benchmarkArgs_noverificar = 16384 2 2 /home/badia/rodinia_3.1/data/hotspot/temp_16384 /home/badia/rodinia_3.1/data/hotspot/power_16384 .
# CSV output file. It will be overwritten at each injection
csvFile = results/hotspot_RF.csv
# You should create a script in the benchmark source folder to verify GOLD_OUTPUT x INJ_OUTPUT
goldenCheckScript = ../rodinia_3.1/cuda/hotspot/sdc_check.sh
# Number of signals that will be sent to the application
seqSignals = 2
# Initial sleep time in seconds before starting to send signals
# Generally the memory setup time
initSleep = 0.56
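# (As a rough guide, this should match the host-side setup time the benchmark itself
# reports in its "BEFORE START KERNEL" message.)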
kernels= compute_temp
section_begin=hotspot.cu:247
section_end=hotspot.cu:257
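# (The section markers presumably bracket the kernel-launch loop inside
# compute_tran_temp() in hotspot.cu, so injections only happen while the GPU kernel runs.)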
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <assert.h>
#include <sys/time.h>
#include <string.h>
#ifdef RD_WG_SIZE_0_0
#define BLOCK_SIZE RD_WG_SIZE_0_0
#elif defined(RD_WG_SIZE_0)
#define BLOCK_SIZE RD_WG_SIZE_0
#elif defined(RD_WG_SIZE)
#define BLOCK_SIZE RD_WG_SIZE
#else
#define BLOCK_SIZE 16
#endif
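/* BLOCK_SIZE (the edge of the square CUDA thread block) can be overridden at compile
   time, e.g. nvcc -DRD_WG_SIZE_0_0=32 ...; otherwise it defaults to 16. */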
#define STR_SIZE 256
/* maximum power density possible (say 300W for a 10mm x 10mm chip) */
#define MAX_PD (3.0e6)
/* required precision in degrees */
#define PRECISION 0.001
#define SPEC_HEAT_SI 1.75e6
#define K_SI 100
/* capacitance fitting factor */
#define FACTOR_CHIP 0.5
/* chip parameters */
float t_chip = 0.0005;
float chip_height = 0.016;
float chip_width = 0.016;
/* ambient temperature, assuming no package at all */
float amb_temp = 80.0;
void run(int argc, char** argv);
/* define timer macros */
#define pin_stats_reset() startCycle()
#define pin_stats_pause(cycles) stopCycle(cycles)
#define pin_stats_dump(cycles) printf("timer: %Lu\n", cycles)
double mysecond() {
struct timeval tp;
struct timezone tzp;
int i = gettimeofday(&tp, &tzp);
return ((double) tp.tv_sec + (double) tp.tv_usec * 1.e-6);
}
void
fatal(char *s)
{
fprintf(stderr, "error: %s\n", s);
}
void writeoutput(float *vect, int grid_rows, int grid_cols, char *file){
int i,j, index=0;
FILE *fp;
char str[STR_SIZE];
if( (fp = fopen(file, "w" )) == 0 )
printf( "The file was not opened\n" );
for (i=0; i < grid_rows; i++)
for (j=0; j < grid_cols; j++)
{
sprintf(str, "%d\t%g\n", index, vect[i*grid_cols+j]);
fputs(str,fp);
index++;
}
fclose(fp);
}
void readinput(float *vect, int grid_rows, int grid_cols, char *file){
int i,j;
FILE *fp;
char str[STR_SIZE];
float val;
if( (fp = fopen(file, "r" )) ==0 )
printf( "The file was not opened\n" );
for (i=0; i <= grid_rows-1; i++)
for (j=0; j <= grid_cols-1; j++)
{
fgets(str, STR_SIZE, fp);
if (feof(fp))
fatal("not enough lines in file");
//if ((sscanf(str, "%d%f", &index, &val) != 2) || (index != ((i-1)*(grid_cols-2)+j-1)))
if ((sscanf(str, "%f", &val) != 1))
fatal("invalid file format");
vect[i*grid_cols+j] = val;
}
fclose(fp);
}
#define IN_RANGE(x, min, max) ((x)>=(min) && (x)<=(max))
#define CLAMP_RANGE(x, min, max) x = (x<(min)) ? min : ((x>(max)) ? max : x )
#define MIN(a, b) ((a)<=(b) ? (a) : (b))
__global__ void calculate_temp(int iteration, //number of iteration
float *power, //power input
float *temp_src, //temperature input/output
float *temp_dst, //temperature input/output
int grid_cols, //Col of grid
int grid_rows, //Row of grid
int border_cols, // border offset
int border_rows, // border offset
float Cap, //Capacitance
float Rx,
float Ry,
float Rz,
float step,
float time_elapsed){
__shared__ float temp_on_cuda[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float power_on_cuda[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float temp_t[BLOCK_SIZE][BLOCK_SIZE]; // saving temporary temperature result
float amb_temp = 80.0;
float step_div_Cap;
float Rx_1,Ry_1,Rz_1;
int bx = blockIdx.x;
int by = blockIdx.y;
int tx=threadIdx.x;
int ty=threadIdx.y;
step_div_Cap=step/Cap;
Rx_1=1/Rx;
Ry_1=1/Ry;
Rz_1=1/Rz;
// each block finally computes result for a small block
// after N iterations.
// it is the non-overlapping small blocks that cover
// all the input data
// calculate the small block size
int small_block_rows = BLOCK_SIZE-iteration*2;//EXPAND_RATE
int small_block_cols = BLOCK_SIZE-iteration*2;//EXPAND_RATE
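// e.g. with BLOCK_SIZE = 16 and iteration = 2 (the pyramid height used in this config),
// each 16x16 block loads a 16x16 tile but only its inner 12x12 (= 16 - 2*2) cells hold
// valid results after the two local iterations; the 2-cell border is the halo consumed
// by the pyramid.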
// calculate the boundary for the block according to
// the boundary of its small block
int blkY = small_block_rows*by-border_rows;
int blkX = small_block_cols*bx-border_cols;
int blkYmax = blkY+BLOCK_SIZE-1;
int blkXmax = blkX+BLOCK_SIZE-1;
// calculate the global thread coordination
int yidx = blkY+ty;
int xidx = blkX+tx;
// load data if it is within the valid input range
int loadYidx=yidx, loadXidx=xidx;
int index = grid_cols*loadYidx+loadXidx;
if(IN_RANGE(loadYidx, 0, grid_rows-1) && IN_RANGE(loadXidx, 0, grid_cols-1)){
temp_on_cuda[ty][tx] = temp_src[index]; // Load the temperature data from global memory to shared memory
power_on_cuda[ty][tx] = power[index];// Load the power data from global memory to shared memory
}
__syncthreads();
// effective range within this block that falls within
// the valid range of the input data
// used to rule out computation outside the boundary.
int validYmin = (blkY < 0) ? -blkY : 0;
int validYmax = (blkYmax > grid_rows-1) ? BLOCK_SIZE-1-(blkYmax-grid_rows+1) : BLOCK_SIZE-1;
int validXmin = (blkX < 0) ? -blkX : 0;
int validXmax = (blkXmax > grid_cols-1) ? BLOCK_SIZE-1-(blkXmax-grid_cols+1) : BLOCK_SIZE-1;
int N = ty-1;
int S = ty+1;
int W = tx-1;
int E = tx+1;
N = (N < validYmin) ? validYmin : N;
S = (S > validYmax) ? validYmax : S;
W = (W < validXmin) ? validXmin : W;
E = (E > validXmax) ? validXmax : E;
bool computed;
for (int i=0; i<iteration ; i++){
computed = false;
if( IN_RANGE(tx, i+1, BLOCK_SIZE-i-2) && \
IN_RANGE(ty, i+1, BLOCK_SIZE-i-2) && \
IN_RANGE(tx, validXmin, validXmax) && \
IN_RANGE(ty, validYmin, validYmax) ) {
computed = true;
temp_t[ty][tx] = temp_on_cuda[ty][tx] + step_div_Cap * (power_on_cuda[ty][tx] +
(temp_on_cuda[S][tx] + temp_on_cuda[N][tx] - 2.0*temp_on_cuda[ty][tx]) * Ry_1 +
(temp_on_cuda[ty][E] + temp_on_cuda[ty][W] - 2.0*temp_on_cuda[ty][tx]) * Rx_1 +
(amb_temp - temp_on_cuda[ty][tx]) * Rz_1);
}
__syncthreads();
if(i==iteration-1)
break;
if(computed) //Assign the computation range
temp_on_cuda[ty][tx]= temp_t[ty][tx];
__syncthreads();
}
// update the global memory
// after the last iteration, only threads coordinated within the
// small block perform the calculation and switch on ``computed''
if (computed){
temp_dst[index]= temp_t[ty][tx];
}
}
/*
compute N time steps
*/
int compute_tran_temp(float *MatrixPower,float *MatrixTemp[2], int col, int row, \
int total_iterations, int num_iterations, int blockCols, int blockRows, int borderCols, int borderRows)
{
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(blockCols, blockRows);
float grid_height = chip_height / row;
float grid_width = chip_width / col;
float Cap = FACTOR_CHIP * SPEC_HEAT_SI * t_chip * grid_width * grid_height;
float Rx = grid_width / (2.0 * K_SI * t_chip * grid_height);
float Ry = grid_height / (2.0 * K_SI * t_chip * grid_width);
float Rz = t_chip / (K_SI * grid_height * grid_width);
float max_slope = MAX_PD / (FACTOR_CHIP * t_chip * SPEC_HEAT_SI);
float step = PRECISION / max_slope;
float t;
float time_elapsed;
time_elapsed=0.001;
int src = 1, dst = 0;
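// Ping-pong between the two temperature buffers: each outer step swaps src and dst,
// reads MatrixTemp[src] and writes MatrixTemp[dst]; the index of the buffer holding
// the final result (dst) is returned to the caller.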
for (t = 0; t < total_iterations; t+=num_iterations) {
int temp = src;
src = dst;
dst = temp;
calculate_temp<<<dimGrid, dimBlock>>>(MIN(num_iterations, total_iterations-t), MatrixPower,MatrixTemp[src],MatrixTemp[dst],\
col,row,borderCols, borderRows, Cap,Rx,Ry,Rz,step,time_elapsed);
}
cudaDeviceSynchronize();
return dst;
}
void usage(int argc, char **argv)
{
fprintf(stderr, "Usage: %s <grid_rows/grid_cols> <pyramid_height> <sim_time> <temp_file> <power_file> <output_file>\n", argv[0]);
fprintf(stderr, "\t<grid_rows/grid_cols> - number of rows/cols in the grid (positive integer)\n");
fprintf(stderr, "\t<pyramid_height> - pyramid heigh(positive integer)\n");
fprintf(stderr, "\t<sim_time> - number of iterations\n");
fprintf(stderr, "\t<temp_file> - name of the file containing the initial temperature values of each cell\n");
fprintf(stderr, "\t<power_file> - name of the file containing the dissipated power values of each cell\n");
fprintf(stderr, "\t<output_file> - name of the output file\n");
exit(1);
}
int main(int argc, char** argv)
{
printf("WG size of kernel = %d X %d\n", BLOCK_SIZE, BLOCK_SIZE);
run(argc,argv);
return EXIT_SUCCESS;
}
void run(int argc, char** argv)
{
int size;
int grid_rows,grid_cols;
float *FilesavingTemp,*FilesavingPower,*MatrixOut,*MatrixOut_cp;
char *tfile, *pfile, *ofile;
double timer = mysecond();
int total_iterations = 60;
int pyramid_height = 1; // number of iterations
if (argc != 7)
usage(argc, argv);
if((grid_rows = atoi(argv[1]))<=0||
(grid_cols = atoi(argv[1]))<=0||
(pyramid_height = atoi(argv[2]))<=0||
(total_iterations = atoi(argv[3]))<=0)
usage(argc, argv);
tfile=argv[4];
pfile=argv[5];
ofile=argv[6];
int cmp=strcmp(ofile,".");
size=grid_rows*grid_cols;
/* --------------- pyramid parameters --------------- */
#define EXPAND_RATE 2 // adding one iteration extends the pyramid base by 2 on each borderline
int borderCols = (pyramid_height)*EXPAND_RATE/2;
int borderRows = (pyramid_height)*EXPAND_RATE/2;
int smallBlockCol = BLOCK_SIZE-(pyramid_height)*EXPAND_RATE;
int smallBlockRow = BLOCK_SIZE-(pyramid_height)*EXPAND_RATE;
int blockCols = grid_cols/smallBlockCol+((grid_cols%smallBlockCol==0)?0:1);
int blockRows = grid_rows/smallBlockRow+((grid_rows%smallBlockRow==0)?0:1);
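// Example with the arguments used in this repo's config (1024 2 10000): pyramid_height = 2,
// so borderCols = borderRows = 2, smallBlockCol = smallBlockRow = 16 - 2*2 = 12,
// and blockCols = blockRows = ceil(1024/12) = 86.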
FilesavingTemp = (float *) malloc(size*sizeof(float));
FilesavingPower = (float *) malloc(size*sizeof(float));
MatrixOut = (float *) calloc (size, sizeof(float));
if(cmp)MatrixOut_cp = (float *) calloc (size, sizeof(float));
else MatrixOut_cp=MatrixOut;
if( !FilesavingPower || !FilesavingTemp || !MatrixOut || !MatrixOut_cp)
fatal("unable to allocate memory");
printf("pyramidHeight: %d\ngridSize: [%d, %d]\nborder:[%d, %d]\nblockGrid:[%d, %d]\ntargetBlock:[%d, %d]\n",\
pyramid_height, grid_cols, grid_rows, borderCols, borderRows, blockCols, blockRows, smallBlockCol, smallBlockRow);
readinput(FilesavingTemp, grid_rows, grid_cols, tfile);
readinput(FilesavingPower, grid_rows, grid_cols, pfile);
float *MatrixTemp[2], *MatrixPower;
cudaMalloc((void**)&MatrixTemp[0], sizeof(float)*size);
cudaMalloc((void**)&MatrixTemp[1], sizeof(float)*size);
cudaMemcpy(MatrixTemp[0], FilesavingTemp, sizeof(float)*size, cudaMemcpyHostToDevice);
cudaMalloc((void**)&MatrixPower, sizeof(float)*size);
cudaMemcpy(MatrixPower, FilesavingPower, sizeof(float)*size, cudaMemcpyHostToDevice);
printf("Start computing the transient temperature\n");
printf("BEFORE START KERNEL %lf\n", mysecond() - timer);
double t1 = mysecond();
int ret = compute_tran_temp(MatrixPower,MatrixTemp,grid_cols,grid_rows, \
total_iterations,pyramid_height, blockCols, blockRows, borderCols, borderRows);
double exec_time = mysecond() - t1;
printf("KERNEL EXECUTION TIME %lf\n", exec_time);
printf("Ending simulation\n");
cudaMemcpy(MatrixOut, MatrixTemp[ret], sizeof(float)*size, cudaMemcpyDeviceToHost);
if (cmp) {
/* cudaFree(MatrixPower);
cudaFree(MatrixTemp[0]);
cudaFree(MatrixTemp[1]);
cudaMalloc((void**)&MatrixTemp[0], sizeof(float)*size);
cudaMalloc((void**)&MatrixTemp[1], sizeof(float)*size);
cudaMemcpy(MatrixTemp[0], FilesavingTemp, sizeof(float)*size, cudaMemcpyHostToDevice);
cudaMemcpy(MatrixPower, FilesavingPower, sizeof(float)*size, cudaMemcpyHostToDevice);
printf("BEFORE START KERNEL FOR COMPARASION %lf\n", mysecond() - timer);
ret = compute_tran_temp(MatrixPower,MatrixTemp,grid_cols,grid_rows, \
total_iterations,pyramid_height, blockCols, blockRows, borderCols, borderRows);
printf("KERNEL EXECUTION TIME %lf\n", exec_time);
printf("Ending comparation\n");
cudaMemcpy(MatrixOut_cp, MatrixTemp[ret], sizeof(float)*size, );
int correcto=1;
for (int i=0;i<sizeof(float)*size;i++)
if (MatrixOut[i]!=MatrixOut_cp[i]) { correcto=0;printf("%f-%d-%f\n",MatrixOut[i],i,MatrixOut_cp[i]);}
else {printf("\n\n\n##########+%d+\n\n\n",i);}
printf("%s\n", correcto? "Result = PASS" : "Result = FAIL");
printf("Ending comparation\n");*/
writeoutput(MatrixOut,grid_rows, grid_cols, ofile);
}
cudaFree(MatrixPower);
cudaFree(MatrixTemp[0]);
cudaFree(MatrixTemp[1]);
free(MatrixOut);
}
for i in HALF SINGLE DOUBLE;
do
    #make PRECISION=$i DEBUG=1 clean
    make PRECISION=$i DEBUG=1 generate
    #make PRECISION=$i DEBUG=1 test
done;
exit 0
# cuda/lud_cuda -i ../../data/lud/2048.dat
cuda/lud_cuda -s 8192 -v
#!/usr/bin/sh
# SDC checking diff
# Must compare all things here
# Any particular output comparison must be made here
# To be considered as an SDC or CRASH the
# DIFF_LOG and DIFF_ERR_LOG files must not be empty
# INJ_OUTPUT_PATH, INJ_ERR_PATH, GOLD_OUTPUT_PATH, GOLD_ERR_PATH
# are environment variables defined by the fault_injector.py
# diff stdout
diff -B ${INJ_OUTPUT_PATH} ${GOLD_OUTPUT_PATH} > ${DIFF_LOG}
# Special comparison like the following one can be done in this script
grep -q "Result = FAIL" ${INJ_OUTPUT_PATH} >> ${DIFF_LOG}
# diff stderr
diff -B ${INJ_ERR_PATH} ${GOLD_ERR_PATH} > ${DIFF_ERR_LOG}
# Must exit 0
exit 0
include ../../common/make.config
CC := $(CUDA_DIR)/bin/nvcc
INCLUDE := $(CUDA_DIR)/include
SRC = needle.cu
EXE = needle
release: $(SRC)
	$(CC) ${KERNEL_DIM} $(SRC) -o $(EXE)_MUDO -I$(INCLUDE) -L$(CUDA_LIB_DIR)
	$(CC) ${KERNEL_DIM} $(SRC) -o $(EXE) -I$(INCLUDE) -L$(CUDA_LIB_DIR) -D TRACEBACK
enum: $(SRC)
	$(CC) ${KERNEL_DIM} -deviceemu $(SRC) -o $(EXE) -I$(INCLUDE) -L$(CUDA_LIB_DIR)
debug: $(SRC)
	$(CC) ${KERNEL_DIM} -g -G $(SRC) -o $(EXE)_MUDO -I$(INCLUDE) -L$(CUDA_LIB_DIR)
	$(CC) ${KERNEL_DIM} -g -G $(SRC) -o $(EXE) -I$(INCLUDE) -L$(CUDA_LIB_DIR) -D TRACEBACK
debugenum: $(SRC)
	$(CC) ${KERNEL_DIM} -g -deviceemu $(SRC) -o $(EXE) -I$(INCLUDE) -L$(CUDA_LIB_DIR)
clean: $(SRC)
	rm -f $(EXE) $(EXE)_MUDO $(EXE).linkinfo result.txt
#define LIMIT -999
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include "needle.h"
#include <cuda.h>
#include <sys/time.h>
// includes, kernels
#include "needle_kernel.cu"
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest( int argc, char** argv);
int blosum62[24][24] = {
{ 4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0, -2, -1, 0, -4},
{-1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3, -1, 0, -1, -4},
{-2, 0, 6, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3, 3, 0, -1, -4},
{-2, -2, 1, 6, -3, 0, 2, -1, -1, -3, -4, -1, -3, -3, -1, 0, -1, -4, -3, -3, 4, 1, -1, -4},
{ 0, -3, -3, -3, 9, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1, -3, -3, -2, -4},
{-1, 1, 0, 0, -3, 5, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2, -1, -2, 0, 3, -1, -4},
{-1, 0, 0, 2, -4, 2, 5, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3, -2, -2, 1, 4, -1, -4},
{ 0, -2, 0, -1, -3, -2, -2, 6, -2, -4, -4, -2, -3, -3, -2, 0, -2, -2, -3, -3, -1, -2, -1, -4},
{-2, 0, 1, -1, -3, 0, 0, -2, 8, -3, -3, -1, -2, -1, -2, -1, -2, -2, 2, -3, 0, 0, -1, -4},
{-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3, -3, -3, -1, -4},
{-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2, -1, 1, -4, -3, -1, -4},
{-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5, -1, -3, -1, 0, -1, -3, -2, -2, 0, 1, -1, -4},
{-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 5, 0, -2, -1, -1, -1, -1, 1, -3, -1, -1, -4},
{-2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 6, -4, -2, -2, 1, 3, -1, -3, -3, -1, -4},
{-1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4, 7, -1, -1, -4, -3, -2, -2, -1, -2, -4},
{ 1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4, 1, -3, -2, -2, 0, 0, 0, -4},
{ 0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 5, -2, -2, 0, -1, -1, 0, -4},
{-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 11, 2, -3, -4, -3, -2, -4},
{-2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2, 7, -1, -3, -2, -1, -4},
{ 0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4, -3, -2, -1, -4},
{-2, -1, 3, 4, -3, 0, 1, -1, 0, -3, -4, 0, -3, -3, -2, 0, -1, -4, -3, -3, 4, 1, -1, -4},
{-1, 0, 0, 1, -3, 3, 4, -2, 0, -3, -3, 1, -1, -3, -1, 0, -1, -3, -2, -2, 1, 4, -1, -4},
{ 0, -1, -1, -1, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, 0, 0, -2, -1, -1, -1, -1, -1, -4},
{-4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, 1}
};
double gettime() {
struct timeval t;
gettimeofday(&t,NULL);
return t.tv_sec+t.tv_usec*1e-6;
}
double mysecond() {
struct timeval tp;
struct timezone tzp;
int i = gettimeofday(&tp, &tzp);
return ((double) tp.tv_sec + (double) tp.tv_usec * 1.e-6);
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
double timer;
int
main( int argc, char** argv)
{
timer = mysecond();
printf("WG size of kernel = %d \n", BLOCK_SIZE);
runTest( argc, argv);
return EXIT_SUCCESS;
}
void usage(int argc, char **argv)
{
fprintf(stderr, "Usage: %s <max_rows/max_cols> <penalty> \n", argv[0]);
fprintf(stderr, "\t<dimension> - x and y dimensions\n");
fprintf(stderr, "\t<penalty> - penalty(positive integer)\n");
exit(1);
}
void needle_cuda_main( int* referrence_cuda,
int* matrix_cuda,
int max_cols,
int penalty)
{
printf("BEFORE START KERNEL %lf\n", mysecond() - timer);
double t1 = mysecond();
dim3 dimGrid;
dim3 dimBlock(BLOCK_SIZE, 1);
int block_width = ( max_cols - 1 )/BLOCK_SIZE;
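// The score matrix is swept in anti-diagonal (wavefront) order over BLOCK_SIZE-wide
// blocks: the first loop grows the wavefront across the top-left triangle (diagonal i
// launches i blocks), the second shrinks it across the bottom-right triangle. With the
// 32000 1 arguments used here, block_width = 32000 / 16 = 2000.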
printf("Processing top-left matrix\n");
//process top-left matrix
for( int i = 1 ; i <= block_width ; i++){
dimGrid.x = i;
dimGrid.y = 1;
needle_cuda_shared_1<<<dimGrid, dimBlock>>>(referrence_cuda, matrix_cuda
,max_cols, penalty, i, block_width);
}
printf("Processing bottom-right matrix\n");
//process bottom-right matrix
for( int i = block_width - 1 ; i >= 1 ; i--){
dimGrid.x = i;
dimGrid.y = 1;
needle_cuda_shared_2<<<dimGrid, dimBlock>>>(referrence_cuda, matrix_cuda
,max_cols, penalty, i, block_width);
}
cudaDeviceSynchronize();
double exec_time = mysecond() - t1;
printf("KERNEL EXECUTION TIME %lf\n", exec_time);
}
void runTest( int argc, char** argv)
{
int max_rows, max_cols, penalty;
int *input_itemsets, *output_itemsets, *referrence;
int *matrix_cuda, *referrence_cuda;
int size;
// the lengths of the two sequences should be divisible by 16.
// At the current stage max_rows needs to equal max_cols
if (argc == 3)
{
max_rows = atoi(argv[1]);
max_cols = atoi(argv[1]);
penalty = atoi(argv[2]);
}
else{
usage(argc, argv);
}
if(atoi(argv[1])%16!=0){
fprintf(stderr,"The dimension values must be a multiple of 16\n");
exit(1);
}
max_rows = max_rows + 1;
max_cols = max_cols + 1;
referrence = (int *)malloc( max_rows * max_cols * sizeof(int) );
input_itemsets = (int *)malloc( max_rows * max_cols * sizeof(int) );
output_itemsets = (int *)malloc( max_rows * max_cols * sizeof(int) );
if (!input_itemsets)
fprintf(stderr, "error: can not allocate memory");
srand ( 7 );
for (int i = 0 ; i < max_cols; i++){
for (int j = 0 ; j < max_rows; j++){
input_itemsets[i*max_cols+j] = 0;
}
}
printf("Start Needleman-Wunsch\n");
for( int i=1; i< max_rows ; i++){ //please define your own sequence.
input_itemsets[i*max_cols] = rand() % 10 + 1;
}
for( int j=1; j< max_cols ; j++){ //please define your own sequence.
input_itemsets[j] = rand() % 10 + 1;
}
for (int i = 1 ; i < max_cols; i++){
for (int j = 1 ; j < max_rows; j++){
referrence[i*max_cols+j] = blosum62[input_itemsets[i*max_cols]][input_itemsets[j]];
}
}
for( int i = 1; i< max_rows ; i++)
input_itemsets[i*max_cols] = -i * penalty;
for( int j = 1; j< max_cols ; j++)
input_itemsets[j] = -j * penalty;
size = max_cols * max_rows;
cudaMalloc((void**)& referrence_cuda, sizeof(int)*size);
cudaMalloc((void**)& matrix_cuda, sizeof(int)*size);
cudaMemcpy(referrence_cuda, referrence, sizeof(int) * size, cudaMemcpyHostToDevice);
cudaMemcpy(matrix_cuda, input_itemsets, sizeof(int) * size, cudaMemcpyHostToDevice);
needle_cuda_main(referrence_cuda, matrix_cuda
,max_cols, penalty);
cudaMemcpy(output_itemsets, matrix_cuda, sizeof(int) * size, cudaMemcpyDeviceToHost);
//#define TRACEBACK
#ifdef TRACEBACK
FILE *fpo = fopen("result.txt","w");
fprintf(fpo, "print traceback value GPU:\n");
for (int i = max_rows - 2, j = max_rows - 2; i >= 0 && j >= 0;){
int nw, n, w, traceback;
if ( i == max_rows - 2 && j == max_rows - 2 )
fprintf(fpo, "%d ", output_itemsets[ i * max_cols + j]); //print the first element
if ( i == 0 && j == 0 )
break;
if ( i > 0 && j > 0 ){
nw = output_itemsets[(i - 1) * max_cols + j - 1];
w = output_itemsets[ i * max_cols + j - 1 ];
n = output_itemsets[(i - 1) * max_cols + j];
}
else if ( i == 0 ){
nw = n = LIMIT;
w = output_itemsets[ i * max_cols + j - 1 ];
}
else if ( j == 0 ){
nw = w = LIMIT;
n = output_itemsets[(i - 1) * max_cols + j];
}
else{
}
//traceback = maximum(nw, w, n);
int new_nw, new_w, new_n;
new_nw = nw + referrence[i * max_cols + j];
new_w = w - penalty;
new_n = n - penalty;
traceback = maximum(new_nw, new_w, new_n);
if(traceback == new_nw)
traceback = nw;
if(traceback == new_w)
traceback = w;
if(traceback == new_n)
traceback = n;
fprintf(fpo, "%d ", traceback);
if(traceback == nw )
{i--; j--; continue;}
else if(traceback == w )
{j--; continue;}
else if(traceback == n )
{i--; continue;}
else
;
}
fclose(fpo);
#endif
cudaFree(referrence_cuda);
cudaFree(matrix_cuda);
free(referrence);
free(input_itemsets);
free(output_itemsets);
}
#ifdef RD_WG_SIZE_0_0
#define BLOCK_SIZE RD_WG_SIZE_0_0
#elif defined(RD_WG_SIZE_0)
#define BLOCK_SIZE RD_WG_SIZE_0
#elif defined(RD_WG_SIZE)
#define BLOCK_SIZE RD_WG_SIZE
#else
#define BLOCK_SIZE 16
#endif
//#define TRACE
#include "needle.h"
#include <stdio.h>
#define SDATA( index) CUT_BANK_CHECKER(sdata, index)
__device__ __host__ int
maximum( int a,
int b,
int c){
int k;
if( a <= b )
k = b;
else
k = a;
if( k <=c )
return(c);
else
return(k);
}
__global__ void
needle_cuda_shared_1( int* referrence,
int* matrix_cuda,
int cols,
int penalty,
int i,
int block_width)
{
int bx = blockIdx.x;
int tx = threadIdx.x;
int b_index_x = bx;
int b_index_y = i - 1 - bx;
int index = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + tx + ( cols + 1 );
int index_n = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + tx + ( 1 );
int index_w = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + ( cols );
int index_nw = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x;
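// index_nw points at the north-west halo cell of this block's tile; index_n walks the
// halo row above the tile, index_w the halo column to its left (offset by cols*tx below),
// and index is the first interior cell of thread tx's column (offset by cols*ty below).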
__shared__ int temp[BLOCK_SIZE+1][BLOCK_SIZE+1];
__shared__ int ref[BLOCK_SIZE][BLOCK_SIZE];
if (tx == 0)
temp[tx][0] = matrix_cuda[index_nw];
for ( int ty = 0 ; ty < BLOCK_SIZE ; ty++)
ref[ty][tx] = referrence[index + cols * ty];
__syncthreads();
temp[tx + 1][0] = matrix_cuda[index_w + cols * tx];
__syncthreads();
temp[0][tx + 1] = matrix_cuda[index_n];
__syncthreads();
for( int m = 0 ; m < BLOCK_SIZE ; m++){
if ( tx <= m ){
int t_index_x = tx + 1;
int t_index_y = m - tx + 1;
temp[t_index_y][t_index_x] = maximum( temp[t_index_y-1][t_index_x-1] + ref[t_index_y-1][t_index_x-1],
temp[t_index_y][t_index_x-1] - penalty,
temp[t_index_y-1][t_index_x] - penalty);
}
__syncthreads();
}
for( int m = BLOCK_SIZE - 2 ; m >=0 ; m--){
if ( tx <= m){
int t_index_x = tx + BLOCK_SIZE - m ;
int t_index_y = BLOCK_SIZE - tx;
temp[t_index_y][t_index_x] = maximum( temp[t_index_y-1][t_index_x-1] + ref[t_index_y-1][t_index_x-1],
temp[t_index_y][t_index_x-1] - penalty,
temp[t_index_y-1][t_index_x] - penalty);
}
__syncthreads();
}
for ( int ty = 0 ; ty < BLOCK_SIZE ; ty++)
matrix_cuda[index + ty * cols] = temp[ty+1][tx+1];
}
__global__ void
needle_cuda_shared_2( int* referrence,
int* matrix_cuda,
int cols,
int penalty,
int i,
int block_width)
{
int bx = blockIdx.x;
int tx = threadIdx.x;
int b_index_x = bx + block_width - i ;
int b_index_y = block_width - bx -1;
int index = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + tx + ( cols + 1 );
int index_n = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + tx + ( 1 );
int index_w = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + ( cols );
int index_nw = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x;
__shared__ int temp[BLOCK_SIZE+1][BLOCK_SIZE+1];
__shared__ int ref[BLOCK_SIZE][BLOCK_SIZE];
for ( int ty = 0 ; ty < BLOCK_SIZE ; ty++)
ref[ty][tx] = referrence[index + cols * ty];
__syncthreads();
if (tx == 0)
temp[tx][0] = matrix_cuda[index_nw];
temp[tx + 1][0] = matrix_cuda[index_w + cols * tx];
__syncthreads();
temp[0][tx + 1] = matrix_cuda[index_n];
__syncthreads();
for( int m = 0 ; m < BLOCK_SIZE ; m++){
if ( tx <= m ){
int t_index_x = tx + 1;
int t_index_y = m - tx + 1;
temp[t_index_y][t_index_x] = maximum( temp[t_index_y-1][t_index_x-1] + ref[t_index_y-1][t_index_x-1],
temp[t_index_y][t_index_x-1] - penalty,
temp[t_index_y-1][t_index_x] - penalty);
}
__syncthreads();
}
for( int m = BLOCK_SIZE - 2 ; m >=0 ; m--){
if ( tx <= m){
int t_index_x = tx + BLOCK_SIZE - m ;
int t_index_y = BLOCK_SIZE - tx;
temp[t_index_y][t_index_x] = maximum( temp[t_index_y-1][t_index_x-1] + ref[t_index_y-1][t_index_x-1],
temp[t_index_y][t_index_x-1] - penalty,
temp[t_index_y-1][t_index_x] - penalty);
}
__syncthreads();
}
for ( int ty = 0 ; ty < BLOCK_SIZE ; ty++)
matrix_cuda[index + ty * cols] = temp[ty+1][tx+1];
}
[DEFAULT]
debug =True
# Name of the gdb executable
gdbExecName = /usr/local/cuda-10.1.243/bin/cuda-gdb
# Which fault model to use: 0 -> single; 1 -> double;
# 2 -> random; 3 -> zeros; 4 -> 16 least significant bits (LSB);
# 5 -> 8 least significant bits (LSB)
# If you want multiple fault models, place them separated by ','
# faultModel = 0,2,3
faultModel = 0
# Injection site
# Can be:
# RF -> Register File
# INST_OUT -> Instruction Output (NOT IMPLEMENTED YET)
# INST_composed -> Instruction Address (NOT IMPLEMENTED YET)
injectionSite = INST_OUT
# Max time factor to finish the app; this will be multiplied by the application running time.
# For example, if your app spends 2s and maxWaitTimes is 5, the max running time before it is
# considered a crash is 10s
maxWaitTimes = 5
# binary file of the application
# Must be full path
benchmarkBinary = /home/badia/rodinia_3.1/cuda/nw/needle
benchmarkBinary_noverificar = /home/badia/rodinia_3.1/cuda/nw/needle_MUDO
#benchmarkBinary_noverificar = /home/badia/rodinia_3.1/cuda/lavaMD/lavaMD
# Commands to set the session inside GDB environment
benchmarkArgs = 32000 1
#benchmarkArgs_noverificar = -s 2048
# CSV output file. It will be overwritten at each injection
csvFile = /home/badia/rodinia_3.1/cuda/nw/fi_nw_single_bit.csv
# You should create a script in the benchmark source folder to verify GOLD_OUTPUT x INJ_OUTPUT
goldenCheckScript = ../rodinia_3.1/cuda/nw/sdc_check.sh
# Number of signals that will be sent to the application
seqSignals = 2
# Initial sleep time in seconds before starting to send signals
# Generally the memory setup time
initSleep = 9.46
kernels = needle_cuda_shared_1,needle_cuda_shared_2
section_begin=needle.cu:85
section_end=needle.cu:107
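# (As for hotspot, the section markers presumably bracket the kernel-launch loops
# inside needle_cuda_main() in needle.cu.)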
./needle 32000 1
#!/usr/bin/sh
# SDC checking diff
# Must compare all things here
# Any particular output comparison must be made here
# To be considered as an SDC or CRASH the
# DIFF_LOG and DIFF_ERR_LOG files must not be empty
# INJ_OUTPUT_PATH, INJ_ERR_PATH, GOLD_OUTPUT_PATH, GOLD_ERR_PATH
# are environment variables defined by the fault_injector.py
# diff stdout
diff -B ${INJ_OUTPUT_PATH} ${GOLD_OUTPUT_PATH} > ${DIFF_LOG}
# Special comparison like the following one can be done in this script
grep -q "Result = FAIL" ${INJ_OUTPUT_PATH} >> ${DIFF_LOG}
# diff stderr
diff -B ${INJ_ERR_PATH} ${GOLD_ERR_PATH} > ${DIFF_ERR_LOG}
echo $PWD
ls -l result*
diff ./result.txt ~/rodinia_3.1/cuda/nw/gold/result.1.txt >> ${INJ_OUTPUT_PATH}
diff ./result.txt ~/rodinia_3.1/cuda/nw/gold/result.1.txt #>> ${INJ_OUTPUT_PATH}
if [ $? -ne 0 ]
then
echo "Result = FAIL" >> ${INJ_OUTPUT_PATH}
else
echo "Result = PASS" >> ${INJ_OUTPUT_PATH}
fi
rm result.txt
# Must exit 0
exit 0