#init_string = '"file {}; set args {}"'.format(benchmark_binary, benchmark_args_striped)
print("SECTION {}".format(section))
# Build the init string: one literal-double-quoted, ';'-separated value.
# Fields, in order: the constants False and True, then `kernels` and `trace`,
# followed by what look like (CUDA-)GDB commands that load the benchmark
# binary, set its arguments and enable break-on-launch for application kernels.
# NOTE(review): the meaning of the leading False/True fields is decided by
# whatever consumes this string -- confirm against that consumer.
init_string = (
    '"{};{};{};{};file {}; set args {}; set cuda break_on_launch application"'
).format(False, True, kernels, trace, benchmark_binary, benchmark_args_striped)
// GPU kernel source, located in the current directory. It is included
// directly as a .cu file instead of through a header file because of
// complications with passing constant-memory variables.
#include "./kernel_gpu_cuda.cu"
// Keep this value low so that more blocks sharing shared memory can run
// concurrently. The code does not work for values larger than 110. More
// speedup can be achieved with a larger number if no shared memory is used.
#define NUMBER_PAR_PER_BOX 100
/* Disabled: #define NUMBER_THREADS 128 -- this should be roughly equal to NUMBER_PAR_PER_BOX for best performance. */