Initial commit

e4ae2422 · German Leon · e4ae2422 · e4ae2422 · e4ae2422 · e4ae2422
Commit e4ae2422 authored Dec 11, 2020 by German Leon
--- a/app_profiler.py
+++ b/app_profiler.py
+#!/usr/bin/env python3
+import argparse
+import os
+import re
+import time
+import common_functions as cf
+import common_parameters as cp
+def generate_dict(sm_version, input_file_name):
+    """Collect the register count reported for each kernel of one SM version.
+
+    The input file holds the stderr captured while compiling the application
+    with the ``-Xptxas -v`` options. A "Compiling entry function" line that
+    names ``sm_version`` selects a kernel, and the next "Used N registers"
+    line supplies its register count.
+
+    :param sm_version: architecture name as quoted in the log; matched literally.
+    :param input_file_name: path of the captured compiler output.
+    :return: dict mapping kernel name to register count (int). When a kernel
+        is reported more than once, the first count is kept and a warning is
+        printed.
+    """
+    # Build both patterns once instead of once per line. Escaping keeps regex
+    # metacharacters in sm_version from being read as pattern syntax.
+    entry_re = re.compile(
+        r".*Compiling entry function.*'(\S+)'.*for.*'{}'.*".format(re.escape(sm_version)))
+    used_re = re.compile(r".*Used[ ]+(\d+)[ ]+registers.*")
+    # dictionary to store the number of allocated registers per static kernel
+    kernel_reg = {}
+    kernel_name = ""  # kernel announced by the most recent matching entry line
+    check_for_register_count = False
+    with open(input_file_name, "r") as f:
+        for line in f:
+            entry = entry_re.match(line)
+            if entry:
+                kernel_name = entry.group(1)
+                check_for_register_count = True
+            used = used_re.match(line)
+            if check_for_register_count and used:
+                if kernel_name not in kernel_reg:
+                    # associate the extracted register number with the kernel name
+                    kernel_reg[kernel_name] = int(used.group(1))
+                else:
+                    print("Warning: {} exists in the kernel_reg dictionary. "
+                          "Skipping this register count.".format(kernel_name))
+                # One register line is consumed per entry line
+                check_for_register_count = False
+    return kernel_reg
+"""
+Function that calls the profiler based on the injection mode
+"""
+def profiler_caller(gdb_exec, kernel, benchmark_binary, benchmark_args,device,section,kernel_end):
+    """Run the profiler script under gdb and return the averaged timings.
+
+    The gdb command is executed cp.MAX_TIMES_TO_PROFILE times. After each run
+    the 'Tiempo' value of the DEFAULT section of tmpxxx_return_profiler.conf
+    is read, and leftover gdb/benchmark processes are killed.
+
+    :param gdb_exec: gdb executable used to launch the profiler script.
+    :param kernel: location handed to gdb's ``break`` command.
+    :param benchmark_binary: binary loaded with gdb's ``file`` command.
+    :param benchmark_args: benchmark arguments; literal "\\n" sequences and
+        backslashes are removed before use.
+    :param device: value exported as CUDA_VISIBLE_DEVICES.
+    :param section: first field of the init string handed to the script.
+    :param kernel_end: second field of the init string handed to the script.
+    :return: tuple (mean 'Tiempo' value, mean wall-clock time of the gdb run).
+    """
+    cleaned_args = benchmark_args.replace('\\n', '').replace('\\', '')
+    print ("KERNEL"+kernel)
+    gdb_init = '"{};{};file {}; set args {}; break {}"'.format(
+        section, kernel_end, benchmark_binary, cleaned_args, kernel)
+    profiler_cmd = 'env CUDA_VISIBLE_DEVICES={} {} -ex \'py arg0 = {}\' -n -batch -x {}'.format(
+        device, gdb_exec, gdb_init, cp.PROFILER_SCRIPT)
+    print ("Profiler caller")
+    if cp.DEBUG:
+        print("PROFILER CMD: {}".format(profiler_cmd))
+    wall_clock_samples = []
+    reported_samples = []
+    for _ in range(cp.MAX_TIMES_TO_PROFILE):
+        began = time.time()
+        os.system(profiler_cmd)
+        finished = time.time()
+        # The profiler script reports its own measurement through this file
+        reported = cf.load_config_file("tmpxxx_return_profiler.conf")
+        reported_samples.append(float(reported.get('DEFAULT', 'Tiempo')))
+        wall_clock_samples.append(finished - began)
+        gdb_name = os.path.basename(gdb_exec)
+        app_name = os.path.basename(benchmark_binary)
+        cf.kill_all("killall -9 {}; killall -9 {}".format(gdb_name, app_name))
+    mean_reported = sum(reported_samples) / cp.MAX_TIMES_TO_PROFILE
+    mean_wall_clock = sum(wall_clock_samples) / cp.MAX_TIMES_TO_PROFILE
+    return mean_reported, mean_wall_clock
+"""
+Function to generate the gold execution
+"""
+def generate_gold(gdb_exec, benchmark_binary, benchmark_args,device):
+    """Run the application once under gdb and capture its golden output.
+
+    stdout is redirected to cp.GOLD_OUTPUT_PATH and stderr to
+    cp.GOLD_ERR_PATH. The tmp directory under cp.LOGS_PATH, next to this
+    script, is created if needed and emptied first.
+
+    :param gdb_exec: gdb executable used to launch the profiler script.
+    :param benchmark_binary: binary loaded with gdb's ``file`` command.
+    :param benchmark_args: arguments passed through gdb's ``set args``.
+    :param device: value exported as CUDA_VISIBLE_DEVICES.
+    :return: the status returned by os.system for the gdb command.
+    """
+    script_dir = os.path.dirname(os.path.realpath(__file__))
+    tmp_path = script_dir + "/" + cp.LOGS_PATH + "/tmp"
+    # Create the scratch directory, then clear whatever a previous run left
+    for shell_cmd in ("mkdir -p " + tmp_path, "rm -rf " + tmp_path + "/*"):
+        os.system(shell_cmd)
+    gdb_init = '"file {}; set args {}"'.format(benchmark_binary, benchmark_args)
+    profiler_cmd = 'env CUDA_VISIBLE_DEVICES={} {} -ex \'py arg0 = {}\' -n -batch -x {} > {} 2> {}'.format(
+        device, gdb_exec, gdb_init, cp.PROFILER_SCRIPT, cp.GOLD_OUTPUT_PATH, cp.GOLD_ERR_PATH)
+    if cp.DEBUG:
+        print("PROFILER CMD: {}".format(profiler_cmd))
+    # Execute and save gold file
+    return os.system(profiler_cmd)
+def main():
+    """Profile the configured application and store its golden output.
+
+    Reads the configuration file given with -c/--conf, times the application
+    through profiler_caller, records the golden stdout/stderr through
+    generate_gold, and saves both timings to cp.KERNEL_INFO_DIR.
+
+    :raises EnvironmentError: when the golden run returns a non-zero status.
+    """
+    # Remove the kernel-info file left by a previous run
+    os.system("rm -f {}".format(cp.KERNEL_INFO_DIR))
+    cli = argparse.ArgumentParser()
+    cli.add_argument('-c', '--conf', dest="config_file", help='Configuration file', required=True)
+    cli.add_argument('-d', '--device', dest="device", help="The GPU to perform FI. Default is 0.",
+                     required=False, default=0, type=int)
+    args = cli.parse_args()
+    # Read the configuration file with data for all the apps that will be executed
+    conf = cf.load_config_file(args.config_file)
+    # First set env vars
+    cf.set_python_env()
+    # Profiler step: measures the run time and produces the golden copy
+    print("###################################################\n1 - Profiling application")
+    defaults = conf['DEFAULT']
+    # A "_noverificar" key, when present, takes precedence over the plain key
+    binary_key = 'benchmarkBinary'
+    if 'benchmarkBinary_noverificar' in defaults:
+        binary_key = 'benchmarkBinary_noverificar'
+    benchmark_binary = conf.get('DEFAULT', binary_key)
+    args_key = 'benchmarkArgs'
+    if 'benchmarkArgs_noverificar' in defaults:
+        args_key = 'benchmarkArgs_noverificar'
+    benchmark_args = conf.get('DEFAULT', args_key)
+    section = 'kernel_end' in defaults
+    kernel_end = conf.get('DEFAULT', 'kernel_end') if section else ''
+    gdb_exec = conf.get("DEFAULT", "gdbExecName")
+    kernel = conf.get('DEFAULT', 'kernel')
+    max_time_kernel, max_time_app = profiler_caller(
+        gdb_exec=gdb_exec, kernel=kernel, benchmark_binary=benchmark_binary,
+        benchmark_args=benchmark_args, device=args.device, section=section,
+        kernel_end=kernel_end)
+    print ("Time kernel= "+str(max_time_kernel)+ "Time app "+str(max_time_app))
+    # saving gold
+    print ("Saving gold")
+    gold_status = generate_gold(gdb_exec=gdb_exec, benchmark_binary=benchmark_binary,
+                                benchmark_args=benchmark_args, device=args.device)
+    if gold_status != 0:
+        raise EnvironmentError("Gold generation did not finish well, the fault injection will not work")
+    # Remove trash GDB info from the std output and the err output
+    for gold_file in (cp.GOLD_OUTPUT_PATH, cp.GOLD_ERR_PATH):
+        cf.remove_useless_information_from_output(gold_file)
+    # Save the kernel configuration txt file
+    timings = {'max_time': max_time_app, 'max_time_kernel': max_time_kernel}
+    cf.save_file(file_path=cp.KERNEL_INFO_DIR, data=timings)
+    print("1 - Profile finished\n###################################################")
+if __name__ == '__main__':
+    main()
--- a/app_profiler_old.py
+++ b/app_profiler_old.py
+#!/usr/bin/env python3
+import argparse
+import os
+import re
+import time
+import common_functions as cf
+import common_parameters as cp
+def generate_dict(sm_version, input_file_name):
+    """Collect the register count reported for each kernel of one SM version.
+
+    The input file holds the stderr captured while compiling the application
+    with the ``-Xptxas -v`` options. A "Compiling entry function" line that
+    names ``sm_version`` selects a kernel, and the next "Used N registers"
+    line supplies its register count.
+
+    :param sm_version: architecture name as quoted in the log; matched literally.
+    :param input_file_name: path of the captured compiler output.
+    :return: dict mapping kernel name to register count (int). When a kernel
+        is reported more than once, the first count is kept and a warning is
+        printed.
+    """
+    # Build both patterns once instead of once per line. Escaping keeps regex
+    # metacharacters in sm_version from being read as pattern syntax.
+    entry_re = re.compile(
+        r".*Compiling entry function.*'(\S+)'.*for.*'{}'.*".format(re.escape(sm_version)))
+    used_re = re.compile(r".*Used[ ]+(\d+)[ ]+registers.*")
+    # dictionary to store the number of allocated registers per static kernel
+    kernel_reg = {}
+    kernel_name = ""  # kernel announced by the most recent matching entry line
+    check_for_register_count = False
+    with open(input_file_name, "r") as f:
+        for line in f:
+            entry = entry_re.match(line)
+            if entry:
+                kernel_name = entry.group(1)
+                check_for_register_count = True
+            used = used_re.match(line)
+            if check_for_register_count and used:
+                if kernel_name not in kernel_reg:
+                    # associate the extracted register number with the kernel name
+                    kernel_reg[kernel_name] = int(used.group(1))
+                else:
+                    print("Warning: {} exists in the kernel_reg dictionary. "
+                          "Skipping this register count.".format(kernel_name))
+                # One register line is consumed per entry line
+                check_for_register_count = False
+    return kernel_reg
+"""
+Function that calls the profiler based on the injection mode
+"""
+def profiler_caller(gdb_exec, kernel, benchmark_binary, benchmark_args):
+    """Run the profiler script under gdb and return the mean wall-clock time.
+
+    The gdb command is executed cp.MAX_TIMES_TO_PROFILE times, and leftover
+    gdb/benchmark processes are killed after every run.
+
+    :param gdb_exec: gdb executable used to launch the profiler script.
+    :param kernel: kernel identifier; only printed by this version.
+    :param benchmark_binary: binary loaded with gdb's ``file`` command.
+    :param benchmark_args: benchmark arguments; literal "\\n" sequences and
+        backslashes are removed before use.
+    :return: mean wall-clock time of one gdb run, in seconds.
+    """
+    cleaned_args = benchmark_args.replace('\\n', '').replace('\\', '')
+    print ("KERNEL"+kernel)
+    gdb_init = '"file {}; set args {}"'.format(benchmark_binary, cleaned_args)
+    profiler_cmd = '{} -ex \'py arg0 = {}\' -n -batch -x {}'.format(
+        gdb_exec, gdb_init, cp.PROFILER_SCRIPT)
+    print ("Profiler caller")
+    if cp.DEBUG:
+        print("PROFILER CMD: {}".format(profiler_cmd))
+    wall_clock_samples = []
+    for _ in range(cp.MAX_TIMES_TO_PROFILE):
+        began = time.time()
+        os.system(profiler_cmd)
+        wall_clock_samples.append(time.time() - began)
+        gdb_name = os.path.basename(gdb_exec)
+        app_name = os.path.basename(benchmark_binary)
+        cf.kill_all("killall -9 {}; killall -9 {}".format(gdb_name, app_name))
+    return sum(wall_clock_samples) / cp.MAX_TIMES_TO_PROFILE
+"""
+Function to generate the gold execution
+"""
+def generate_gold(gdb_exec, benchmark_binary, benchmark_args):
+    """Run the application once under gdb and capture its golden output.
+
+    stdout is redirected to cp.GOLD_OUTPUT_PATH and stderr to
+    cp.GOLD_ERR_PATH. The tmp directory under cp.LOGS_PATH, next to this
+    script, is created if needed and emptied first.
+
+    :param gdb_exec: gdb executable used to launch the profiler script.
+    :param benchmark_binary: binary loaded with gdb's ``file`` command.
+    :param benchmark_args: arguments passed through gdb's ``set args``.
+    :return: the status returned by os.system for the gdb command.
+    """
+    # Create tmp path and clean it if it exists. makedirs also creates the
+    # missing parent logs directory (os.mkdir would raise there), and
+    # exist_ok avoids the check-then-create race.
+    tmp_path = os.path.dirname(os.path.realpath(__file__)) + "/" + cp.LOGS_PATH + "/tmp"
+    os.makedirs(tmp_path, exist_ok=True)
+    os.system("rm -rf " + tmp_path + "/*")
+    script = '{} -ex \'py arg0 = {}\' -n -batch -x {} > {} 2> {}'
+    init_string = '"file {}; set args {}"'.format(benchmark_binary, benchmark_args)
+    profiler_cmd = script.format(gdb_exec, init_string, cp.PROFILER_SCRIPT, cp.GOLD_OUTPUT_PATH, cp.GOLD_ERR_PATH)
+    if cp.DEBUG:
+        print("PROFILER CMD: {}".format(profiler_cmd))
+    # Execute and save gold file
+    return os.system(profiler_cmd)
+def main():
+    """Profile the configured application and store its golden output.
+
+    Reads the configuration file given with -c/--conf, times the application
+    through profiler_caller, records the golden stdout/stderr through
+    generate_gold, and saves the measured time to cp.KERNEL_INFO_DIR.
+
+    :raises EnvironmentError: when the golden run returns a non-zero status.
+    """
+    # Remove the kernel-info file left by a previous run
+    os.system("rm -f {}".format(cp.KERNEL_INFO_DIR))
+    cli = argparse.ArgumentParser()
+    cli.add_argument('-c', '--conf', dest="config_file", help='Configuration file', required=True)
+    args = cli.parse_args()
+    # Read the configuration file with data for all the apps that will be executed
+    conf = cf.load_config_file(args.config_file)
+    # First set env vars
+    cf.set_python_env()
+    # Profiler step: measures the run time and produces the golden copy
+    print("###################################################\n1 - Profiling application")
+    defaults = conf['DEFAULT']
+    # A "_noverificar" key, when present, takes precedence over the plain key
+    binary_key = 'benchmarkBinary'
+    if 'benchmarkBinary_noverificar' in defaults:
+        binary_key = 'benchmarkBinary_noverificar'
+    benchmark_binary = conf.get('DEFAULT', binary_key)
+    args_key = 'benchmarkArgs'
+    if 'benchmarkArgs_noverificar' in defaults:
+        args_key = 'benchmarkArgs_noverificar'
+    benchmark_args = conf.get('DEFAULT', args_key)
+    gdb_exec = conf.get("DEFAULT", "gdbExecName")
+    kernel = conf.get('DEFAULT', 'kernel')
+    max_time_app = profiler_caller(gdb_exec=gdb_exec, kernel=kernel,
+                                   benchmark_binary=benchmark_binary,
+                                   benchmark_args=benchmark_args)
+    # saving gold
+    print ("Saving gold")
+    gold_status = generate_gold(gdb_exec=gdb_exec, benchmark_binary=benchmark_binary,
+                                benchmark_args=benchmark_args)
+    if gold_status != 0:
+        raise EnvironmentError("Gold generation did not finish well, the fault injection will not work")
+    # Remove trash GDB info from the std output and the err output
+    for gold_file in (cp.GOLD_OUTPUT_PATH, cp.GOLD_ERR_PATH):
+        cf.remove_useless_information_from_output(gold_file)
+    # Save the kernel configuration txt file
+    cf.save_file(file_path=cp.KERNEL_INFO_DIR, data={'max_time': max_time_app})
+    print("1 - Profile finished\n###################################################")
+if __name__ == '__main__':
+    main()
--- a/codes/mmElem/matrixMul.cu
+++ b/codes/mmElem/matrixMul.cu
+/**
+ * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+/**
+ * Matrix multiplication: C = A * B.
+ * Host code.
+ *
+ * This sample implements matrix multiplication as described in Chapter 3
+ * of the programming guide.
+ * It has been written for clarity of exposition to illustrate various CUDA
+ * programming principles, not with the goal of providing the most
+ * performant generic kernel for matrix multiplication.
+ *
+ * See also:
+ * V. Volkov and J. Demmel, "Benchmarking GPUs to tune dense linear algebra,"
+ * in Proc. 2008 ACM/IEEE Conf. on Supercomputing (SC '08),
+ * Piscataway, NJ: IEEE Press, 2008, pp. Art. 31:1-11.
+ */
+// System includes
+#include <stdio.h>
+#include <assert.h>
+// CUDA runtime
+#include <cuda_runtime.h>
+// Helper functions and utilities to work with CUDA
+#include <helper_functions.h>
+#include <helper_cuda.h>
+#include <omp.h>
+#if BUILD_TIMER == 1
+static double timer;
+#endif
+/**
+ * Matrix multiplication (CUDA Kernel) on the device: C = A * B
+ * ldA, ldB and ldC are the row strides (in elements) of A, B and C;
+ * ldA is also used as the length of the dot product.
+ * Every thread computes one element of C as a dot product
+ * C[i][j] = A[i][:] * B[:][j]
+ * There is no bounds check: the launch must not create threads whose
+ * (row, column) falls outside C.
+ */
+__global__ void matrixMulCUDA(float *C, float *A, float *B, int ldA, int ldB, int ldC) {
+   // Row and column of C handled by this thread
+   const int row = blockIdx.y * blockDim.y + threadIdx.y;
+   const int col = blockIdx.x * blockDim.x + threadIdx.x;
+   const float *rowA = A + row * ldA; // First element of row `row` of A
+   float acc = 0.0f;
+   for (int k = 0; k < ldA; ++k) {
+     acc += rowA[k] * B[k * ldB + col];
+   }
+   C[row * ldC + col] = acc;
+}
+// Fill the first `size` floats of `data` with `val`; does nothing when size <= 0.
+void constantInit(float *data, int size, float val) {
+	for (int remaining = size; remaining > 0; --remaining) {
+		*data++ = val;
+	}
+}
+// Return the current wall-clock time in seconds, with microsecond resolution.
+double mysecond() {
+	struct timeval tp;
+	// The timezone argument is obsolete, so pass NULL instead of a dummy struct.
+	gettimeofday(&tp, NULL);
+	return ((double) tp.tv_sec + (double) tp.tv_usec * 1.e-6);
+}
+/**
+ * Run a simple test of matrix multiplication using CUDA
+ *
+ * A is filled with 1.0 and B with 0.01, so every element of C = A * B is
+ * expected to equal dimsA.x * 0.01. The kernel is launched once with
+ * block_size x block_size threads per block, the result is copied back and
+ * compared against that reference on the host.
+ *
+ * argc/argv are unused. dimsA/dimsB give width (x) and height (y) of A and B.
+ * The grid is dimsB.x / block_size by dimsA.y / block_size (integer division).
+ * Returns EXIT_SUCCESS when every element passes the check, EXIT_FAILURE
+ * otherwise; allocation or CUDA API failures terminate the process.
+ */
+int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA,
+		dim3 &dimsB) {
+	// Allocate host memory for matrices A and B.
+	// Byte sizes are size_t: an unsigned int wraps to 0 at 32768 x 32768 floats.
+	unsigned int size_A = dimsA.x * dimsA.y;
+	size_t mem_size_A = sizeof(float) * size_A;
+	float *h_A = (float *) malloc(mem_size_A);
+	unsigned int size_B = dimsB.x * dimsB.y;
+	size_t mem_size_B = sizeof(float) * size_B;
+	float *h_B = (float *) malloc(mem_size_B);
+	if (h_A == NULL || h_B == NULL) {
+		fprintf(stderr, "Failed to allocate host matrices A and B!\n");
+		exit (EXIT_FAILURE);
+	}
+	// Initialize host memory
+	const float valB = 0.01f;
+	constantInit(h_A, size_A, 1.0f);
+	constantInit(h_B, size_B, valB);
+	// Allocate device memory
+	float *d_A, *d_B, *d_C;
+	// Allocate host matrix C
+	dim3 dimsC(dimsB.x, dimsA.y, 1);
+	size_t mem_size_C = sizeof(float) * dimsC.x * dimsC.y;
+	float *h_C = (float *) malloc(mem_size_C);
+	if (h_C == NULL) {
+		fprintf(stderr, "Failed to allocate host matrix C!\n");
+		exit (EXIT_FAILURE);
+	}
+	cudaError_t error;
+	error = cudaMalloc((void **) &d_A, mem_size_A);
+	if (error != cudaSuccess) {
+		printf("cudaMalloc d_A returned error %s (code %d), line(%d)\n",
+				cudaGetErrorString(error), error, __LINE__);
+		exit (EXIT_FAILURE);
+	}
+	error = cudaMalloc((void **) &d_B, mem_size_B);
+	if (error != cudaSuccess) {
+		printf("cudaMalloc d_B returned error %s (code %d), line(%d)\n",
+				cudaGetErrorString(error), error, __LINE__);
+		exit (EXIT_FAILURE);
+	}
+	error = cudaMalloc((void **) &d_C, mem_size_C);
+	if (error != cudaSuccess) {
+		printf("cudaMalloc d_C returned error %s (code %d), line(%d)\n",
+				cudaGetErrorString(error), error, __LINE__);
+		exit (EXIT_FAILURE);
+	}
+	// copy host memory to device
+	error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
+	if (error != cudaSuccess) {
+		printf("cudaMemcpy (d_A,h_A) returned error %s (code %d), line(%d)\n",
+				cudaGetErrorString(error), error, __LINE__);
+		exit (EXIT_FAILURE);
+	}
+	error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
+	if (error != cudaSuccess) {
+		printf("cudaMemcpy (d_B,h_B) returned error %s (code %d), line(%d)\n",
+				cudaGetErrorString(error), error, __LINE__);
+		exit (EXIT_FAILURE);
+	}
+	// Setup execution parameters
+	dim3 threads(block_size, block_size);
+	dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);
+	// Create and start timer
+	printf("Computing result using CUDA Kernel...\n");
+	// Allocate CUDA events that we'll use for timing
+	cudaEvent_t start;
+	error = cudaEventCreate(&start);
+	if (error != cudaSuccess) {
+		fprintf(stderr, "Failed to create start event (error code %s)!\n",
+				cudaGetErrorString(error));
+		exit (EXIT_FAILURE);
+	}
+	cudaEvent_t stop;
+	error = cudaEventCreate(&stop);
+	if (error != cudaSuccess) {
+		fprintf(stderr, "Failed to create stop event (error code %s)!\n",
+				cudaGetErrorString(error));
+		exit (EXIT_FAILURE);
+	}
+	// Record the start event
+	error = cudaEventRecord(start, NULL);
+	if (error != cudaSuccess) {
+		fprintf(stderr, "Failed to record start event (error code %s)!\n",
+				cudaGetErrorString(error));
+		exit (EXIT_FAILURE);
+	}
+	// Execute the kernel
+	int nIter = 1;
+#if BUILD_TIMER == 1
+	printf("BEFORE START KERNEL %lf\n", mysecond() - timer);
+	double t1 = mysecond();
+#endif
+	for (int j = 0; j < nIter; j++) {
+		matrixMulCUDA <<<grid, threads>>>(d_C, d_A, d_B, dimsA.x, dimsB.x, dimsC.x);
+		cudaDeviceSynchronize();
+	}
+#if BUILD_TIMER == 1
+	double exec_time = mysecond() - t1;
+	printf("KERNEL EXECUTION TIME %lf\n", exec_time);
+#endif
+	// Record the stop event
+	error = cudaEventRecord(stop, NULL);
+	if (error != cudaSuccess) {
+		fprintf(stderr, "Failed to record stop event (error code %s)!\n",
+				cudaGetErrorString(error));
+		exit (EXIT_FAILURE);
+	}
+	// Wait for the stop event to complete
+	error = cudaEventSynchronize(stop);
+	if (error != cudaSuccess) {
+		fprintf(stderr,
+				"Failed to synchronize on the stop event (error code %s)!\n",
+				cudaGetErrorString(error));
+		exit (EXIT_FAILURE);
+	}
+	float msecTotal = 0.0f;
+	error = cudaEventElapsedTime(&msecTotal, start, stop);
+	if (error != cudaSuccess) {
+		fprintf(stderr,
+				"Failed to get time elapsed between events (error code %s)!\n",
+				cudaGetErrorString(error));
+		exit (EXIT_FAILURE);
+	}
+#if BUILD_TIMER == 1
+	// Compute and print the performance
+	float msecPerMatrixMul = msecTotal / nIter;
+	double flopsPerMatrixMul = 2.0 * (double) dimsA.x * (double) dimsA.y
+			* (double) dimsB.x;
+	double gigaFlops = (flopsPerMatrixMul * 1.0e-9f)
+			/ (msecPerMatrixMul / 1000.0f);
+	printf(
+			"Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops, WorkgroupSize= %u threads/block\n",
+			gigaFlops, msecPerMatrixMul, flopsPerMatrixMul,
+			threads.x * threads.y);
+#endif
+	// Copy result from device to host
+	error = cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost);
+	if (error != cudaSuccess) {
+		printf("cudaMemcpy (h_C,d_C) returned error %s (code %d), line(%d)\n",
+				cudaGetErrorString(error), error, __LINE__);
+		exit (EXIT_FAILURE);
+	}
+	printf("Checking computed result for correctness: ");
+	bool correct = true;
+	// test relative error by the formula
+	//     |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|>  < eps
+	double eps = 1.e-6; // machine zero
+#if BUILD_TIMER == 1
+	t1 = mysecond();
+#endif
+#pragma omp parallel for shared(h_C, correct)
+	for (int i = 0; i < (int) (dimsC.x * dimsC.y); i++) {
+		float abs_err = fabs(h_C[i] - float(dimsA.x * valB));
+		float dot_length = dimsA.x;
+		float abs_val = fabs(h_C[i]);
+		float rel_err = abs_err / abs_val / dot_length;
+		// Written as !(<=) so that a NaN result is reported as an error;
+		// "rel_err > eps" is false for NaN and would let it pass.
+		if (!(rel_err <= eps)) {
+			printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i,
+					h_C[i], dimsA.x * valB, eps);
+#pragma omp critical
+			{
+				correct = false;
+			}
+		}
+	}
+#if BUILD_TIMER == 1
+	exec_time = mysecond() - t1;
+	printf("CMP TIME %lf\n", exec_time);
+#endif
+	printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
+	// Clean up memory
+	free(h_A);
+	free(h_B);
+	free(h_C);
+	cudaFree(d_A);
+	cudaFree(d_B);
+	cudaFree(d_C);
+	// Release the timing events created above
+	cudaEventDestroy(start);
+	cudaEventDestroy(stop);
+	printf(
+			"\nNOTE: The CUDA Samples are not meant for performance measurements. "
+					"Results may vary when GPU Boost is enabled.\n");
+	if (correct) {
+		return EXIT_SUCCESS;
+	} else {
+		return EXIT_FAILURE;
+	}
+}
+/**
+ * Program main
+ *
+ * Parses -device, -wA, -hA, -wB and -hB, selects the GPU, prints its
+ * properties, checks that A's width equals B's height, and exits with the
+ * status returned by matrixMultiply().
+ */
+int main(int argc, char **argv) {
+#if BUILD_TIMER == 1
+	timer = mysecond();
+#endif
+	printf("[Matrix Multiply Using CUDA] - Starting...\n");
+	if (checkCmdLineFlag(argc, (const char **) argv, "help")
+			|| checkCmdLineFlag(argc, (const char **) argv, "?")) {
+		printf("Usage -device=n (n >= 0 for deviceID)\n");
+		printf("      -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
+		printf("      -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
+		printf(
+				"  Note: Outer matrix dimensions of A & B matrices must be equal.\n");
+		exit (EXIT_SUCCESS);
+	}
+	// By default, we use device 0, otherwise we override the device ID based on what is provided at the command line
+	int devID = 0;
+	cudaError_t error;
+	if (checkCmdLineFlag(argc, (const char **) argv, "device")) {
+		devID = getCmdLineArgumentInt(argc, (const char **) argv, "device");
+		error = cudaSetDevice(devID);
+		if (error != cudaSuccess) {
+			printf("cudaSetDevice returned error %s (code %d), line(%d)\n",
+					cudaGetErrorString(error), error, __LINE__);
+			exit (EXIT_FAILURE);
+		}
+	}
+	cudaDeviceProp deviceProp;
+	error = cudaGetDevice(&devID);
+	if (error != cudaSuccess) {
+		printf("cudaGetDevice returned error %s (code %d), line(%d)\n",
+				cudaGetErrorString(error), error, __LINE__);
+		exit (EXIT_FAILURE);
+	}
+	error = cudaGetDeviceProperties(&deviceProp, devID);
+	// Check the call before reading deviceProp: on failure its fields are not
+	// valid, and the block size below would be derived from garbage.
+	if (error != cudaSuccess) {
+		printf(
+				"cudaGetDeviceProperties returned error %s (code %d), line(%d)\n",
+				cudaGetErrorString(error), error, __LINE__);
+		exit (EXIT_FAILURE);
+	}
+	if (deviceProp.computeMode == cudaComputeModeProhibited) {
+		fprintf(stderr,
+				"Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n");
+		exit (EXIT_SUCCESS);
+	}
+	printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID,
+			deviceProp.name, deviceProp.major, deviceProp.minor);
+	// Use a larger block size for Fermi and above
+	int block_size = (deviceProp.major < 2) ? 16 : 32;
+	dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1);
+	dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1);
+	// width of Matrix A
+	if (checkCmdLineFlag(argc, (const char **) argv, "wA")) {
+		dimsA.x = getCmdLineArgumentInt(argc, (const char **) argv, "wA");
+	}
+	// height of Matrix A
+	if (checkCmdLineFlag(argc, (const char **) argv, "hA")) {
+		dimsA.y = getCmdLineArgumentInt(argc, (const char **) argv, "hA");
+	}
+	// width of Matrix B
+	if (checkCmdLineFlag(argc, (const char **) argv, "wB")) {
+		dimsB.x = getCmdLineArgumentInt(argc, (const char **) argv, "wB");
+	}
+	// height of Matrix B
+	if (checkCmdLineFlag(argc, (const char **) argv, "hB")) {
+		dimsB.y = getCmdLineArgumentInt(argc, (const char **) argv, "hB");
+	}
+	if (dimsA.x != dimsB.y) {
+		printf("Error: outer matrix dimensions must be equal. (%d != %d)\n",
+				dimsA.x, dimsB.y);
+		exit (EXIT_FAILURE);
+	}
+	printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x,
+			dimsB.y);
+	int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB);
+	exit(matrix_result);
+}
--- a/codes/mmElem/matrixMul.cudafe1.c
+++ b/codes/mmElem/matrixMul.cudafe1.c
+# 1 "matrixMul.cu"
+# 139 "/usr/include/stdio.h" 3
+extern FILE *stderr;
+# 74 "/usr/include/c++/9/iostream" 3
+static struct _ZNSt8ios_base4InitE _ZN39_INTERNAL_17_matrixMul_cpp1_ii_9deaad98St8__ioinitE __attribute__((visibility("default"))) = {};
+extern void *__dso_handle __attribute__((visibility("hidden")));
--- a/codes/mmElem/matrixMul.cudafe1.stub.c
+++ b/codes/mmElem/matrixMul.cudafe1.stub.c
+/* NOTE(review): this looks like a compiler-generated host stub for matrixMul.cu
+ * (file name, "# N" line markers, __cuda* helper calls) -- confirm before
+ * hand-editing; a rebuild would presumably overwrite any change. */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-function"
+#pragma GCC diagnostic ignored "-Wcast-qual"
+#define __NV_CUBIN_HANDLE_STORAGE__ static
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#endif
+#include "crt/host_runtime.h"
+#include "matrixMul.fatbin.c"
+extern void __device_stub__Z13matrixMulCUDAPfS_S_iii(float *, float *, float *, int, int, int);
+static void __nv_cudaEntityRegisterCallback(void **);
+static void __sti____cudaRegisterAll(void) __attribute__((__constructor__));
+/* Launch stub: sets up the six kernel arguments at offsets 0, 8, 16, 24, 28 and 32, then calls __cudaLaunch. */
+void __device_stub__Z13matrixMulCUDAPfS_S_iii(float *__par0, float *__par1, float *__par2, int __par3, int __par4, int __par5){__cudaLaunchPrologue(6);__cudaSetupArgSimple(__par0, 0UL);__cudaSetupArgSimple(__par1, 8UL);__cudaSetupArgSimple(__par2, 16UL);__cudaSetupArgSimple(__par3, 24UL);__cudaSetupArgSimple(__par4, 28UL);__cudaSetupArgSimple(__par5, 32UL);__cudaLaunch(((char *)((void ( *)(float *, float *, float *, int, int, int))matrixMulCUDA)));}
+# 50 "matrixMul.cu"
+/* Host-side matrixMulCUDA: forwards its arguments unchanged to the launch stub. */
+void matrixMulCUDA( float *__cuda_0,float *__cuda_1,float *__cuda_2,int __cuda_3,int __cuda_4,int __cuda_5)
+# 50 "matrixMul.cu"
+{__device_stub__Z13matrixMulCUDAPfS_S_iii( __cuda_0,__cuda_1,__cuda_2,__cuda_3,__cuda_4,__cuda_5);
+# 66 "matrixMul.cu"
+}
+# 1 "matrixMul.cudafe1.stub.c"
+/* Registers the host matrixMulCUDA entry under its mangled name _Z13matrixMulCUDAPfS_S_iii. */
+static void __nv_cudaEntityRegisterCallback( void **__T3) {  __nv_dummy_param_ref(__T3); __nv_save_fatbinhandle_for_managed_rt(__T3); __cudaRegisterEntry(__T3, ((void ( *)(float *, float *, float *, int, int, int))matrixMulCUDA), _Z13matrixMulCUDAPfS_S_iii, (-1)); }
+/* Declared above with the constructor attribute; passes the callback to __cudaRegisterBinary. */
+static void __sti____cudaRegisterAll(void) {  __cudaRegisterBinary(__nv_cudaEntityRegisterCallback);  }
+#pragma GCC diagnostic pop
--- a/codes/mmElem/matrixMul.fatbin.c
+++ b/codes/mmElem/matrixMul.fatbin.c
--- a/codes/mmElem/matrixMul_dlink.fatbin.c
+++ b/codes/mmElem/matrixMul_dlink.fatbin.c
+#ifndef __SKIP_INTERNAL_FATBINARY_HEADERS
+#include "fatbinary_section.h"
+#endif
+#define __CUDAFATBINSECTION  ".nvFatBinSegment"
+#define __CUDAFATBINDATASECTION  ".nv_fatbin"
+asm(
+".section .nv_fatbin, \"a\"\n"
+".align 8\n"
+"fatbinData:\n"
+".quad 0x00100001ba55ed50,0x0000000000000110,0x0000005001010002,0x00000000000000c0\n"
+".quad 0x00000000000000be,0x0000004600010007,0x0000000c00000040,0x0000000000002013\n"
+".quad 0x0000000000000000,0x0000000000000268,0x754d78697274616d,0x00000000206f2e6c\n"
+".quad 0x010102464c457fa2,0x0002660001000733,0xc0230001006e00be,0xf500010012000801\n"
+".quad 0x380040004605460d,0x0100040040000300,0x72747368732e0000,0x2700082e00626174\n"
+".quad 0x735f00ff00086d79,0x766e2e0078646e68,0x2100326f666e692e,0x2e00df004800010f\n"
+".quad 0x0100402200010003,0x0108003000322e00,0x722f0400400b1f00,0x0174131113004000\n"
+".quad 0x000100a82200010e,0x2a00240600061811,0x0000065700180008,0x0500480f01a80500\n"
+".quad 0x003801130040a81b,0x2f0038081500010f,0x0008801700010006,0x0000000000000000\n"
+".text\n");
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern const unsigned long long fatbinData[36];
+#ifdef __cplusplus
+}
+#endif
+#ifdef __cplusplus
+extern "C" {
+#endif
+static const __fatBinC_Wrapper_t __fatDeviceText __attribute__ ((aligned (8))) __attribute__ ((section (__CUDAFATBINSECTION)))= 
+	{ 0x466243b1, 2, fatbinData, (void**)__cudaPrelinkedFatbins };
+#ifdef __cplusplus
+}
+#endif
--- a/codes/mmElem/matrixMul_dlink.reg.c
+++ b/codes/mmElem/matrixMul_dlink.reg.c
+#define NUM_PRELINKED_OBJECTS 0
--- a/codes/mmElem/matrixmul.conf
+++ b/codes/mmElem/matrixmul.conf
+[DEFAULT]
+debug =True 
+# Name of the gdb executable
+gdbExecName = /usr/local/cuda-10.1.243/bin/cuda-gdb 
+# Which fault model to use, 0 -> single; 1 -> double;
+# 2 -> random; 3 -> zeros; 4 -> least 16 significant bits (LSB);
+# 5 -> least 8 significant bits (LSB)
+# If you want multiple fault models, place them separated by ','
+# faultModel = 0,2,3
+faultModel = 0
+# Injection site
+# Can be:
+# RF -> Register File
+# INST_OUT -> Instruction Output (NOT IMPLEMENTED YET)
+# INST_composed -> Instruction Address (NOT IMPLEMENTED YET)
+injectionSite = RF
+# Max time factor to finish the app; it is multiplied by the application's running time.
+# For example, if your app takes 2s and maxWaitTimes is 5, a run that lasts longer than
+# 10s is considered a crash
+maxWaitTimes = 5
+# binary file of the application
+# Must be full path
+benchmarkBinary = /home/badia/carol-fi-carol-fi_cuda-parallel/codes/mmElem/matrixMul
+# Commands to set the session inside GDB environment
+benchmarkArgs =  -device=1 -wA=16384 -hA=16384 -hB=16384 -wB=16384
+# CSV output file. It will be overwritten at each injection
+csvFile = codes/mmElem/fi_matrix_mul_single_bit.csv
+# You should create a script on the benchmark source folder to verify GOLD_OUTPUT x INJ_OUTPUT
+goldenCheckScript = codes/mmElem/sdc_check.sh
+# Number of signals that will be sent to the application
+seqSignals = 20
+# Initial sleep time in seconds before start sending signals
+# Generally the memory setup time
+initSleep = 2.3
--- a/codes/mmElem/matrixmul_16K.conf
+++ b/codes/mmElem/matrixmul_16K.conf
+[DEFAULT]
+debug =True 
+# Name of the gdb executable
+gdbExecName = /usr/local/cuda-10.1.243/bin/cuda-gdb 
+# Which fault model to use, 0 -> single; 1 -> double;
+# 2 -> random; 3 -> zeros; 4 -> least 16 significant bits (LSB);
+# 5 -> least 8 significant bits (LSB)
+# If you want multiple fault models, place them separated by ','
+# faultModel = 0,2,3
+faultModel = 0
+# Injection site
+# Can be:
+# RF -> Register File
+# INST_OUT -> Instruction Output (NOT IMPLEMENTED YET)
+# INST_composed -> Instruction Address (NOT IMPLEMENTED YET)
+injectionSite = RF, INST_OUT
+# Max time factor to finish the app; it is multiplied by the application's running time.
+# For example, if your app takes 2s and maxWaitTimes is 5, a run that lasts longer than
+# 10s is considered a crash
+maxWaitTimes = 5
+# binary file of the application
+# Must be full path
+benchmarkBinary = /home/badia/mycarol-fi/codes/mmElem/matrixMul
+# Commands to set the session inside GDB environment
+benchmarkArgs =   -wA=16384 -hA=16384 -hB=16384 -wB=16384
+# CSV output file. It will be overwritten at each injection
+csvFile = codes/matrixMul/fi_matrix_mul_single_bit.csv
+# You should create a script on the benchmark source folder to verify GOLD_OUTPUT x INJ_OUTPUT
+goldenCheckScript = codes/matrixMul/sdc_check.sh
+# Number of signals that will be sent to the application
+seqSignals = 20
+# Initial sleep time in seconds before start sending signals
+# Generally the memory setup time
+initSleep = 2.1
+#kernel = matrixMulCUDA
+kernel = matrixMul.cu:208
+kernel_end = matrixMul.cu:216
--- a/codes/mmElem/matrixmul_24K.conf
+++ b/codes/mmElem/matrixmul_24K.conf
+[DEFAULT]
+debug =True 
+# Name of the gdb executable
+gdbExecName = /usr/local/cuda-10.1.243/bin/cuda-gdb 
+# Which fault model to use, 0 -> single; 1 -> double;
+# 2 -> random; 3 -> zeros; 4 -> least 16 significant bits (LSB);
+# 5 -> least 8 significant bits (LSB)
+# If you want multiple fault models, place them separated by ','
+# faultModel = 0,2,3
+faultModel = 0
+# Injection site
+# Can be:
+# RF -> Register File
+# INST_OUT -> Instruction Output (NOT IMPLEMENTED YET)
+# INST_composed -> Instruction Address (NOT IMPLEMENTED YET)
+injectionSite = RF
+# Max time factor to finish the app; it is multiplied by the application's running time.
+# For example, if your app takes 2s and maxWaitTimes is 5, a run that lasts longer than
+# 10s is considered a crash
+maxWaitTimes = 5
+# binary file of the application
+# Must be full path
+benchmarkBinary = /home/badia/carol-fi-carol-fi_cuda-parallel/codes/matrixMul/matrixMul
+# Commands to set the session inside GDB environment
+#benchmarkArgs =  -device=1 -wA=16384 -hA=16384 -hB=16384 -wB=16384
+benchmarkArgs =  -device=1 -wA=24576	 -hA=24576 -hB=24576 -wB=24576
+# CSV output file. It will be overwritten at each injection
+csvFile = codes/matrixMul/fi_matrix_mul_single_bit.csv
+# You should create a script on the benchmark source folder to verify GOLD_OUTPUT x INJ_OUTPUT
+goldenCheckScript = codes/matrixMul/sdc_check.sh
+# Number of signals that will be sent to the application
+seqSignals = 20
+# Initial sleep time in seconds before start sending signals
+# Generally the memory setup time
+initSleep = 4.9
--- a/codes/mmElem/sdc_check.sh
+++ b/codes/mmElem/sdc_check.sh
+#!/usr/bin/sh
+# SDC checking diff
+# Every output comparison for this benchmark must be made in this script.
+# To be considered as an SDC or CRASH the
+# DIFF_LOG and DIFF_ERR_LOG files must not be empty
+# INJ_OUTPUT_PATH, INJ_ERR_PATH, GOLD_OUTPUT_PATH, GOLD_ERR_PATH
+# are environment variables defined by the fault_injector.py
+# All paths are quoted so that directories containing spaces do not split into several arguments.
+# diff stdout (truncates DIFF_LOG, then stores the differences)
+diff -B "${INJ_OUTPUT_PATH}" "${GOLD_OUTPUT_PATH}" > "${DIFF_LOG}"
+# Special comparison like the following one can be done in this script.
+# grep must NOT run with -q here: -q suppresses the matching lines, so nothing
+# would ever be appended to DIFF_LOG and a "Result = FAIL" run would go unnoticed.
+grep "Result = FAIL" "${INJ_OUTPUT_PATH}" >> "${DIFF_LOG}"
+# diff stderr
+diff -B "${INJ_ERR_PATH}" "${GOLD_ERR_PATH}" > "${DIFF_ERR_LOG}"
+# Must exit 0
+exit 0
\ No newline at end of file
--- a/common_functions.py
+++ b/common_functions.py
+import os
+import pickle
+import re
+import sys
+import common_parameters as cp
+if sys.version_info >= (3, 0):
+    import configparser  # python 3
+else:
+    import ConfigParser  # python 2
+"""
+Support function to execute a command
+and return the output.
+If the command contains NEWLINE character
+it will result in a list.
+"""
+def execute_command(gdb, to_execute):
+    """Run `to_execute` through `gdb.execute` and return the captured text split into lines.
+
+    gdb: object exposing execute(command, to_string=...), e.g. the GDB python module.
+    to_execute: command string handed to gdb.execute unchanged.
+    Returns a list with one entry per output line (empty list for empty output).
+    """
+    captured_text = gdb.execute(to_execute, to_string=True)
+    output_lines = captured_text.splitlines()
+    return output_lines
+"""
+Serialize a dictionary into a
+file path using pickle.
+"""
+def save_file(file_path, data):
+    """Pickle `data` into `file_path`, replacing whatever the file held before."""
+    # The with-statement closes the handle on exit, so no explicit close() is needed
+    with open(file_path, "wb") as pickle_sink:
+        pickle.dump(data, pickle_sink)
+"""
+Append a pickled dictionary to the
+end of a file path.
+"""
+def append_file(file_path, data):
+    """Pickle `data` and append it to the end of `file_path`; earlier content is kept."""
+    # The with-statement closes the handle on exit, so no explicit close() is needed
+    with open(file_path, "ab") as pickle_sink:
+        pickle.dump(data, pickle_sink)
+"""
+Load a dictionary from a file path using pickle.
+return a dictionary
+"""
+def load_file(file_path):
+    """Return the first object unpickled from `file_path`.
+
+    Only one pickle.load() is performed, so only the first record of the file is returned.
+    NOTE(review): pickle must only be used on trusted files - confirm the inputs are tool-generated.
+    """
+    with open(file_path, "rb") as pickle_source:
+        return pickle.load(pickle_source)
+"""
+Read configuration file
+"""
+def load_config_file(flip_config_file):
+    """Parse `flip_config_file` and return the populated ConfigParser object."""
+    # Pick whichever parser module was imported for the running interpreter;
+    # the conditional expression only evaluates the name that actually exists
+    parser_module = configparser if sys.version_info >= (3, 0) else ConfigParser
+    conf = parser_module.ConfigParser()
+    conf.read(flip_config_file)
+    return conf
+"""
+Kill all remaining processes
+"""
+def kill_all(kill_string, logging=None):
+    """Run every ';'-separated command of `kill_string` through os.system, discarding its output.
+
+    logging: optional object with a debug() method; when given, each command is reported to it.
+    """
+    for single_command in kill_string.split(";"):
+        os.system("{} > /dev/null 2>&1".format(single_command))
+        if not logging:
+            continue
+        logging.debug("kill cmd: {}".format(single_command))
+"""
+GDB python cannot find common_functions.py, so I added this directory to PYTHONPATH
+"""
+def set_python_env():
+    """Expose this directory and its classes/ subdirectory through PYTHONPATH.
+
+    Also forces OMP_NUM_THREADS to '1'.
+    Returns the directory that contains this file.
+    """
+    current_path = os.path.dirname(os.path.realpath(__file__))
+    search_dirs = [current_path, current_path + "/classes"]
+    # os.environ performs no shell expansion, so writing the literal "$PYTHONPATH"
+    # would drop the real value; read the existing variable and keep it in front
+    previous_value = os.environ.get('PYTHONPATH')
+    if previous_value:
+        search_dirs.insert(0, previous_value)
+    os.environ['PYTHONPATH'] = os.pathsep.join(search_dirs)
+    os.environ['OMP_NUM_THREADS'] = '1'
+    return current_path
+"""
+Remove all useless information produced by CUDA-GDB on the output files
+before they got to the SDC check script
+"""
+def remove_useless_information_from_output(output_file_path):
+    """Rewrite `output_file_path` in place without the lines GDB added.
+
+    A line is dropped when any regex of cp.POSSIBLE_USELESS_GDB_OUTPUT_PATTERNS
+    is found in it; all other lines are written back in their original order.
+    """
+    with open(output_file_path, 'r') as ifp:
+        kept_lines = [
+            text for text in ifp.readlines()
+            if not any(re.search(pattern=noise, string=text)
+                       for noise in cp.POSSIBLE_USELESS_GDB_OUTPUT_PATTERNS)
+        ]
+    # Overwrite the output file with the surviving lines only
+    with open(output_file_path, 'w') as ofp:
+        ofp.writelines(kept_lines)
+"""
+Show output function
+to allow pretty printing
+"""
+def printf(*args):
+    """Print every argument followed by a single space, all on one line."""
+    pieces = ["{0} ".format(item) for item in args]
+    print("".join(pieces))
--- a/common_parameters.py
+++ b/common_parameters.py
+# Max size of register
+SINGLE_MAX_SIZE_REGISTER = 32
+# Max number of executions used to profile the application
+MAX_TIMES_TO_PROFILE = 2
+# Directory that stores all injections info
+LOGS_PATH = 'logs'
+# Temporary file to store kernel information
+KERNEL_INFO_DIR = '%s/tmp/carol-fi-kernel-info.txt' % LOGS_PATH
+# Golden run outputs (stderr / stdout)
+GOLD_ERR_PATH = '%s/tmp/carol_fi_golden_bench_err.txt' % LOGS_PATH
+GOLD_OUTPUT_PATH = '%s/tmp/carol_fi_golden_bench_output.txt' % LOGS_PATH
+# Injected run outputs that will be compared to the golden ones ('{}' is left for a later format())
+INJ_OUTPUT_PATH = '%s/tmp/carol_fi_inj_bench_output_{}.txt' % LOGS_PATH
+INJ_ERR_PATH = '%s/tmp/carol_fi_inj_bench_err_{}.txt' % LOGS_PATH
+# Internal python scripts
+FLIP_SCRIPT = 'flip_value.py'
+PROFILER_SCRIPT = 'profiler_new.py'
+# Temporary difference logs
+DIFF_LOG = '%s/tmp/diff_{}.log' % LOGS_PATH
+DIFF_ERR_LOG = '%s/tmp/diff_err_{}.log' % LOGS_PATH
+# Debug flags: fault injection process and profiler process
+DEBUG = True
+DEBUG_PROFILER = True
+# Log file for SignalApp thread
+SIGNAL_APP_LOG = '%s/tmp/signal_app_thread_{}.txt' % LOGS_PATH
+# Num of sleep time divisor
+NUM_DIVISION_TIMES = 100.0
+# Common body of log filename
+LOG_DEFAULT_NAME = '%s/tmp/carolfi-flipvalue-{}.log' % LOGS_PATH
+# MAX INT 32 bits
+MAX_INT_32 = 0xFFFFFFFF
+# Most of the benchmarks we cannot wait until the end of the processing
+# Considering most of 90% of the time
+MAX_SIGNAL_BEFORE_ENDING = 0.9
+# termination, program, alarm, asynchronous, job, operation error, miscellaneous, signal interruption
+# 'SIGINT' must not be here, since it is used to send an interruption to the app
+SIGNALS = (
+    ['SIGKILL', 'SIGTERM', 'SIGQUIT', 'SIGHUP']  # termination codes
+    + ['SIGFPE', 'SIGILL', 'SIGSEGV', 'SIGBUS', 'SIGABRT',
+       'SIGIOT', 'SIGTRAP', 'SIGEMT', 'SIGSYS']  # program codes
+    + ['SIGALRM', 'SIGVTALRM', 'SIGPROF']  # alarm codes
+    + ['SIGIO', 'SIGURG', 'SIGPOLL']  # asynchronous codes
+    + ['SIGCHLD', 'SIGCLD', 'SIGCONT', 'SIGSTOP', 'SIGTSTP', 'SIGTTIN', 'SIGTTOU']  # job control
+    + ['SIGPIPE', 'SIGLOST', 'SIGXCPU', 'SIGXFSZ']  # operation codes
+    + ['SIGUSR1', 'SIGUSR2', 'SIGWINCH', 'SIGINFO']  # miscellaneous codes
+    + ['strsignal', 'psignal']  # signal messages
+    # cuda signals: CUDA_EXCEPTION_0 ... CUDA_EXCEPTION_15
+    + list(map('CUDA_EXCEPTION_{}'.format, range(16)))
+)
+# All trash produced by GDB must be add here in this list
+# Using the Regular Expression format (python re)
+# Each entry is a python `re` pattern; a line matching any of them is treated as GDB noise
+POSSIBLE_USELESS_GDB_OUTPUT_PATTERNS = [
+        r'.*Thread.*received signal SIGINT, Interrupt.*',  # Thread SIGINT message
+        r'.*New Thread.*',  # New GDB Thread creation
+        r'.*Thread debugging using.*enabled.*',  # Lib thread enabled
+        r'.*Using host.*library.*',  # Using host library
+        r'.*Switching focus to CUDA kernel.*',  # Switching focus to CUDA kernel message
+        r'.*0x.*in.*<<<.*>>>.*',  # Kernel interruption message
+        r'.*Inferior.*\(process.*\) exited normally.*',  # GDB exited normally message
+        r'.*Thread 0x.*exited.*',  # Thread exited
+        # The parentheses must be escaped: a bare "()" is an empty group, which made this
+        # pattern require two spaces before "from" and never match "cuXxx () from ..."
+        r'.*0x.* in cu.* \(\) from /usr/lib/.*libcuda.*',  # Cuda lib calls
+        r'.*0x.*in.*\[clone.*\].*\(\).*',  # OMP calls
+        r'.*0x.*in.*',  # General API call
+        r'.*Inferior.*\(process.*\).*',  # General inferior process
+    ]
+# Injection sites and the names used to select them
+RF, INST_OUT, INST_ADD = 0, 1, 2
+INJECTION_SITES = dict(RF=RF, INST_OUT=INST_OUT, INST_ADD=INST_ADD)
+# Which fault model to use, 0 -> single; 1 -> double;
+# 2 -> random; 3 -> zeros; 4 -> least 16 significant bits (LSB);
+# 5 -> least 8 significant bits (LSB)
+(FLIP_SINGLE_BIT, FLIP_TWO_BITS, RANDOM_VALUE,
+ ZERO_VALUE, LEAST_16_BITS, LEAST_8_BITS) = 0, 1, 2, 3, 4, 5
+# Focus error string
+FOCUS_ERROR_STRING = "Focus not set on any active CUDA kernel."
--- a/fault_injector.py
+++ b/fault_injector.py
--- a/flip_value.py
+++ b/flip_value.py
+import os
+import gdb
+import time
+from classes.BitFlip import BitFlip
+from classes.Logging import Logging
+import common_parameters as cp
+"""
+Handler attached to exit event
+"""
+def exit_handler(event):
+    """Log the exit of the debugged program and send USR2 to the process whose id is in `pid`.
+
+    `global_logging` and `pid` are read from module scope.
+    event: the object GDB passes to the `exited` handlers; its exit_code is logged
+    when it can be read, otherwise the failure is logged instead.
+    """
+    logger = global_logging
+    logger.info(str("event type: exit"))
+    print ("llego el final")
+    os.system ("kill -s USR2 {}".format(str(pid)))
+    try:
+        exit_message = "exit code: {}".format(str(event.exit_code))
+        logger.info(exit_message)
+    except Exception as err:
+        logger.exception("ERROR: {}".format(str(err)))
+"""
+Handler that will put a breakpoint on the kernel after
+signal
+"""
+def set_event(event):
+    """React to a GDB stop event.
+
+    On a breakpoint event: log the time elapsed since `t`, send USR1 to the process
+    whose id is in `pid`, disable the breakpoint `bp` and continue the program.
+    On any other stop event: perform the bit flip once, if it was not injected yet.
+    `global_logging`, `bit_flip`, `bp`, `t` and `pid` are read from module scope.
+    """
+    # Accessing global vars
+    global global_logging, was_hit, bit_flip, bp, t
+    if isinstance(event, gdb.BreakpointEvent):
+        # time.clock() was removed in Python 3.8; process_time() is its CPU-time
+        # replacement, and time.clock is only looked up on interpreters lacking it
+        cpu_clock = getattr(time, "process_time", None) or time.clock
+        global_logging.info("Before breakpoint" + str(cpu_clock() - t))
+        global_logging.info("Enviado senal a " + str(pid))
+        os.system("kill -s USR1 " + str(pid))
+        bp.enabled = False
+        gdb.execute('c')
+    else:
+        try:
+            # Inject only if no fault was injected before
+            if bit_flip.fault_injected is False:
+                bit_flip.single_event()
+                global_logging.info("BIT FLIP SET ON SIGNAL {}".format(event.stop_signal))
+        except Exception as err:
+            global_logging.exception("EVENT DIFFERENT FROM STOP SIGNAL: {}".format(str(err)))
+"""
+Main function
+"""
+def main():
+    """Configure GDB, arm the injection handlers and run the benchmark under GDB.
+
+    The parameters arrive in the global `arg0` as one '|'-separated string holding:
+    kernel breakpoint location, pid to signal, bits to flip, fault model,
+    log file path, ';'-separated GDB setup commands and injection site name.
+    """
+    global global_logging, register, injection_site, bits_to_flip, fault_model, was_hit, bit_flip, arg0
+    # pid, bp and t are read by exit_handler/set_event, so they have to be module
+    # globals; as plain locals of main() the handlers would fail with NameError
+    global pid, bp, t
+    was_hit = False
+    # time.clock() was removed in Python 3.8; process_time() is its CPU-time
+    # replacement, and time.clock is only looked up on interpreters lacking it
+    cpu_clock = getattr(time, "process_time", None) or time.clock
+    # Initialize GDB to run the app
+    gdb.execute("set confirm off")
+    gdb.execute("set pagination off")
+    gdb.execute("set target-async off")
+    gdb.execute("set non-stop off")
+    # Connecting to a exit handler event
+    gdb.events.exited.connect(exit_handler)
+    # Connecting to a stop signal event
+    gdb.events.stop.connect(set_event)
+    # Parse the parameter string
+    [kernel, pid, bits_to_flip, fault_model, flip_log_file,
+     gdb_init_strings, injection_site] = arg0.split('|')
+    # Logging
+    global_logging = Logging(log_file=flip_log_file)
+    global_logging.info("Starting flip_value script "+" called by " + str(pid) + " for stop kernel " + str(kernel))
+    try:
+        for init_str in gdb_init_strings.split(";"):
+            gdb.execute(init_str)
+            global_logging.info("initializing setup: " + str(init_str))
+    except gdb.error as err:
+        global_logging.exception("ERROR on initializing setup: {}".format(str(err)))
+    # Set Breakpoint attributes to be use
+    bits_to_flip = bits_to_flip.split(",")
+    fault_model = int(fault_model)
+    bit_flip = BitFlip(bits_to_flip=bits_to_flip, fault_model=fault_model,
+                       logging=global_logging, injection_site=cp.INJECTION_SITES[injection_site])
+    # Start app execution
+    t = cpu_clock()
+    bp = gdb.Breakpoint(kernel)
+    global_logging.info("Put Break " + str(cpu_clock() - t))
+    gdb.execute("r")
+    i = 0
+    try:
+        while 'The program' not in gdb.execute('c', to_string=True):
+            i += 1
+    except Exception as err:
+        global_logging.info("CONTINUED {} times".format(i))
+        err_str = str(err).rstrip()
+        global_logging.exception("IGNORED CONTINUE ERROR: {}".format(err_str))
+        # Make sure that it is going to finish
+        if 'Failed' in err_str:
+            # Log before quitting so the message is written before the quit command runs
+            global_logging.exception("QUIT REQUIRED")
+            gdb.execute('quit')
+# Module-level state shared between main() and the event handlers
+global_logging = None
+register = None
+bits_to_flip = None
+fault_model = None
+was_hit = False
+injection_site = None
+bit_flip = None
+pid = None
+bp = None
+t = None
+# Call main execution
+main()
--- a/process_start.py
+++ b/process_start.py
+from subprocess import Popen
+from sys import argv
+with open(argv[1], "w") as fp:
+    fp.write(str(Popen(argv[2]).pid))
--- a/profiler.py
+++ b/profiler.py
+import gdb
+"""
+Main function
+"""
+# Initialize GDB to run the app
+gdb.execute("set confirm off")
+gdb.execute("set pagination off")
+gdb.execute("set target-async off")
+gdb.execute("set non-stop off")
+# arg0 carries the ';'-separated GDB setup commands; run them one by one, in order
+gdb_init_strings = arg0
+setup_commands = gdb_init_strings.split(";")
+for init_str in setup_commands:
+    gdb.execute(init_str)
+# Start the program under GDB
+gdb.execute("r")
--- a/profiler_new.py
+++ b/profiler_new.py
+import gdb
+import time
+def exit_handler(event):
+    """Clear the global `nosalir` flag and print the exit code carried by `event`.
+
+    If reading or printing the exit code raises, an "ERROR: ..." line is printed instead.
+    """
+    global nosalir
+    nosalir = False
+    print("event type: exit")
+    try:
+        print("exit code: " + str(event.exit_code))
+    except Exception as err:
+        print("ERROR: {}".format(str(err)))
+"""
+Handler that will put a breakpoint on the kernel after
+signal
+"""
+def set_event(event):
+    """Measure the time between alternating breakpoint stops.
+
+    Breakpoint events alternate between two roles through the global `primera`:
+    the first one stores the start time in `t` and counts one occurrence, the next
+    one stores the elapsed time in `trun`. Any other stop event also updates `trun`.
+    """
+    global trun, ocurrencias, t, primera
+    # time.clock() was removed in Python 3.8; process_time() is its CPU-time
+    # replacement, and time.clock is only looked up on interpreters lacking it
+    cpu_clock = getattr(time, "process_time", None) or time.clock
+    print ("Es mi primera vez" + str(primera) + "  " + str(ocurrencias))
+    if isinstance(event, gdb.BreakpointEvent):
+        if primera:
+            t = cpu_clock()
+            ocurrencias = ocurrencias + 1
+        else:
+            trun = cpu_clock() - t
+        primera = not primera
+    else:
+        trun = cpu_clock() - t
+"""
+Main function
+"""
+def main():
+    """Profile the program under GDB and write the result to tmpxxx_return_profiler.conf.
+
+    The global `arg0` holds "<section>;<kernel_end>;<gdb setup commands separated by ';'>".
+    When <section> is the text "True", a breakpoint is also placed on <kernel_end>.
+    The output file stores the occurrence count and the measured time in a [DEFAULT] section.
+    """
+    global ocurrencias, t, nosalir, trun, primera
+    primera = True
+    ocurrencias = 0
+    # Give trun and t a value up front: without it set_event, or the final write,
+    # raises NameError when no stop event has assigned them yet
+    trun = 0.0
+    t = (getattr(time, "process_time", None) or time.clock)()
+    # Initialize GDB to run the app
+    gdb.execute("set confirm off")
+    gdb.execute("set pagination off")
+    gdb.execute("set target-async off")
+    gdb.execute("set non-stop off")
+    # Connecting to a exit handler event
+    gdb.events.exited.connect(exit_handler)
+    # Connecting to a stop signal event
+    gdb.events.stop.connect(set_event)
+    gdb_init_strings = arg0
+    # Only the first two ';' split fields are options; the rest are GDB commands
+    cadena = gdb_init_strings.split(";", 2)
+    section = cadena[0] == "True"
+    kernel_end = cadena[1]
+    for init_str in cadena[2].split(";"):
+        gdb.execute(init_str)
+    if section:
+        gdb.execute("break " + kernel_end)
+    gdb.execute("r")
+    if section:
+        gdb.execute("c")
+    else:
+        gdb.execute("finish")
+    gdb.execute("c")
+    # The with-statement closes the file even if the write fails
+    with open("tmpxxx_return_profiler.conf", "w") as f:
+        f.write("[DEFAULT] \nOcurrencias = "+str(ocurrencias)+"\nTiempo = "+str(trun)+"\n")
+main()
--- a/prueba.py
+++ b/prueba.py
+import os
+import gdb
+import time
+def exit_handler(event):
+    """Clear the global `nosalir` flag and print the exit code carried by `event`.
+
+    If reading or printing the exit code raises, an "ERROR: ..." line is printed instead.
+    """
+    global nosalir
+    nosalir = False
+    print("event type: exit")
+    try:
+        print("exit code: " + str(event.exit_code))
+    except Exception as err:
+        print("ERROR: {}".format(str(err)))
+"""
+Handler that will put a breakpoint on the kernel after
+signal
+"""
+def set_event(event):
+    """Record timing on GDB stop events.
+
+    A breakpoint event stores the start time in the global `t` and counts one
+    occurrence; any other stop event stores the time elapsed since `t` in `trun`.
+    """
+    global trun, ocurrencias, t
+    # time.clock() was removed in Python 3.8; process_time() is its CPU-time
+    # replacement, and time.clock is only looked up on interpreters lacking it
+    cpu_clock = getattr(time, "process_time", None) or time.clock
+    if isinstance(event, gdb.BreakpointEvent):
+        t = cpu_clock()
+        ocurrencias = ocurrencias + 1
+    else:
+        trun = cpu_clock() - t
+def main():
+    """Run the hard-coded lud_cuda benchmark under GDB and report the timing.
+
+    Breaks on lud_cuda, then repeats "finish" + "c" while the global `nosalir` is
+    True (exit_handler clears it). Finally prints the occurrence count and the
+    measured time and writes both to tmpxxx_return_profiler.conf.
+    """
+    global ocurrencias, t, nosalir, trun
+    ocurrencias = 0
+    # Give trun and t a value up front: without it set_event, or the final
+    # print/write, raises NameError when no stop event has assigned them yet
+    trun = 0.0
+    t = (getattr(time, "process_time", None) or time.clock)()
+    # Initialize GDB to run the app
+    gdb.execute("set confirm off")
+    gdb.execute("set pagination off")
+    gdb.execute("set target-async off")
+    gdb.execute("set non-stop off")
+    # Connecting to a exit handler event
+    gdb.events.exited.connect(exit_handler)
+    # Connecting to a stop signal event
+    gdb.events.stop.connect(set_event)
+    gdb.execute("file ~/rodinia_3.1/cuda/lud/cuda/lud_cuda")
+    gdb.execute("set arg -s 10000")
+    gdb.execute("break lud_cuda")
+    gdb.execute('r')
+    nosalir = True
+    while nosalir:
+        gdb.execute("finish")
+        gdb.execute("c")
+    print (" Ocurrencias "+str(ocurrencias)+" Tiempo acumulado de ejecucciones "+ str(trun))
+    # The with-statement closes the file even if the write fails
+    with open("tmpxxx_return_profiler.conf", "w") as f:
+        f.write("Ocurrencias ="+str(ocurrencias)+"\n Tiempo "+str(trun)+"\n")
+main()