Commit 97b1c215 authored by German Leon's avatar German Leon
Browse files

Muchos cambios

parent 8621c3da
......@@ -42,7 +42,7 @@ Function that calls the profiler based on the injection mode
"""
def profiler_caller(gdb_exec, kernels, benchmark_binary, benchmark_args,device,section):
def profiler_caller(gdb_exec, kernels, benchmark_binary, benchmark_args,device,section,trace):
acc_time = 0
acc_time_profiler=0
script = 'env CUDA_VISIBLE_DEVICES={} {} -ex \'py arg0 = {}\' -n -batch -x {}'
......@@ -50,16 +50,17 @@ def profiler_caller(gdb_exec, kernels, benchmark_binary, benchmark_args,device,s
# print ("KERNEL"+kernels)
#init_string = '"file {}; set args {}"'.format(benchmark_binary, benchmark_args_striped)
print ("SECTION {}".format(section))
init_string = '"{};{};{};file {}; set args {}; set cuda break_on_launch application"'.format(False,True,kernels,benchmark_binary, benchmark_args_striped)
init_string = '"{};{};{};{};file {}; set args {}; set cuda break_on_launch application"'.format(False,True,kernels,trace,benchmark_binary, benchmark_args_striped)
profiler_cmd = script.format(device, gdb_exec, init_string, cp.PROFILER_SCRIPT)
max_registers=os.system(profiler_cmd) >>8
os.system("cat tmpxxx/kernels.conf")
print ("Maximo numero de registros ###################################+++")
print(max_registers,max_registers>>8)
print(max_registers,max_registers>>8)
if bool(section):
init_string = '"{};{};{};file {}; set args {}; break {}; break {}"'.format( bool(section),False,kernels,benchmark_binary,benchmark_args_striped,section['begin'],section['end'])
init_string = '"{};{};{};{};file {}; set args {}; break {}; break {}"'.format( bool(section),False,kernels,trace,benchmark_binary,benchmark_args_striped,section['begin'],section['end'])
else:
init_string = '"{};{};{};file {}; set args {}; break {}"'.format(False,False,kernels,benchmark_binary, benchmark_args_striped,kernels.split(",")[0])
init_string = '"{};{};{};{};file {}; set args {}; break {}"'.format(False,False,kernels,trace,benchmark_binary, benchmark_args_striped,kernels.split(",")[0])
profiler_cmd = script.format(device, gdb_exec, init_string, cp.PROFILER_SCRIPT)
print ("Profiler caller")
if cp.DEBUG:
......@@ -69,29 +70,33 @@ def profiler_caller(gdb_exec, kernels, benchmark_binary, benchmark_args,device,s
start = time.time()
os.system(profiler_cmd)
end = time.time()
ret_profiler = cf.load_config_file("tmpxxx_return_profiler.conf")
ret_profiler = cf.load_config_file("tmpxxx/return_profiler.conf")
acc_time_profiler+=float(ret_profiler.get('DEFAULT', 'Tiempo'))
acc_time += end - start
cf.kill_all("killall -9 {}; killall -9 {}".format(
os.path.basename(gdb_exec), os.path.basename(benchmark_binary)))
f=open("tmpxxx/return_profiler.conf","w")
f.write("[DEFAULT] \nMEDIA_TIME_PROFILER= "+str(acc_time_profiler / cp.MAX_TIMES_TO_PROFILE)+"\nMEDIA_TIME= "+
str(acc_time / cp.MAX_TIMES_TO_PROFILE )+"\nMAX_REGISTERS="+str(max_registers))
f.close()
return acc_time_profiler / cp.MAX_TIMES_TO_PROFILE, acc_time / cp.MAX_TIMES_TO_PROFILE, max_registers
"""
"""cf.load
Function to generate the gold execution
"""
def generate_gold(gdb_exec, benchmark_binary, benchmark_args,device):
def generate_gold(gdb_exec, benchmark_binary, benchmark_args,device,trace):
# Create tmp path and clean it if it exists
tmp_path = os.path.dirname(os.path.realpath(__file__)) + "/" + cp.LOGS_PATH + "/tmp"
os.system("mkdir -p " + tmp_path)
os.system("rm -rf " + tmp_path + "/*")
script = 'env CUDA_VISIBLE_DEVICES={} {} -ex \'py arg0 = {}\' -n -batch -x {} > {} 2> {}'
init_string = '"{};{};{};file {}; set args {}"'.format(False,True,"",benchmark_binary, benchmark_args)
init_string = '"{};{};{};{};file {}; set args {}"'.format(False,False,"",trace,benchmark_binary, benchmark_args)
profiler_cmd = script.format(device, gdb_exec, init_string, cp.PROFILER_SCRIPT, cp.GOLD_OUTPUT_PATH, cp.GOLD_ERR_PATH)
if cp.DEBUG:
print("PROFILER CMD: {}".format(profiler_cmd))
......@@ -143,15 +148,16 @@ def main():
section['end']=conf.get('DEFAULT','section_end')
gdb_exec = conf.get("DEFAULT", "gdbExecName")
kernels=conf.get('DEFAULT', 'kernels')
[max_time_kernel,max_time_app,max_regs] = profiler_caller(gdb_exec=gdb_exec,kernels=kernels, benchmark_binary=benchmark_binary, benchmark_args=benchmark_args,device=args.device,section=section)
trace=conf.get('DEFAULT','trace')
[max_time_kernel,max_time_app,max_regs] = profiler_caller(gdb_exec=gdb_exec,kernels=kernels, benchmark_binary=benchmark_binary, benchmark_args=benchmark_args,device=args.device,section=section,trace=trace)
print ("Time kernel= "+str(max_time_kernel)+ "Time app "+str(max_time_app))
# saving gold
print ("Saving gold");
generate_gold_result = generate_gold(gdb_exec=gdb_exec,
benchmark_binary=benchmark_binary, benchmark_args=benchmark_args,device=args.device)
os.system("cat tmpxxx/kernels.conf")
generate_gold_result = generate_gold(gdb_exec=gdb_exec,benchmark_binary=benchmark_binary, benchmark_args=benchmark_args,device=args.device,trace=trace)
os.system("cat tmpxxx/kernels.conf")
if generate_gold_result != 0:
raise EnvironmentError("Gold generation did not finish well, the fault injection will not work")
raise EnvironmentError("Gold generation did not finish well, the fault injection will not work")
# Remove trash GDB info from the std output and the err output
cf.remove_useless_information_from_output(cp.GOLD_OUTPUT_PATH)
......@@ -159,7 +165,7 @@ def main():
# Save the kernel configuration txt file
cf.save_file(file_path=cp.KERNEL_INFO_DIR, data={'max_time': max_time_app,'max_time_kernel': max_time_kernel,'max_regs':max_regs})
os.system("cat tmpxxx/kernels.conf")
print("1 - Profile finished\n###################################################")
......
......@@ -24,23 +24,83 @@ class BitFlip:
self.__injection_site = kwargs.get('injection_site')
self.__maxregs=int(kwargs.get('max_regs'))
self.fault_injected = False
"""
print exception info
"""
@staticmethod
def principiokernel(kernel):
#Obtiene la direccion de comienzo de un kernel dado
try:
l=cf.execute_command(gdb=gdb, to_execute="print {}".format(kernel))
inicio=l[0].split("}")[1].split("<")[0]
return inicio
except:
l=BitFlip.principio2kernel(kernel)
inicio=l.split(":")[0].split("<")[0]
return inicio
@staticmethod
def principio2kernel(kernel):
#Obtiene la direccion de comienzo de un kernel dado de una forma mas lenta y mas segura
l=BitFlip.disas(kernel)
return(l[1])
@staticmethod
def lenkernel(kernel):
#Obtiene la longitud (numero de instruccion) de un kernel
l=BitFlip.disas(kernel)
i= len(l) - 2
while ("NOP" in l[i]):
i=i-1
fin=(l[i].split("<")[1])
fin=(fin.split(">")[0])#.split("+")[1]
return fin
@staticmethod
def address_random(d):
# Proporciona una direccion aleatorio, para realizar un breakpoint
# De la informacion que tiene de los nkernel Principio y Tamano
ky=random.choice(list(d.keys()))
dir=random.randint(0, (d[ky]['Tamano']/16)-1)
dir=d[ky]['Principio']+(dir*16)
return (" *{} ".format(hex(dir)))
def readtraza(self,file):
self.__traza=cf.load_config_file(file)
def read_data_kernels(self,readtrace):
#Leer la informacion de los kernels a trabajar
conf =cf.load_config_file("tmpxxx/kernels.conf")
lista= conf.get('Kernels', 'Nombres')
if readtrace:
self.readtraza(conf.get('DEFAULT',trace))
datoskernels={}
for i in lista.split(","):
elem={}
elem['Principio']=int(conf.get(i,'Principio'),16)
#elem['Principio']=int(BitFlip.principiokernel(i),16)
elem['Tamano']=int(conf.get(i,'Tamano'))
datoskernels[i]=elem
return datoskernels
@staticmethod
def update_data_kernels(dict):
#Rellenar un diccionario con la direccion de comienzo.
for i in dict:
dict[i]['Principio']=int(BitFlip.principiokernel(i),16)
@staticmethod
def __exception_str():
exc_type, exc_obj, exc_tb = sys.exc_info()
return "Exception type {} at line {}".format(exc_type, exc_tb.tb_lineno)
@staticmethod
def numreg (kernel):
def disas(kernel):
#Desensamblando de un kernel
try:
disassemble_array = cf.execute_command(gdb=gdb, to_execute="disassemble {}".format(kernel))
#print ("kenel={}".format(kernel))
except:
#si no ha funcionado la forma general, es que solo habra uno y es el que es el activo.
disassemble_array = cf.execute_command(gdb=gdb, to_execute="disassemble")
return disassemble_array
@staticmethod
def numreg (kernel):
#Calculo del numero de registro usados en un kernel
disassemble_array=BitFlip.disas(kernel)
listareg=set()
listaregdst=set()
listaregcond=set()
......@@ -65,94 +125,243 @@ class BitFlip:
"""
TODO: Describe the method
"""
@staticmethod
def kernelnow():
#Obtiene el kernel enfocado,
str=gdb.execute(" info cuda kernels",to_string=True)
str=str.splitlines()
index=str[0].find("Invocation")
for i in range(1, len(str) ):
if "* " in str[i] :
ret=(str[i][index:]).split("(")[0]
return ret
def asmline(self):
#Leo la linea de ejecucion
#Leo la instruccion en ensamblador a ejecutar.
linea=cf.execute_command(gdb=gdb, to_execute="x/1i $pc")
self.__logging.info("ASSM_LINE:{}".format(linea[0]))
return linea
def asmline2(self):
#Difiere de asmline donde visualiza la informacion
linea=cf.execute_command(gdb=gdb, to_execute="x/1i $pc")
print("ASSM_LINE:{}".format(linea[0]))
return linea
def reg_asmline( self):
#Obtengo los registros de la instruccion
#Obtengo los registros de la instruccion a ejecutar y los devuelvo una lista
linea= self.asmline()
lista=re.findall(r"R(\d+)", linea[0])
#Ahora son todos los registros.
#lista=range(0,self.__maxregs)
setlista=set()
setlista.update(lista)
return setlista
def regmod (self):
def LoadValuesRegsInst(self):
lista=self.reg_asmline()
while len(lista) == 0: #Habria que poner un limite.
#Busco una instruccion que referiencia algun registro
self.__logging.info("INSTRUCTION WITHOUT DESTINATION REGISTER")
gdb.execute("nexti")
lista=self.reg_asmline()
listareg=[" R{} ".format(x) for x in lista]
strlistareg="info registers "
for x in listareg:
strlistareg+=x;
self.__stringregs=strlistareg
def ValueRegsInst (self):
valores= cf.execute_command(gdb=gdb, to_execute=self.__stringregs)
return valores
def regmod (self):
lista=self.reg_asmline()
while len(lista) == 0: #Habria que poner un limite.
#Busco una instruccion que referiencia algun registro
self.__logging.info("INSTRUCTION WITHOUT DESTINATION REGISTER")
self.nextinstr()
self.LoadValuesRegsInst()
#Obtengo el valor de los registro referenciados
valores= cf.execute_command(gdb=gdb, to_execute=strlistareg)
return valores
return self.ValueRegsInst()
def nextinstr(self):
#Obtengo el valor de los registro referenciados por la primera instruccipn
gdb.execute("nexti")
linea= self.asmline()
valores= cf.execute_command(gdb=gdb, to_execute=self.__stringregs)
return valores
x=cf.execute_command(gdb=gdb, to_execute="nexti")
def dictreg(self,valores):
#self.__logging.info("Execute dictreg")
#Almaceno en un dictionario los valores de los registros
#Almaceno en un dictionario los valores de los registros obtenido de un info registers
regs={}
for x in valores:
m = re.match(r".*R(\d+).*0x([0-9a-fA-F]+).*", x)
if m:
regs[m.group(1)]=m.group(2)
return regs
def cmpregdst (self,valores,regs):
regdst=set()
for x in valores:
regdst=set()
for x in valores:
m = re.match(r".*R(\d+).*0x([0-9a-fA-F]+).*", x)
if m:
#print("El registro {} tiene {} y tenia{}".format(m.group(1),m.group(2),regs[m.group(1)]))
if (regs[m.group(1)]!=m.group(2)):
#print("Diferente")
regdst.add(m.group(1))
#self.__logging.info(str(len(regdst)))
return regdst
def __inst_generic_injector(self):
return regdst
def mypcis(self):
#Obtiene el pc relativo
valores= cf.execute_command(gdb=gdb, to_execute="info registers pc")
pc=int((valores[0].split("+")[1]).split(">")[0])
return pc
def RegDstInst(self,despl):
#Obtiene registro destino de la instruccion -despl
linea=cf.execute_command(gdb=gdb, to_execute="x/1i $pc-"+str(despl))
rd=re.findall(r"R(\d+)",linea[0])
if (len(rd) >0):
rd=rd [0]
return rd
def isfininstruction(self):
linea=cf.execute_command(gdb=gdb, to_execute="x/1i $pc")
lista=linea[0].find("EXIT")
return (lista!=-1)
def regdst(self):
#Regsitro modificados en la ejecuccion de una instrucciones. varios si no hace referencia
# a ningun registro.
regs=self.dictreg(self.regmod())
valores=self.nextinstr()
self.nextinstr()
self.LoadValuesRegsInst()
valores=self.ValueRegsInst()
r=self.cmpregdst(valores,regs)
self.__logging.info ("Registros Modificados: {}".format(r))
def CalculateRegDestInst(self,despl):
r=self.RegDstInst(despl)
if cp.HIGHPRECISION:
if (len(r)>0):
print ("=========HP+++++++++++++================")
valores=self.ValueRegsInst()
print ("Valores:...{}".format(valores))
r=self.cmpregdst(valores,self.__regs)
if (len(r)>0):
r=r.pop()
print ("Registros modificados:...{}".format(r))
self.__regs=self.dictreg(valores)
print ("Regs:...{}".format(self.__regs))
return r
def analisis(self,kernels):
gdb.execute("set cuda break_on_launch none")
for kernel in kernels:
self.asmline2()
#self.__logging.raw("Kernel {}".format(kernel))
#gdb.execute()
gdb.execute("delete breakpoints")
str=gdb.execute("break "+kernel,to_string=True)
#self.__logging.raw("====".format(str))
gdb.execute("c")
self.asmline2()
self.__logging.raw("["+kernel+"]\n")
pcold=0
dict={}
fin=self.isfininstruction()
rold=self.RegDstInst(0)
print("Rold={}".format(rold))
pcold=self.mypcis()
self.LoadValuesRegsInst()
self.__regs=self.dictreg(self.ValueRegsInst())
print ("Regs:...{}".format(self.__regs))
self.nextinstr()
#Y si la primera instruccion no tiene argumentos?
i=0
while not fin:
# regs=self.dictreg(self.regmod()) # Igual solo tendriamos que mirar el regfdst de la instruccion
# r=self.cmpregdst(valores,regs)
pc=self.mypcis()
print("====")
self.asmline2()
#r=self.RegDstInst(16) #
r=self.CalculateRegDestInst(16)
if (pc!=pcold+16) or (len(r)==0) :
if not (pc in dict):
print("Rold={}".format(rold))
a={rold:1}
dict[pc]={rold:1}
else:
if not (rold in dict[pc] ):
(dict[pc]) [rold]=1
else:
a=(dict[pc])[rold]
(dict[pc]) [rold]=a+1
print ("++++Instruccion cuyo pc {} tiene estos valores {}".format(pc,dict[pc]))
if (pc!=pcold+16):
print ("++++Ruptura de secuencia desde {}".format(pcold))
else:
print("++++Sin operandos la instruccion anterior")
else:
rold=r
print("Registro {}".format(r))
pcold=pc
self.nextinstr()
fin= self.isfininstruction()
i=i+1
for k in dict:
print("Clave: {} {} ".format(k,type(k)))
#cad= str(key)+ "="
for key in dict:
cad="{} =".format(key)
sep=" "
for key2 in dict[key]:
#cad=cad+sep+str(key2)+ "x"+str(( dict[key]) [key2])
cad="{}{}{}x{}".format(cad,sep,key2,(dict[key]) [key2])
sep=","
self.__logging.raw(cad+"\n")
def LastRegDest(self):
#Obtiene el ultimo registro destino modificado, se consulta la traza
# si el pc no esta en la traza, es que la anterior instruccion indica
# el ultimo registro destino.
kernel=self.kernelnow()
pc=str(self.mypcis())
traza=self.__traza
self.__logging.info("kernel {} pc {} pcespeciales{} \n".format(kernel,pc,traza.options(kernel)))
if (pc in traza.options(kernel)):
self.__logging.info("Si es especial {}".format(traza.get(kernel,pc)))
a= (traza.get(kernel,pc)).split(",")
ac=0
self.__logging.info(" Valor {}".format(a))
for x in a:
ac=ac+int(x.split("x")[1])
self.__logging.info("Total {}".format(ac))
rd=0
iac=(random.randint(0, ac-1))
self.__logging.info("Elegido {}".format(iac))
for x in a:
t=x.split("x")
if (int(t[1]) >=iac):
rd=t[0]
break
else:
iac=iac-int(t[1])
#print("Registro Destino es {}".format(rd))
else:
rd=self.RegDstInst(16)
self.__logging.info("Es normal {}".format(rd))
self.__logging.info("Registro es {}".format(rd))
return rd
def __inst_generic_injector(self):
r=self.regdst()
while (len(r) ==0):
self.__logging.info("INSTRUCTION WITHOUT OPERANDS")
gdb.execute("nexti")
regs=self.dictreg( self.regmod())
valores=self.nextinstr()
r=self.cmpregdst(valores,regs)
r=self.regdst()
self.__register="R{}".format(r.pop())
self.__logging.info("SELECTED_REGISTER:{}".format(self.__register))
......@@ -170,14 +379,19 @@ class BitFlip:
# Register File mode
if cp.RF == self.__injection_site:
# Select the register before the injection
#self.__logging.info("select-register")
self.__select_register()
# RF is the default mode of injection
#self.__logging.info("rf-genera")
self.__rf_generic_injector()
#self.__logging.info("rf-genera-fin")
# Instruction Output mode
elif cp.INST_OUT == self.__injection_site:
self.__inst_generic_injector()
self.__inst_generic_injector()()
elif cp.INST_OUT_ORI == self.__injection_site:
self.__inst_generic_injector_old()
elif cp.INST_V1 == self.__injection_site:
self.__inst_generic_injector_preliminar()
# Instruction Address mode
elif cp.INST_ADD == self.__injection_site:
self.__logging.exception("INST_ADD NOT IMPLEMENTED YET")
......@@ -340,13 +554,16 @@ class BitFlip:
"""
Instruction injector
"""
def __inst_generic_injector_preliminar(self):
self.__register="R{}".format(self.LastRegDest())
self.__logging.info("SELECTED_REGISTER:{}".format(self.__register))
self.__rf_generic_injector()
def __inst_generic_injector_old(self):
disassemble_array = cf.execute_command(gdb=gdb, to_execute="disassemble")
# Search the line to inject
# -1 will use the next instruction after program counter
for i in range(0, len(disassemble_array) - 1):
next_line = disassemble_array[i + 1]
next_line = disassemble_array[i] # He modifico i+1 por i
# => defines where the program counter is
# There is an instruction on this line
......
......@@ -44,7 +44,10 @@ class Logging:
d = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
fp.write("[SUMMARY -- " + d + "]\nFI-uniqueID=" + str(self.unique_id) + "\n" + msg + "\n")
# fp.close()
def raw (self, msg):
with open(self.log_file, "a") as fp:
fp.write(msg)
# fp.close()
def search(self, find):
with open(self.log_file, "r") as fp:
lines = fp.readlines()
......
......@@ -48,22 +48,13 @@ class SignalApp(Thread):
crashsystem=True
return
#self.__log.info("Timeout syncron of breakpoint\n")
pidf=open(self.__file_connection,"r")
pid=int(pidf.read())
pidf.close()
#os.remove(self.__file_connection)
#os.system(self.__signal_cmd)
#pidf = (os.popen(self.__signal_cmd))
#print("Comando {} y resultado {} de tipo {}".format(self.__signal_cmd,pid, type (pid)) )
#pid=int(pid.split('\n')[0])
#pidf.close()
print(" resultado pid {} de tipo {}".format(pid, type (pid)) )
# Time random
#print ("INIT:"+str(self.__init_wait_time)+"sleep"+str())
time.sleep(self.__init_wait_time)
time.sleep(self.__init_wait_time)
self.__log.info("Begin injection")
crash=False
for signals in range(0, self.__signals_to_send):
#os.system("{} > /dev/null 2>/dev/null".format(self.__signal_cmd))
......@@ -71,22 +62,26 @@ class SignalApp(Thread):
os.kill(pid,signal.SIGINT)
self.__log.info("sending signal {}".format(signals))
except:
self.__log.info("Process is dead. Crash?")
os.kill(pid,signal.SIGKILL)
self.__log.info("Crash? o DeadLock")
print ("Voy por aqui")
(self._waitfinish).abort()
(self._waitfinish).reset()
(self._syncro).abort()
(self._syncro).reset()
try:
os.kill(pid,signal.SIGKILL)
except:
self.__log.info("Process is dead")
crash=True
break
try:
(self._syncro).wait()
except:
(self._syncro).abort()
# try:
# (self._syncro).wait()
# except:
# (self._syncro).abort()
#break
#print("Breakpoint fuera de tiempo")
(self._syncro).reset()
# (self._syncro).reset()
time.sleep(self.__time_to_sleep)
#(self._syncro).reset()
if not crash:
......@@ -98,6 +93,8 @@ class SignalApp(Thread):
self.__log.info("Hang timeout execution")
hang=True
self.__log.info("Timeout execution programa")
else:
print ("He salido por aqui")
(self._waitfinish).reset()
def ishang (self):
......
//----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------200
// plasmaKernel_gpu_2
//----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------200
__global__ void kernel_gpu_cuda(par_str d_par_gpu,
dim_str d_dim_gpu,
box_str* d_box_gpu,
FOUR_VECTOR* d_rv_gpu,
fp* d_qv_gpu,
FOUR_VECTOR* d_fv_gpu)
{
//--------------------------------------------------------------------------------------------------------------------------------------------------------------------------180
// THREAD PARAMETERS
//--------------------------------------------------------------------------------------------------------------------------------------------------------------------------180
int bx = blockIdx.x; // get current horizontal block index (0-n)
int tx = threadIdx.x; // get current horizontal thread index (0-n)
// int ax = bx*NUMBER_THREADS+tx;
// int wbx = bx;
int wtx = tx;
//--------------------------------------------------------------------------------------------------------------------------------------------------------------------------180
// DO FOR THE NUMBER OF BOXES
//--------------------------------------------------------------------------------------------------------------------------------------------------------------------------180
if(bx<d_dim_gpu.number_boxes){
// while(wbx<box_indexes_counter){
//------------------------------------------------------------------------------------------------------------------------------------------------------160
// Extract input parameters
//------------------------------------------------------------------------------------------------------------------------------------------------------160
// parameters
fp a2 = 2.0*d_par_gpu.alpha*d_par_gpu.alpha;
// home box
int first_i;
FOUR_VECTOR* rA;
FOUR_VECTOR* fA;
__shared__ FOUR_VECTOR rA_shared[100];
// nei box
int pointer;
int k = 0;
int first_j;
FOUR_VECTOR* rB;
fp* qB;
int j = 0;
__shared__ FOUR_VECTOR rB_shared[100];
__shared__ double qB_shared[100];
// common
fp r2;
fp u2;
fp vij;
fp fs;
fp fxij;
fp fyij;
fp fzij;
THREE_VECTOR d;
//------------------------------------------------------------------------------------------------------------------------------------------------------160
// Home box
//------------------------------------------------------------------------------------------------------------------------------------------------------160
//----------------------------------------------------------------------------------------------------------------------------------140
// Setup parameters
//----------------------------------------------------------------------------------------------------------------------------------140
// home box - box parameters
first_i = d_box_gpu[bx].offset;
// home box - distance, force, charge and type parameters
rA = &d_rv_gpu[first_i];
fA = &d_fv_gpu[first_i];
//----------------------------------------------------------------------------------------------------------------------------------140
// Copy to shared memory
//----------------------------------------------------------------------------------------------------------------------------------140
// home box - shared memory
while(wtx<NUMBER_PAR_PER_BOX){
rA_shared[wtx] = rA[wtx];
wtx = wtx + NUMBER_THREADS;
}
wtx = tx;
// synchronize threads - not needed, but just to be safe
__syncthreads();
//------------------------------------------------------------------------------------------------------------------------------------------------------160
// nei box loop
//------------------------------------------------------------------------------------------------------------------------------------------------------160
// loop over neiing boxes of home box
for (k=0; k<(1+d_box_gpu[bx].nn); k++){
//----------------------------------------50
// nei box - get pointer to the right box
//----------------------------------------50
if(k==0){
pointer = bx; // set first box to be processed to home box
}
else{
pointer = d_box_gpu[bx].nei[k-1].number; // remaining boxes are nei boxes
}
//----------------------------------------------------------------------------------------------------------------------------------140
// Setup parameters
//----------------------------------------------------------------------------------------------------------------------------------140
// nei box - box parameters
first_j = d_box_gpu[pointer].offset;
// nei box - distance, (force), charge and (type) parameters
rB = &d_rv_gpu[first_j];
qB = &d_qv_gpu[first_j];
//----------------------------------------------------------------------------------------------------------------------------------140
// Setup parameters
//----------------------------------------------------------------------------------------------------------------------------------140
// nei box - shared memory
while(wtx<NUMBER_PAR_PER_BOX){
rB_shared[wtx] = rB[wtx];
qB_shared[wtx] = qB[wtx];
wtx = wtx + NUMBER_THREADS;
}
wtx = tx;
// synchronize threads because in next section each thread accesses data brought in by different threads here
__syncthreads();
//----------------------------------------------------------------------------------------------------------------------------------140
// Calculation
//----------------------------------------------------------------------------------------------------------------------------------140
// loop for the number of particles in the home box
// for (int i=0; i<nTotal_i; i++){
while(wtx<NUMBER_PAR_PER_BOX){
// loop for the number of particles in the current nei box
for (j=0; j<NUMBER_PAR_PER_BOX; j++){
// r2 = rA[wtx].v + rB[j].v - DOT(rA[wtx],rB[j]);
// u2 = a2*r2;
// vij= exp(-u2);
// fs = 2.*vij;
// d.x = rA[wtx].x - rB[j].x;
// fxij=fs*d.x;
// d.y = rA[wtx].y - rB[j].y;
// fyij=fs*d.y;
// d.z = rA[wtx].z - rB[j].z;
// fzij=fs*d.z;
// fA[wtx].v += qB[j]*vij;
// fA[wtx].x += qB[j]*fxij;
// fA[wtx].y += qB[j]*fyij;
// fA[wtx].z += qB[j]*fzij;
r2 = (fp)rA_shared[wtx].v + (fp)rB_shared[j].v - DOT((fp)rA_shared[wtx],(fp)rB_shared[j]);
u2 = a2*r2;
vij= exp(-u2);
fs = 2*vij;
d.x = (fp)rA_shared[wtx].x - (fp)rB_shared[j].x;
fxij=fs*d.x;
d.y = (fp)rA_shared[wtx].y - (fp)rB_shared[j].y;
fyij=fs*d.y;
d.z = (fp)rA_shared[wtx].z - (fp)rB_shared[j].z;
fzij=fs*d.z;
fA[wtx].v += (double)((fp)qB_shared[j]*vij);
fA[wtx].x += (double)((fp)qB_shared[j]*fxij);
fA[wtx].y += (double)((fp)qB_shared[j]*fyij);
fA[wtx].z += (double)((fp)qB_shared[j]*fzij);
}
// increment work thread index
wtx = wtx + NUMBER_THREADS;
}
// reset work index
wtx = tx;
// synchronize after finishing force contributions from current nei box not to cause conflicts when starting next box
__syncthreads();
//----------------------------------------------------------------------------------------------------------------------------------140
// Calculation END
//----------------------------------------------------------------------------------------------------------------------------------140
}
// // increment work block index
// wbx = wbx + NUMBER_BLOCKS;
// // synchronize - because next iteration will overwrite current shared memory
// __syncthreads();
//------------------------------------------------------------------------------------------------------------------------------------------------------160
// nei box loop END
//------------------------------------------------------------------------------------------------------------------------------------------------------160
}
}
//========================================================================================================================================================================================================200
// DEFINE/INCLUDE
//========================================================================================================================================================================================================200
//======================================================================================================================================================150
// MAIN FUNCTION HEADER
//======================================================================================================================================================150
#include "./../main.h" // (in the main program folder) needed to recognized input parameters
//======================================================================================================================================================150
// UTILITIES
//======================================================================================================================================================150
#include "./../util/device/device.h" // (in library path specified to compiler) needed by for device functions
#include "./../util/timer/timer.h" // (in library path specified to compiler) needed by timer
//======================================================================================================================================================150
// KERNEL_GPU_CUDA_WRAPPER FUNCTION HEADER
//======================================================================================================================================================150
#include "./kernel_gpu_cuda_wrapper.h" // (in the current directory)
//======================================================================================================================================================150
// KERNEL
//======================================================================================================================================================150
#include "./kernel_gpu_cuda.cu" // (in the current directory) GPU kernel, cannot include with header file because of complications with passing of constant memory variables
//========================================================================================================================================================================================================200
// KERNEL_GPU_CUDA_WRAPPER FUNCTION
//========================================================================================================================================================================================================200
void
kernel_gpu_cuda_wrapper(par_str par_cpu,
dim_str dim_cpu,
box_str* box_cpu,
FOUR_VECTOR* rv_cpu,
fp* qv_cpu,
FOUR_VECTOR* fv_cpu)
{
//======================================================================================================================================================150
// CPU VARIABLES
//======================================================================================================================================================150
// timer
long long time0;
long long time1;
long long time2;
long long time3;
long long time4;
long long time5;
long long time6;
time0 = get_time();
//======================================================================================================================================================150
// GPU SETUP
//======================================================================================================================================================150
//====================================================================================================100
// INITIAL DRIVER OVERHEAD
//====================================================================================================100
cudaThreadSynchronize();
//====================================================================================================100
// VARIABLES
//====================================================================================================100
box_str* d_box_gpu;
FOUR_VECTOR* d_rv_gpu;
fp* d_qv_gpu;
FOUR_VECTOR* d_fv_gpu;
dim3 threads;
dim3 blocks;
//====================================================================================================100
// EXECUTION PARAMETERS
//====================================================================================================100
blocks.x = dim_cpu.number_boxes;
blocks.y = 1;
threads.x = NUMBER_THREADS; // define the number of threads in the block
threads.y = 1;
time1 = get_time();
//======================================================================================================================================================150
// GPU MEMORY (MALLOC)
//======================================================================================================================================================150
//====================================================================================================100
// GPU MEMORY (MALLOC) COPY IN
//====================================================================================================100
//==================================================50
// boxes
//==================================================50
cudaMalloc( (void **)&d_box_gpu,
dim_cpu.box_mem);
//==================================================50
// rv
//==================================================50
cudaMalloc( (void **)&d_rv_gpu,
dim_cpu.space_mem);
//==================================================50
// qv
//==================================================50
cudaMalloc( (void **)&d_qv_gpu,
dim_cpu.space_mem2);
//====================================================================================================100
// GPU MEMORY (MALLOC) COPY
//====================================================================================================100
//==================================================50
// fv
//==================================================50
cudaMalloc( (void **)&d_fv_gpu,
dim_cpu.space_mem);
time2 = get_time();
//======================================================================================================================================================150
// GPU MEMORY COPY
//======================================================================================================================================================150
//====================================================================================================100
// GPU MEMORY (MALLOC) COPY IN
//====================================================================================================100
//==================================================50
// boxes
//==================================================50
cudaMemcpy( d_box_gpu,
box_cpu,
dim_cpu.box_mem,
cudaMemcpyHostToDevice);
//==================================================50
// rv
//==================================================50
cudaMemcpy( d_rv_gpu,
rv_cpu,
dim_cpu.space_mem,
cudaMemcpyHostToDevice);
//==================================================50
// qv
//==================================================50
cudaMemcpy( d_qv_gpu,
qv_cpu,
dim_cpu.space_mem2,
cudaMemcpyHostToDevice);
//====================================================================================================100
// GPU MEMORY (MALLOC) COPY
//====================================================================================================100
//==================================================50
// fv
//==================================================50
cudaMemcpy( d_fv_gpu,
fv_cpu,
dim_cpu.space_mem,
cudaMemcpyHostToDevice);
time3 = get_time();
//======================================================================================================================================================150
// KERNEL
//======================================================================================================================================================150
// launch kernel - all boxes
kernel_gpu_cuda<<<blocks, threads>>>( par_cpu,
dim_cpu,
d_box_gpu,
d_rv_gpu,
d_qv_gpu,
d_fv_gpu);
checkCUDAError("Start");
cudaThreadSynchronize();
time4 = get_time();
//======================================================================================================================================================150
// GPU MEMORY COPY (CONTD.)
//======================================================================================================================================================150
cudaMemcpy( fv_cpu,
d_fv_gpu,
dim_cpu.space_mem,
cudaMemcpyDeviceToHost);
time5 = get_time();
//======================================================================================================================================================150
// GPU MEMORY DEALLOCATION
//======================================================================================================================================================150
cudaFree(d_rv_gpu);
cudaFree(d_qv_gpu);
cudaFree(d_fv_gpu);
cudaFree(d_box_gpu);
time6 = get_time();
//======================================================================================================================================================150
// DISPLAY TIMING
//======================================================================================================================================================150
printf("Time spent in different stages of GPU_CUDA KERNEL:\n");
printf("%15.12f s, %15.12f % : GPU: SET DEVICE / DRIVER INIT\n", (float) (time1-time0) / 1000000, (float) (time1-time0) / (float) (time6-time0) * 100);
printf("%15.12f s, %15.12f % : GPU MEM: ALO\n", (float) (time2-time1) / 1000000, (float) (time2-time1) / (float) (time6-time0) * 100);
printf("%15.12f s, %15.12f % : GPU MEM: COPY IN\n", (float) (time3-time2) / 1000000, (float) (time3-time2) / (float) (time6-time0) * 100);
printf("%15.12f s, %15.12f % : GPU: KERNEL\n", (float) (time4-time3) / 1000000, (float) (time4-time3) / (float) (time6-time0) * 100);
printf("%15.12f s, %15.12f % : GPU MEM: COPY OUT\n", (float) (time5-time4) / 1000000, (float) (time5-time4) / (float) (time6-time0) * 100);
printf("%15.12f s, %15.12f % : GPU MEM: FRE\n", (float) (time6-time5) / 1000000, (float) (time6-time5) / (float) (time6-time0) * 100);
printf("Total time:\n");
printf("%.12f s\n", (float) (time6-time0) / 1000000);
}
#ifdef __cplusplus
extern "C" {
#endif
//========================================================================================================================================================================================================200
// KERNEL_GPU_CUDA_WRAPPER HEADER
//========================================================================================================================================================================================================200
void kernel_gpu_cuda_wrapper( par_str parms_cpu,
dim_str dim_cpu,
box_str* box_cpu,
FOUR_VECTOR* rv_cpu,
fp* qv_cpu,
FOUR_VECTOR* fv_cpu);
#ifdef __cplusplus
}
#endif
[DEFAULT]
debug =True
# Name of the gdb executable
gdbExecName = /usr/local/cuda-10.1.243/bin/cuda-gdb
# Which fault model to use, 0 -> single; 1 -> double;
# 2 -> random; 3 -> zeros; 4 -> least 16 significant bits (LSB);
# 5 -> least 8 significant bits (LSB)
# If you want multiple fault models, place them separated by ','
# faultModel = 0,2,3
faultModel = 0
# Injection site
# Can be:
# RF -> Register File
# INST_OUT -> Instruction Output (NOT IMPLEMENTED YET)
# INST_composed -> Instruction Adress (NOT IMPLEMENTED YET)
#injectionSite = RF
injectionSite = INST_OUT
# Max time factor to finish the app, this will be multiplied by the application running time
# For example if your app spend 2s, and the maxWaitTimes is 5, the max running time before it is
# Considered as a crash is 10s
maxWaitTimes = 5
# binary file of the application
# Must be full path
benchmarkBinary = /home/badia/mycarol-fi/codes/lavaMD/lavaMD
# Commands to set the session inside GDB environment
benchmarkArgs = -boxes1d 60
# CSV output file. It will be overwrite at each injection
csvFile = results/lavaMD_IO.csv
# You should create a script on the benchmark source folder to verify GOLD_OUTPUT x INJ_OUTPUT
goldenCheckScript = codes/lavaMD/sdc_check.sh
# Number of signals that will be sent to the application
seqSignals = 20
# Initial sleep time in seconds before start sending signals
# Generally the memory setup time
initSleep = 0.95
kernels=kernel_gpu_cuda
section_begin=kernel_gpu_cuda_wrapper.cu:188
section_end=kernel_gpu_cuda_wrapper.cu:198
//========================================================================================================================================================================================================200
//======================================================================================================================================================150
//====================================================================================================100
//==================================================50
//========================================================================================================================================================================================================200
// UPDATE
//========================================================================================================================================================================================================200
// 14 APR 2011 Lukasz G. Szafaryn
//========================================================================================================================================================================================================200
// DEFINE/INCLUDE
//========================================================================================================================================================================================================200
//======================================================================================================================================================150
// LIBRARIES
//======================================================================================================================================================150
#include "golden.h"
//float golden [1000][4];
#include <stdio.h> // (in path known to compiler) needed by printf
#include <stdlib.h> // (in path known to compiler) needed by malloc
#include <stdbool.h> // (in path known to compiler) needed by true/false
#include <math.h>
//======================================================================================================================================================150
// UTILITIES
//======================================================================================================================================================150
#include "./util/timer/timer.h" // (in path specified here)
#include "./util/num/num.h" // (in path specified here)
//======================================================================================================================================================150
// MAIN FUNCTION HEADER
//======================================================================================================================================================150
#include "./main.h" // (in the current directory)
//======================================================================================================================================================150
// KERNEL
//======================================================================================================================================================150
#include "./kernel/kernel_gpu_cuda_wrapper.h" // (in library path specified here)
//========================================================================================================================================================================================================200
// MAIN FUNCTION
//========================================================================================================================================================================================================200
int
main( int argc,
char *argv [])
{
printf("thread block size of kernel = %d \n", NUMBER_THREADS);
//======================================================================================================================================================150
// CPU/MCPU VARIABLES
//======================================================================================================================================================150
// timer
long long time0;
time0 = get_time();
// timer
long long time1;
long long time2;
long long time3;
long long time4;
long long time5;
long long time6;
long long time7;
// counters
int i, j, k, l, m, n;
// system memory
par_str par_cpu;
dim_str dim_cpu;
box_str* box_cpu;
FOUR_VECTOR* rv_cpu;
fp* qv_cpu;
FOUR_VECTOR* fv_cpu;
int nh;
time1 = get_time();
//======================================================================================================================================================150
// CHECK INPUT ARGUMENTS
//======================================================================================================================================================150
// assing default values
dim_cpu.boxes1d_arg = 1;
// go through arguments
for(dim_cpu.cur_arg=1; dim_cpu.cur_arg<argc; dim_cpu.cur_arg++){
// check if -boxes1d
if(strcmp(argv[dim_cpu.cur_arg], "-boxes1d")==0){
// check if value provided
if(argc>=dim_cpu.cur_arg+1){
// check if value is a number
if(isInteger(argv[dim_cpu.cur_arg+1])==1){
dim_cpu.boxes1d_arg = atoi(argv[dim_cpu.cur_arg+1]);
if(dim_cpu.boxes1d_arg<0){
printf("ERROR: Wrong value to -boxes1d parameter, cannot be <=0\n");
return 0;
}
dim_cpu.cur_arg = dim_cpu.cur_arg+1;
}
// value is not a number
else{
printf("ERROR: Value to -boxes1d parameter in not a number\n");
return 0;
}
}
// value not provided
else{
printf("ERROR: Missing value to -boxes1d parameter\n");
return 0;
}
}
// unknown
else{
printf("ERROR: Unknown parameter\n");
return 0;
}
}
// Print configuration
printf("Configuration used: boxes1d = %d\n", dim_cpu.boxes1d_arg);
time2 = get_time();
//======================================================================================================================================================150
// INPUTS
//======================================================================================================================================================150
par_cpu.alpha = 0.5;
time3 = get_time();
//======================================================================================================================================================150
// DIMENSIONS
//======================================================================================================================================================150
// total number of boxes
dim_cpu.number_boxes = dim_cpu.boxes1d_arg * dim_cpu.boxes1d_arg * dim_cpu.boxes1d_arg;
// how many particles space has in each direction
dim_cpu.space_elem = dim_cpu.number_boxes * NUMBER_PAR_PER_BOX;
dim_cpu.space_mem = dim_cpu.space_elem * sizeof(FOUR_VECTOR);
dim_cpu.space_mem2 = dim_cpu.space_elem * sizeof(fp);
// box array
dim_cpu.box_mem = dim_cpu.number_boxes * sizeof(box_str);
time4 = get_time();
//======================================================================================================================================================150
// SYSTEM MEMORY
//======================================================================================================================================================150
//====================================================================================================100
// BOX
//====================================================================================================100
// allocate boxes
box_cpu = (box_str*)malloc(dim_cpu.box_mem);
// initialize number of home boxes
nh = 0;
// home boxes in z direction
for(i=0; i<dim_cpu.boxes1d_arg; i++){
// home boxes in y direction
for(j=0; j<dim_cpu.boxes1d_arg; j++){
// home boxes in x direction
for(k=0; k<dim_cpu.boxes1d_arg; k++){
// current home box
box_cpu[nh].x = k;
box_cpu[nh].y = j;
box_cpu[nh].z = i;
box_cpu[nh].number = nh;
box_cpu[nh].offset = nh * NUMBER_PAR_PER_BOX;
// initialize number of neighbor boxes
box_cpu[nh].nn = 0;
// neighbor boxes in z direction
for(l=-1; l<2; l++){
// neighbor boxes in y direction
for(m=-1; m<2; m++){
// neighbor boxes in x direction
for(n=-1; n<2; n++){
// check if (this neighbor exists) and (it is not the same as home box)
if( (((i+l)>=0 && (j+m)>=0 && (k+n)>=0)==true && ((i+l)<dim_cpu.boxes1d_arg && (j+m)<dim_cpu.boxes1d_arg && (k+n)<dim_cpu.boxes1d_arg)==true) &&
(l==0 && m==0 && n==0)==false ){
// current neighbor box
box_cpu[nh].nei[box_cpu[nh].nn].x = (k+n);
box_cpu[nh].nei[box_cpu[nh].nn].y = (j+m);
box_cpu[nh].nei[box_cpu[nh].nn].z = (i+l);
box_cpu[nh].nei[box_cpu[nh].nn].number = (box_cpu[nh].nei[box_cpu[nh].nn].z * dim_cpu.boxes1d_arg * dim_cpu.boxes1d_arg) +
(box_cpu[nh].nei[box_cpu[nh].nn].y * dim_cpu.boxes1d_arg) +
box_cpu[nh].nei[box_cpu[nh].nn].x;
box_cpu[nh].nei[box_cpu[nh].nn].offset = box_cpu[nh].nei[box_cpu[nh].nn].number * NUMBER_PAR_PER_BOX;
// increment neighbor box
box_cpu[nh].nn = box_cpu[nh].nn + 1;
}
} // neighbor boxes in x direction
} // neighbor boxes in y direction
} // neighbor boxes in z direction
// increment home box
nh = nh + 1;
} // home boxes in x direction
} // home boxes in y direction
} // home boxes in z direction
//====================================================================================================100
// PARAMETERS, DISTANCE, CHARGE AND FORCE
//====================================================================================================100
// random generator seed set to random value - time in this case
// srand(time(NULL));
srand (0);
// input (distances)
rv_cpu = (FOUR_VECTOR*)malloc(dim_cpu.space_mem);
for(i=0; i<dim_cpu.space_elem; i=i+1){
rv_cpu[i].v = (rand()%10 + 1) / 10.0; // get a number in the range 0.1 - 1.0
rv_cpu[i].x = (rand()%10 + 1) / 10.0; // get a number in the range 0.1 - 1.0
rv_cpu[i].y = (rand()%10 + 1) / 10.0; // get a number in the range 0.1 - 1.0
rv_cpu[i].z = (rand()%10 + 1) / 10.0; // get a number in the range 0.1 - 1.0
}
// input (charge)
qv_cpu = (fp*)malloc(dim_cpu.space_mem2);
for(i=0; i<dim_cpu.space_elem; i=i+1){
qv_cpu[i] = (rand()%10 + 1) / 10.0; // get a number in the range 0.1 - 1.0
}
// output (forces)
fv_cpu = (FOUR_VECTOR*)malloc(dim_cpu.space_mem);
for(i=0; i<dim_cpu.space_elem; i=i+1){
fv_cpu[i].v = 0; // set to 0, because kernels keeps adding to initial value
fv_cpu[i].x = 0; // set to 0, because kernels keeps adding to initial value
fv_cpu[i].y = 0; // set to 0, because kernels keeps adding to initial value
fv_cpu[i].z = 0; // set to 0, because kernels keeps adding to initial value
}
time5 = get_time();
//======================================================================================================================================================150
// KERNEL
//======================================================================================================================================================150
//====================================================================================================100
// GPU_CUDA
//====================================================================================================100
kernel_gpu_cuda_wrapper(par_cpu,
dim_cpu,
box_cpu,
rv_cpu,
qv_cpu,
fv_cpu);
time6 = get_time();
//======================================================================================================================================================150
// SYSTEM MEMORY DEALLOCATION
//======================================================================================================================================================150
// dump res componete ults
#ifdef OUTPUT
FILE *fptr;
fptr = fopen("result.txt", "w");
for(i=0; i<dim_cpu.space_elem; i=i+1){
fprintf(fptr, "%.10f, %.10f, %.10f, %.10f\n", fv_cpu[i].v, fv_cpu[i].x, fv_cpu[i].y, fv_cpu[i].z);
}
fclose(fptr);
#endif
int correct=true;
#pragma omp parallel for shared(golden,fv_cpu, correct)
for (int i = 0; i < (int) (dim_cpu.space_elem); i++) {
// float abs_err = fabs(h_C[i] - float(dimsA.x * valB));
// float dot_length = dimsA.x;
// float abs_val = fabs(h_C[i]);
// float rel_err = abs_err / abs_val / dot_length;
//
//
int semicorrect=0;
double eps = 1.e-6;
semicorrect=(fabs(fv_cpu[i].v-golden[i][0]) <= eps);
// printf ("V %f golden %f dif %f Error %d\n",fv_cpu[i].v,golden[i][0],fabs(fv_cpu[i].v-golden[i][0]),semicorrect );
semicorrect&=(fabs(fv_cpu[i].x-golden[i][1])<= eps);
// printf ("X %f golden %f dif %f Error %d\n",fv_cpu[i].x,golden[i][1],fabs(fv_cpu[i].x-golden[i][1]) ,semicorrect );
semicorrect&=(fabs(fv_cpu[i].y-golden[i][2])<=eps);
//printf ("Y %f golden %f dif %f Error %d \n",fv_cpu[i].y,golden[i][2],fabs(fv_cpu[i].y-golden[i][2]) ,semicorrect );
semicorrect&=(fabs(fv_cpu[i].z-golden[i][3])<=eps);
//printf ("Z %f golden %f dif %f Error %d\n",fv_cpu[i].z,golden[i][3],fabs(fv_cpu[i].z-golden[i][3]) ,semicorrect );
if (!semicorrect) {
//printf("Error! En la componete %05d \n", i
//);
//
int basura=0;
#pragma omp critical
{
correct = false;
}
}
}
// i=0;
// int nosalir=1;
// while (nosalir && (i<dim_cpu.space_elem))
// {
// nosalir=(fv_cpu[i].v ==golden[i][0]);
// nosalir&=(fv_cpu[i].x==golden[i][1]);
// nosalir&=(fv_cpu[i].y==golden[i][2]);
// nosalir&=(fv_cpu[i].z==golden[i][3]);
// i++;
// }
// //if (nosalir) printf ("Result: PASS\n");
// else printf ("Result: FAIL\n");
printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
free(rv_cpu);
free(qv_cpu);
free(fv_cpu);
free(box_cpu);
time7 = get_time();
//======================================================================================================================================================150
// DISPLAY TIMING
//======================================================================================================================================================150
// printf("Time spent in different stages of the application:\n");
// printf("%15.12f s, %15.12f % : VARIABLES\n", (float) (time1-time0) / 1000000, (float) (time1-time0) / (float) (time7-time0) * 100);
// printf("%15.12f s, %15.12f % : INPUT ARGUMENTS\n", (float) (time2-time1) / 1000000, (float) (time2-time1) / (float) (time7-time0) * 100);
// printf("%15.12f s, %15.12f % : INPUTS\n", (float) (time3-time2) / 1000000, (float) (time3-time2) / (float) (time7-time0) * 100);
// printf("%15.12f s, %15.12f % : dim_cpu\n", (float) (time4-time3) / 1000000, (float) (time4-time3) / (float) (time7-time0) * 100);
// printf("%15.12f s, %15.12f % : SYS MEM: ALO\n", (float) (time5-time4) / 1000000, (float) (time5-time4) / (float) (time7-time0) * 100);
// printf("%15.12f s, %15.12f % : KERNEL: COMPUTE\n", (float) (time6-time5) / 1000000, (float) (time6-time5) / (float) (time7-time0) * 100);
// printf("%15.12f s, %15.12f % : SYS MEM: FRE\n", (float) (time7-time6) / 1000000, (float) (time7-time6) / (float) (time7-time0) * 100);
// printf("Total time:\n");
// printf("%.12f s\n", (float) (time7-time0) / 1000000);
//======================================================================================================================================================150
// RETURN
//======================================================================================================================================================150
return 0.0; // always returns 0.0
}
//----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------200
// DEFINE / INCLUDE
//----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------200
#define fp double
#define NUMBER_PAR_PER_BOX 100 // keep this low to allow more blocks that share shared memory to run concurrently, code does not work for larger than 110, more speedup can be achieved with larger number and no shared memory used
/* #define NUMBER_THREADS 128 // this should be roughly equal to NUMBER_PAR_PER_BOX for best performance */
// Parameterized work group size
#ifdef RD_WG_SIZE_0_0
#define NUMBER_THREADS RD_WG_SIZE_0_0
#elif defined(RD_WG_SIZE_0)
#define NUMBER_THREADS RD_WG_SIZE_0
#elif defined(RD_WG_SIZE)
#define NUMBER_THREADS RD_WG_SIZE
#else
#define NUMBER_THREADS 128
#endif
#define DOT(A,B) ((A.x)*(B.x)+(A.y)*(B.y)+(A.z)*(B.z)) // STABLE
//===============================================================================================================================================================================================================200
// STRUCTURES
//===============================================================================================================================================================================================================200
typedef struct
{
fp x, y, z;
} THREE_VECTOR;
typedef struct
{
fp v, x, y, z;
} FOUR_VECTOR;
typedef struct nei_str
{
// neighbor box
int x, y, z;
int number;
long offset;
} nei_str;
typedef struct box_str
{
// home box
int x, y, z;
int number;
long offset;
// neighbor boxes
int nn;
nei_str nei[26];
} box_str;
typedef struct par_str
{
fp alpha;
} par_str;
typedef struct dim_str
{
// input arguments
int cur_arg;
int arch_arg;
int cores_arg;
int boxes1d_arg;
// system memory
long number_boxes;
long box_mem;
long space_elem;
long space_mem;
long space_mem2;
} dim_str;
//===============================================================================================================================================================================================================200
// FUNCTION PROTOTYPES
//===============================================================================================================================================================================================================200
int
main( int argc,
char *argv []);
//========================================================================================================================================================================================================200
//======================================================================================================================================================150
//====================================================================================================100
//==================================================50
//========================================================================================================================================================================================================200
// UPDATE
//========================================================================================================================================================================================================200
// 14 APR 2011 Lukasz G. Szafaryn
//========================================================================================================================================================================================================200
// DEFINE/INCLUDE
//========================================================================================================================================================================================================200
//======================================================================================================================================================150
// LIBRARIES
//======================================================================================================================================================150
#include "golden.h"
//float golden [1000][4];
#include <stdio.h> // (in path known to compiler) needed by printf
#include <stdlib.h> // (in path known to compiler) needed by malloc
#include <stdbool.h> // (in path known to compiler) needed by true/false
#include <math.h>
//======================================================================================================================================================150
// UTILITIES
//======================================================================================================================================================150
#include "./util/timer/timer.h" // (in path specified here)
#include "./util/num/num.h" // (in path specified here)
//======================================================================================================================================================150
// MAIN FUNCTION HEADER
//======================================================================================================================================================150
#include "./main.h" // (in the current directory)
//======================================================================================================================================================150
// KERNEL
//======================================================================================================================================================150
#include "./kernel/kernel_gpu_cuda_wrapper.h" // (in library path specified here)
//========================================================================================================================================================================================================200
// MAIN FUNCTION
//========================================================================================================================================================================================================200
int
main( int argc,
char *argv [])
{
printf("thread block size of kernel = %d \n", NUMBER_THREADS);
//======================================================================================================================================================150
// CPU/MCPU VARIABLES
//======================================================================================================================================================150
// timer
long long time0;
time0 = get_time();
// timer
long long time1;
long long time2;
long long time3;
long long time4;
long long time5;
long long time6;
long long time7;
// counters
int i, j, k, l, m, n;
// system memory
par_str par_cpu;
dim_str dim_cpu;
box_str* box_cpu;
FOUR_VECTOR* rv_cpu;
fp* qv_cpu;
FOUR_VECTOR* fv_cpu;
int nh;
time1 = get_time();
//======================================================================================================================================================150
// CHECK INPUT ARGUMENTS
//======================================================================================================================================================150
// assing default values
dim_cpu.boxes1d_arg = 1;
// go through arguments
for(dim_cpu.cur_arg=1; dim_cpu.cur_arg<argc; dim_cpu.cur_arg++){
// check if -boxes1d
if(strcmp(argv[dim_cpu.cur_arg], "-boxes1d")==0){
// check if value provided
if(argc>=dim_cpu.cur_arg+1){
// check if value is a number
if(isInteger(argv[dim_cpu.cur_arg+1])==1){
dim_cpu.boxes1d_arg = atoi(argv[dim_cpu.cur_arg+1]);
if(dim_cpu.boxes1d_arg<0){
printf("ERROR: Wrong value to -boxes1d parameter, cannot be <=0\n");
return 0;
}
dim_cpu.cur_arg = dim_cpu.cur_arg+1;
}
// value is not a number
else{
printf("ERROR: Value to -boxes1d parameter in not a number\n");
return 0;
}
}
// value not provided
else{
printf("ERROR: Missing value to -boxes1d parameter\n");
return 0;
}
}
// unknown
else{
printf("ERROR: Unknown parameter\n");
return 0;
}
}
// Print configuration
printf("Configuration used: boxes1d = %d\n", dim_cpu.boxes1d_arg);
time2 = get_time();
//======================================================================================================================================================150
// INPUTS
//======================================================================================================================================================150
par_cpu.alpha = 0.5;
time3 = get_time();
//======================================================================================================================================================150
// DIMENSIONS
//======================================================================================================================================================150
// total number of boxes
dim_cpu.number_boxes = dim_cpu.boxes1d_arg * dim_cpu.boxes1d_arg * dim_cpu.boxes1d_arg;
// how many particles space has in each direction
dim_cpu.space_elem = dim_cpu.number_boxes * NUMBER_PAR_PER_BOX;
dim_cpu.space_mem = dim_cpu.space_elem * sizeof(FOUR_VECTOR);
dim_cpu.space_mem2 = dim_cpu.space_elem * sizeof(fp);
// box array
dim_cpu.box_mem = dim_cpu.number_boxes * sizeof(box_str);
time4 = get_time();
//======================================================================================================================================================150
// SYSTEM MEMORY
//======================================================================================================================================================150
//====================================================================================================100
// BOX
//====================================================================================================100
// allocate boxes
box_cpu = (box_str*)malloc(dim_cpu.box_mem);
// initialize number of home boxes
nh = 0;
// home boxes in z direction
for(i=0; i<dim_cpu.boxes1d_arg; i++){
// home boxes in y direction
for(j=0; j<dim_cpu.boxes1d_arg; j++){
// home boxes in x direction
for(k=0; k<dim_cpu.boxes1d_arg; k++){
// current home box
box_cpu[nh].x = k;
box_cpu[nh].y = j;
box_cpu[nh].z = i;
box_cpu[nh].number = nh;
box_cpu[nh].offset = nh * NUMBER_PAR_PER_BOX;
// initialize number of neighbor boxes
box_cpu[nh].nn = 0;
// neighbor boxes in z direction
for(l=-1; l<2; l++){
// neighbor boxes in y direction
for(m=-1; m<2; m++){
// neighbor boxes in x direction
for(n=-1; n<2; n++){
// check if (this neighbor exists) and (it is not the same as home box)
if( (((i+l)>=0 && (j+m)>=0 && (k+n)>=0)==true && ((i+l)<dim_cpu.boxes1d_arg && (j+m)<dim_cpu.boxes1d_arg && (k+n)<dim_cpu.boxes1d_arg)==true) &&
(l==0 && m==0 && n==0)==false ){
// current neighbor box
box_cpu[nh].nei[box_cpu[nh].nn].x = (k+n);
box_cpu[nh].nei[box_cpu[nh].nn].y = (j+m);
box_cpu[nh].nei[box_cpu[nh].nn].z = (i+l);
box_cpu[nh].nei[box_cpu[nh].nn].number = (box_cpu[nh].nei[box_cpu[nh].nn].z * dim_cpu.boxes1d_arg * dim_cpu.boxes1d_arg) +
(box_cpu[nh].nei[box_cpu[nh].nn].y * dim_cpu.boxes1d_arg) +
box_cpu[nh].nei[box_cpu[nh].nn].x;
box_cpu[nh].nei[box_cpu[nh].nn].offset = box_cpu[nh].nei[box_cpu[nh].nn].number * NUMBER_PAR_PER_BOX;
// increment neighbor box
box_cpu[nh].nn = box_cpu[nh].nn + 1;
}
} // neighbor boxes in x direction
} // neighbor boxes in y direction
} // neighbor boxes in z direction
// increment home box
nh = nh + 1;
} // home boxes in x direction
} // home boxes in y direction
} // home boxes in z direction
//====================================================================================================100
// PARAMETERS, DISTANCE, CHARGE AND FORCE
//====================================================================================================100
// random generator seed set to random value - time in this case
// srand(time(NULL));
srand (0);
// input (distances)
rv_cpu = (FOUR_VECTOR*)malloc(dim_cpu.space_mem);
for(i=0; i<dim_cpu.space_elem; i=i+1){
rv_cpu[i].v = (rand()%10 + 1) / 10.0; // get a number in the range 0.1 - 1.0
rv_cpu[i].x = (rand()%10 + 1) / 10.0; // get a number in the range 0.1 - 1.0
rv_cpu[i].y = (rand()%10 + 1) / 10.0; // get a number in the range 0.1 - 1.0
rv_cpu[i].z = (rand()%10 + 1) / 10.0; // get a number in the range 0.1 - 1.0
}
// input (charge)
qv_cpu = (fp*)malloc(dim_cpu.space_mem2);
for(i=0; i<dim_cpu.space_elem; i=i+1){
qv_cpu[i] = (rand()%10 + 1) / 10.0; // get a number in the range 0.1 - 1.0
}
// output (forces)
fv_cpu = (FOUR_VECTOR*)malloc(dim_cpu.space_mem);
for(i=0; i<dim_cpu.space_elem; i=i+1){
fv_cpu[i].v = 0; // set to 0, because kernels keeps adding to initial value
fv_cpu[i].x = 0; // set to 0, because kernels keeps adding to initial value
fv_cpu[i].y = 0; // set to 0, because kernels keeps adding to initial value
fv_cpu[i].z = 0; // set to 0, because kernels keeps adding to initial value
}
time5 = get_time();
//======================================================================================================================================================150
// KERNEL
//======================================================================================================================================================150
//====================================================================================================100
// GPU_CUDA
//====================================================================================================100
kernel_gpu_cuda_wrapper(par_cpu,
dim_cpu,
box_cpu,
rv_cpu,
qv_cpu,
fv_cpu);
time6 = get_time();
//======================================================================================================================================================150
// SYSTEM MEMORY DEALLOCATION
//======================================================================================================================================================150
// dump res componete ults
#ifdef OUTPUT
FILE *fptr;
fptr = fopen("result.txt", "w");
for(i=0; i<dim_cpu.space_elem; i=i+1){
fprintf(fptr, "%.10f, %.10f, %.10f, %.10f\n", fv_cpu[i].v, fv_cpu[i].x, fv_cpu[i].y, fv_cpu[i].z);
}
fclose(fptr);
#endif
int correct=true;
#pragma omp parallel for shared(golden,fv_cpu, correct)
for (int i = 0; i < (int) (dim_cpu.space_elem); i++) {
// float abs_err = fabs(h_C[i] - float(dimsA.x * valB));
// float dot_length = dimsA.x;
// float abs_val = fabs(h_C[i]);
// float rel_err = abs_err / abs_val / dot_length;
//
//
int semicorrect=0;
double eps = 1.e-6;
semicorrect=(fabs(fv_cpu[i].v-golden[i][0]) <= eps);
// printf ("V %f golden %f dif %f Error %d\n",fv_cpu[i].v,golden[i][0],fabs(fv_cpu[i].v-golden[i][0]),semicorrect );
semicorrect&=(fabs(fv_cpu[i].x-golden[i][1])<= eps);
// printf ("X %f golden %f dif %f Error %d\n",fv_cpu[i].x,golden[i][1],fabs(fv_cpu[i].x-golden[i][1]) ,semicorrect );
semicorrect&=(fabs(fv_cpu[i].y-golden[i][2])<=eps);
//printf ("Y %f golden %f dif %f Error %d \n",fv_cpu[i].y,golden[i][2],fabs(fv_cpu[i].y-golden[i][2]) ,semicorrect );
semicorrect&=(fabs(fv_cpu[i].z-golden[i][3])<=eps);
//printf ("Z %f golden %f dif %f Error %d\n",fv_cpu[i].z,golden[i][3],fabs(fv_cpu[i].z-golden[i][3]) ,semicorrect );
if (!semicorrect) {
//printf("Error! En la componete %05d \n", i
// );
#pragma omp critical
{
correct = false;
}
}
}
// i=0;
// int nosalir=1;
// while (nosalir && (i<dim_cpu.space_elem))
// {
// nosalir=(fv_cpu[i].v ==golden[i][0]);
// nosalir&=(fv_cpu[i].x==golden[i][1]);
// nosalir&=(fv_cpu[i].y==golden[i][2]);
// nosalir&=(fv_cpu[i].z==golden[i][3]);
// i++;
// }
// //if (nosalir) printf ("Result: PASS\n");
// else printf ("Result: FAIL\n");
printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
free(rv_cpu);
free(qv_cpu);
free(fv_cpu);
free(box_cpu);
time7 = get_time();
//======================================================================================================================================================150
// DISPLAY TIMING
//======================================================================================================================================================150
// printf("Time spent in different stages of the application:\n");
// printf("%15.12f s, %15.12f % : VARIABLES\n", (float) (time1-time0) / 1000000, (float) (time1-time0) / (float) (time7-time0) * 100);
// printf("%15.12f s, %15.12f % : INPUT ARGUMENTS\n", (float) (time2-time1) / 1000000, (float) (time2-time1) / (float) (time7-time0) * 100);
// printf("%15.12f s, %15.12f % : INPUTS\n", (float) (time3-time2) / 1000000, (float) (time3-time2) / (float) (time7-time0) * 100);
// printf("%15.12f s, %15.12f % : dim_cpu\n", (float) (time4-time3) / 1000000, (float) (time4-time3) / (float) (time7-time0) * 100);
// printf("%15.12f s, %15.12f % : SYS MEM: ALO\n", (float) (time5-time4) / 1000000, (float) (time5-time4) / (float) (time7-time0) * 100);
// printf("%15.12f s, %15.12f % : KERNEL: COMPUTE\n", (float) (time6-time5) / 1000000, (float) (time6-time5) / (float) (time7-time0) * 100);
// printf("%15.12f s, %15.12f % : SYS MEM: FRE\n", (float) (time7-time6) / 1000000, (float) (time7-time6) / (float) (time7-time0) * 100);
// printf("Total time:\n");
// printf("%.12f s\n", (float) (time7-time0) / 1000000);
//======================================================================================================================================================150
// RETURN
//======================================================================================================================================================150
return 0.0; // always returns 0.0
}
//========================================================================================================================================================================================================200
//======================================================================================================================================================150
//====================================================================================================100
//==================================================50
//========================================================================================================================================================================================================200
// UPDATE
//========================================================================================================================================================================================================200
// 14 APR 2011 Lukasz G. Szafaryn
//========================================================================================================================================================================================================200
// DEFINE/INCLUDE
//========================================================================================================================================================================================================200
//======================================================================================================================================================150
// LIBRARIES
//======================================================================================================================================================150
//#include "golden.h"
//float golden [1000][4];
#include <stdio.h> // (in path known to compiler) needed by printf
#include <stdlib.h> // (in path known to compiler) needed by malloc
#include <stdbool.h> // (in path known to compiler) needed by true/false
#include <math.h>
//======================================================================================================================================================150
// UTILITIES
//======================================================================================================================================================150
#include "./util/timer/timer.h" // (in path specified here)
#include "./util/num/num.h" // (in path specified here)
//======================================================================================================================================================150
// MAIN FUNCTION HEADER
//======================================================================================================================================================150
#include "./main.h" // (in the current directory)
//======================================================================================================================================================150
// KERNEL
//======================================================================================================================================================150
#include "./kernel/kernel_gpu_cuda_wrapper.h" // (in library path specified here)
//========================================================================================================================================================================================================200
// MAIN FUNCTION
//========================================================================================================================================================================================================200
int
main( int argc,
char *argv [])
{
printf("thread block size of kernel = %d \n", NUMBER_THREADS);
//======================================================================================================================================================150
// CPU/MCPU VARIABLES
//======================================================================================================================================================150
// timer
long long time0;
time0 = get_time();
// timer
long long time1;
long long time2;
long long time3;
long long time4;
long long time5;
long long time6;
long long time7;
// counters
int i, j, k, l, m, n;
// system memory
par_str par_cpu;
dim_str dim_cpu;
box_str* box_cpu;
FOUR_VECTOR* rv_cpu;
fp* qv_cpu;
FOUR_VECTOR* fv_cpu;
int nh;
time1 = get_time();
//======================================================================================================================================================150
// CHECK INPUT ARGUMENTS
//======================================================================================================================================================150
// assing default values
dim_cpu.boxes1d_arg = 1;
// go through arguments
for(dim_cpu.cur_arg=1; dim_cpu.cur_arg<argc; dim_cpu.cur_arg++){
// check if -boxes1d
if(strcmp(argv[dim_cpu.cur_arg], "-boxes1d")==0){
// check if value provided
if(argc>=dim_cpu.cur_arg+1){
// check if value is a number
if(isInteger(argv[dim_cpu.cur_arg+1])==1){
dim_cpu.boxes1d_arg = atoi(argv[dim_cpu.cur_arg+1]);
if(dim_cpu.boxes1d_arg<0){
printf("ERROR: Wrong value to -boxes1d parameter, cannot be <=0\n");
return 0;
}
dim_cpu.cur_arg = dim_cpu.cur_arg+1;
}
// value is not a number
else{
printf("ERROR: Value to -boxes1d parameter in not a number\n");
return 0;
}
}
// value not provided
else{
printf("ERROR: Missing value to -boxes1d parameter\n");
return 0;
}
}
// unknown
else{
printf("ERROR: Unknown parameter\n");
return 0;
}
}
// Print configuration
printf("Configuration used: boxes1d = %d\n", dim_cpu.boxes1d_arg);
time2 = get_time();
//======================================================================================================================================================150
// INPUTS
//======================================================================================================================================================150
par_cpu.alpha = 0.5;
time3 = get_time();
//======================================================================================================================================================150
// DIMENSIONS
//======================================================================================================================================================150
// total number of boxes
dim_cpu.number_boxes = dim_cpu.boxes1d_arg * dim_cpu.boxes1d_arg * dim_cpu.boxes1d_arg;
// how many particles space has in each direction
dim_cpu.space_elem = dim_cpu.number_boxes * NUMBER_PAR_PER_BOX;
dim_cpu.space_mem = dim_cpu.space_elem * sizeof(FOUR_VECTOR);
dim_cpu.space_mem2 = dim_cpu.space_elem * sizeof(fp);
// box array
dim_cpu.box_mem = dim_cpu.number_boxes * sizeof(box_str);
time4 = get_time();
//======================================================================================================================================================150
// SYSTEM MEMORY
//======================================================================================================================================================150
//====================================================================================================100
// BOX
//====================================================================================================100
// allocate boxes
box_cpu = (box_str*)malloc(dim_cpu.box_mem);
// initialize number of home boxes
nh = 0;
// home boxes in z direction
for(i=0; i<dim_cpu.boxes1d_arg; i++){
// home boxes in y direction
for(j=0; j<dim_cpu.boxes1d_arg; j++){
// home boxes in x direction
for(k=0; k<dim_cpu.boxes1d_arg; k++){
// current home box
box_cpu[nh].x = k;
box_cpu[nh].y = j;
box_cpu[nh].z = i;
box_cpu[nh].number = nh;
box_cpu[nh].offset = nh * NUMBER_PAR_PER_BOX;
// initialize number of neighbor boxes
box_cpu[nh].nn = 0;
// neighbor boxes in z direction
for(l=-1; l<2; l++){
// neighbor boxes in y direction
for(m=-1; m<2; m++){
// neighbor boxes in x direction
for(n=-1; n<2; n++){
// check if (this neighbor exists) and (it is not the same as home box)
if( (((i+l)>=0 && (j+m)>=0 && (k+n)>=0)==true && ((i+l)<dim_cpu.boxes1d_arg && (j+m)<dim_cpu.boxes1d_arg && (k+n)<dim_cpu.boxes1d_arg)==true) &&
(l==0 && m==0 && n==0)==false ){
// current neighbor box
box_cpu[nh].nei[box_cpu[nh].nn].x = (k+n);
box_cpu[nh].nei[box_cpu[nh].nn].y = (j+m);
box_cpu[nh].nei[box_cpu[nh].nn].z = (i+l);
box_cpu[nh].nei[box_cpu[nh].nn].number = (box_cpu[nh].nei[box_cpu[nh].nn].z * dim_cpu.boxes1d_arg * dim_cpu.boxes1d_arg) +
(box_cpu[nh].nei[box_cpu[nh].nn].y * dim_cpu.boxes1d_arg) +
box_cpu[nh].nei[box_cpu[nh].nn].x;
box_cpu[nh].nei[box_cpu[nh].nn].offset = box_cpu[nh].nei[box_cpu[nh].nn].number * NUMBER_PAR_PER_BOX;
// increment neighbor box
box_cpu[nh].nn = box_cpu[nh].nn + 1;
}
} // neighbor boxes in x direction
} // neighbor boxes in y direction
} // neighbor boxes in z direction
// increment home box
nh = nh + 1;
} // home boxes in x direction
} // home boxes in y direction
} // home boxes in z direction
//====================================================================================================100
// PARAMETERS, DISTANCE, CHARGE AND FORCE
//====================================================================================================100
// random generator seed set to random value - time in this case
// srand(time(NULL));
srand (0);
// input (distances)
rv_cpu = (FOUR_VECTOR*)malloc(dim_cpu.space_mem);
for(i=0; i<dim_cpu.space_elem; i=i+1){
rv_cpu[i].v = (rand()%10 + 1) / 10.0; // get a number in the range 0.1 - 1.0
rv_cpu[i].x = (rand()%10 + 1) / 10.0; // get a number in the range 0.1 - 1.0
rv_cpu[i].y = (rand()%10 + 1) / 10.0; // get a number in the range 0.1 - 1.0
rv_cpu[i].z = (rand()%10 + 1) / 10.0; // get a number in the range 0.1 - 1.0
}
// input (charge)
qv_cpu = (fp*)malloc(dim_cpu.space_mem2);
for(i=0; i<dim_cpu.space_elem; i=i+1){
qv_cpu[i] = (rand()%10 + 1) / 10.0; // get a number in the range 0.1 - 1.0
}
// output (forces)
fv_cpu = (FOUR_VECTOR*)malloc(dim_cpu.space_mem);
for(i=0; i<dim_cpu.space_elem; i=i+1){
fv_cpu[i].v = 0; // set to 0, because kernels keeps adding to initial value
fv_cpu[i].x = 0; // set to 0, because kernels keeps adding to initial value
fv_cpu[i].y = 0; // set to 0, because kernels keeps adding to initial value
fv_cpu[i].z = 0; // set to 0, because kernels keeps adding to initial value
}
time5 = get_time();
//======================================================================================================================================================150
// KERNEL
//======================================================================================================================================================150
//====================================================================================================100
// GPU_CUDA
//====================================================================================================100
kernel_gpu_cuda_wrapper(par_cpu,
dim_cpu,
box_cpu,
rv_cpu,
qv_cpu,
fv_cpu);
time6 = get_time();
//======================================================================================================================================================150
// SYSTEM MEMORY DEALLOCATION
//======================================================================================================================================================150
// dump res componete ults
#ifdef OUTPUT
FILE *fptr;
fptr = fopen("result.txt", "w");
for(i=0; i<dim_cpu.space_elem; i=i+1){
fprintf(fptr, "%.10f, %.10f, %.10f, %.10f\n", fv_cpu[i].v, fv_cpu[i].x, fv_cpu[i].y, fv_cpu[i].z);
}
fclose(fptr);
#endif
int correct=true;
#pragma omp parallel for shared(golden,fv_cpu, correct)
for (int i = 0; i < (int) (dim_cpu.space_elem); i++) {
// float abs_err = fabs(h_C[i] - float(dimsA.x * valB));
// float dot_length = dimsA.x;
// float abs_val = fabs(h_C[i]);
// float rel_err = abs_err / abs_val / dot_length;
//
//
int semicorrect=0;
double eps = 1.e-6;
float golden[4];
FILE *fptrr;
fptrr = fopen("result.txt", "r");
fscanf(fptrr, "%.10f, %.10f, %.10f, %.10f\n", golden,golden+1, golden+2,golden+3);
semicorrect=(fabs(fv_cpu[i].v-golden[0]) <= eps);
// printf ("V %f golden %f dif %f Error %d\n",fv_cpu[i].v,golden[0],fabs(fv_cpu[i].v-golden[0]),semicorrect );
semicorrect&=(fabs(fv_cpu[i].x-golden[1])<= eps);
// printf ("X %f golden %f dif %f Error %d\n",fv_cpu[i].x,golden[1],fabs(fv_cpu[i].x-golden[1]) ,semicorrect );
semicorrect&=(fabs(fv_cpu[i].y-golden[2])<=eps);
//printf ("Y %f golden %f dif %f Error %d \n",fv_cpu[i].y,golden[2],fabs(fv_cpu[i].y-golden[2]) ,semicorrect );
semicorrect&=(fabs(fv_cpu[i].z-golden[3])<=eps);
//printf ("Z %f golden %f dif %f Error %d\n",fv_cpu[i].z,golden[3],fabs(fv_cpu[i].z-golden[3]) ,semicorrect );
if (!semicorrect) {
// printf("Error! En la componete %05d \n", i
);
#pragma omp critical
{
correct = false;
}
}
}
// i=0;
// int nosalir=1;
// while (nosalir && (i<dim_cpu.space_elem))
// {
// nosalir=(fv_cpu[i].v ==golden[i][0]);
// nosalir&=(fv_cpu[i].x==golden[i][1]);
// nosalir&=(fv_cpu[i].y==golden[i][2]);
// nosalir&=(fv_cpu[i].z==golden[i][3]);
// i++;
// }
// //if (nosalir) printf ("Result: PASS\n");
// else printf ("Result: FAIL\n");
printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
free(rv_cpu);
free(qv_cpu);
free(fv_cpu);
free(box_cpu);
time7 = get_time();
//======================================================================================================================================================150
// DISPLAY TIMING
//======================================================================================================================================================150
// printf("Time spent in different stages of the application:\n");
// printf("%15.12f s, %15.12f % : VARIABLES\n", (float) (time1-time0) / 1000000, (float) (time1-time0) / (float) (time7-time0) * 100);
// printf("%15.12f s, %15.12f % : INPUT ARGUMENTS\n", (float) (time2-time1) / 1000000, (float) (time2-time1) / (float) (time7-time0) * 100);
// printf("%15.12f s, %15.12f % : INPUTS\n", (float) (time3-time2) / 1000000, (float) (time3-time2) / (float) (time7-time0) * 100);
// printf("%15.12f s, %15.12f % : dim_cpu\n", (float) (time4-time3) / 1000000, (float) (time4-time3) / (float) (time7-time0) * 100);
// printf("%15.12f s, %15.12f % : SYS MEM: ALO\n", (float) (time5-time4) / 1000000, (float) (time5-time4) / (float) (time7-time0) * 100);
// printf("%15.12f s, %15.12f % : KERNEL: COMPUTE\n", (float) (time6-time5) / 1000000, (float) (time6-time5) / (float) (time7-time0) * 100);
// printf("%15.12f s, %15.12f % : SYS MEM: FRE\n", (float) (time7-time6) / 1000000, (float) (time7-time6) / (float) (time7-time0) * 100);
// printf("Total time:\n");
// printf("%.12f s\n", (float) (time7-time0) / 1000000);
//======================================================================================================================================================150
// RETURN
//======================================================================================================================================================150
return 0.0; // always returns 0.0
}
tam=$(cat $1|wc -l )
echo "double golden["${tam}"][4]={"
#for i in $1 do
# echo "{"${i}"}"
#done
sep=
while IFS= read -r line
do
## take some action on $line
echo $sep"{$line}"
sep=,
done < $1
echo "};"
......@@ -180,6 +180,7 @@ lud_verify(float *m, float *lu, int matrix_dim){
cmp_res(m,tmp,matrix_dim);
free(tmp);
}
void
cmp_res (float *m, float *tmp, int matrix_dim){
int i,j;
......
......@@ -22,7 +22,7 @@ injectionSite = INST_OUT
# Max time factor to finish the app, this will be multiplied by the application running time
# For example if your app spend 2s, and the maxWaitTimes is 5, the max running time before it is
# Considered as a crash is 10s
maxWaitTimes = 5
maxWaitTimes = 50
# binary file of the application
# Must be full path
......@@ -36,7 +36,7 @@ benchmarkArgs = -s 10000 -v
benchmarkArgs_noverificar = -s 10000
# CSV output file. It will be overwrite at each injection
csvFile = results/lud_IO.csv
csvFile = results/lud_IO_original_2.csv
# You should create a script on the benchmark source folder to verify GOLD_OUTPUT x INJ_OUTPUT
goldenCheckScript = codes/lud/sdc_check.sh
......
......@@ -17,9 +17,9 @@ faultModel = 0
# RF -> Register File
# INST_OUT -> Instruction Output (NOT IMPLEMENTED YET)
# INST_composed -> Instruction Adress (NOT IMPLEMENTED YET)
#injectionSite = RF
injectionSite = RF
injectionSite = INST_OUT
#injectionSite = INST_OUT
# Max time factor to finish the app, this will be multiplied by the application running time
# For example if your app spend 2s, and the maxWaitTimes is 5, the max running time before it is
# Considered as a crash is 10s
......
......@@ -16,7 +16,7 @@ grep -q "Result = FAIL" ${INJ_OUTPUT_PATH} >> ${DIFF_LOG}
# diff stderr
diff -B ${INJ_ERR_PATH} ${GOLD_ERR_PATH} > ${DIFF_ERR_LOG}
echo $pwd
ls -l result*
diff ./result.txt codes/nw/gold/result.1.txt >> ${INJ_OUTPUT_PATH}
#diff ./result.txt ~/rodinia_3.1/cuda/nw/gold/result.1.txt #>> ${INJ_OUTPUT_PATH}
......
......@@ -122,7 +122,6 @@ def remove_useless_information_from_output(output_file_path):
with open(output_file_path, 'w') as ofp:
ofp.writelines(ok_output_lines)
"""
Show output function
to allow pretty printing
......
......@@ -11,9 +11,9 @@ MAX_TIMES_TO_PROFILE = 2
# Log path to store all injections info
LOGS_PATH = 'logs/'
FILE_PID_PIF= "/pid_program_injected"
TMP_PATH=' tmpxxx/'
INDIRECT_KERNEL=False
HIGHPRECISION= True
# Internal python scripts
FLIP_SCRIPT = 'flip_value.py'
PROFILER_SCRIPT = 'profiler.py'
......@@ -30,11 +30,11 @@ DEBUG_PROFILER = True
NUM_DIVISION_TIMES = 100.0
# Common body of log filename
#SIGNALSTOP= "USR1"
SIGNAL_STOP=signal.SIGRTMIN
SIGNAL_STOP= signal.SIGUSR1
#SIGNAL_STOP=signal.SIGRTMIN
#SIGNALEXIT= "USR2"
SIGNAL_EXIT=signal.SIGRTMIN+1
SIGNAL_EXIT= signal.SIGUSR2
#SIGNAL_EXIT=signal.SIGRTMIN+1
# MAX INT 32 bits
MAX_INT_32 = 4294967295
......@@ -83,11 +83,16 @@ POSSIBLE_USELESS_GDB_OUTPUT_PATTERNS = [
RF = 0
INST_OUT = 1
INST_ADD = 2
INST_OUT_ORI=3
INST_OUT_V1=4
INST_OUT_V2=5
INJECTION_SITES = {
'RF': RF,
'INST_OUT': INST_OUT,
'INST_ADD': INST_ADD
'INST_ADD': INST_ADD,
'INST_OUT_ORI':INST_OUT_ORI,
'INST_OUT_V1':INST_OUT_V1,
'INST_OUT_V2':INST_OUT_V2,
}
# Which fault model to use, 0 -> single; 1 -> double;
......
......@@ -320,7 +320,7 @@ def check_injection_outcome(host_thread, logging, injection_site):
dpc['relativo']=assm_line.split('<')[1].split('>')[0]
#print("---PC: "+dpc['absoluto']+ "PC rel"+dpc['relativo'])
pc=dpc['absoluto']+"<"+dpc['relativo']+">"
if cp.INJECTION_SITES[injection_site] in [cp.INST_OUT, cp.INST_ADD]:
if cp.INJECTION_SITES[injection_site] in [cp.INST_OUT, cp.INST_OUT_ORI,cp.INST_OUT_V1,cp.INST_OUT_V2,cp.INST_ADD]:
# if fault was injected ASSM_LINE MUST be in the logfile
instruction = re.match(r".*:\t(\S+) .*", assm_line).group(1)
......@@ -420,6 +420,9 @@ def gdb_inject_fault(**kwargs):
if cp.DEBUG:
cf.printf("THREAD {} PROCESSES SPAWNED".format(host_thread))
signal_app_thread.join(timeout=300)
cf.printf("THREAD {} PROCESS JOINED".format(signal_app_thread))
# Start counting time
timestamp_start = int(time.time())
......@@ -433,15 +436,15 @@ def gdb_inject_fault(**kwargs):
cf.printf("THREAD {} FINISH CHECK OK".format(host_thread))
# finishing and removing thrash
fi_process.join()
fi_process.join(timeout=300)
# fi_process.terminate()
signal_app_thread.join()
cf.printf("THREAD {} PROCESS JOINED".format(fi_process))
# Get the signal init wait time before destroy the thread
signal_init_wait_time = signal_app_thread.get_int_wait_time()
print(1)
del fi_process, signal_app_thread
print(2)
if cp.DEBUG:
cf.printf("THREAD {} PROCESS JOINED".format(host_thread))
......@@ -557,7 +560,7 @@ by sending a SIGINT signal to the application
def fault_injection_by_signal(**kwargs):
# Global rows list
global lock, exit_injector,num_rounds,kill_strings,crashsystem
global lock, exit_injector,num_rounds,kill_strings,crashsystem,mode
benchmark_binary = kwargs.get('benchmark_binary')
#kwargs['signal_cmd'] = "killall -2 {}".format(os.path.basename(benchmark_binary))
kwargs['signal_cmd'] = "pgrep {}".format(os.path.basename(benchmark_binary))
......@@ -574,14 +577,16 @@ def fault_injection_by_signal(**kwargs):
for fault_model in fault_models:
# Execute iterations number of fault injection for a specific app
print("================")
num_rounds = 1
try:
print(num_rounds)
ret_profiler = cf.load_config_file("tmpxxx_num_rounds.conf")
num_rounds=int(ret_profiler.get('DEFAULT', 'Ocurrencias'))
print(num_rounds)
os.system ("rm tmpxxx_num_rounds.conf")
#print("+++".format(num_rounds))
ret_profiler = cf.load_config_file("tmpxxx/num_rounds.conf")
num_rounds=int(ret_profiler.get('DEFAULT', 'Ocurrencias'))
#print("++++".format(num_rounds))
os.system ("rm tmpxxx/num_rounds.conf")
except:
num_rounds = 1
print("Error al acceso num_rounds.conf")
print(num_rounds)
print("================")
while num_rounds <= iterations:
......@@ -637,10 +642,10 @@ def fault_injection_by_signal(**kwargs):
for th in gpus_threads:
try:
th.join()
th.join(timeout=300)
except:
nulo=1
f=open("tmpxxx_num_rounds.conf","w")
f=open("tmpxxx/num_rounds.conf","w")
f.write("[DEFAULT] \nOcurrencias = "+str(num_rounds)+"\n")
f.close()
......@@ -653,7 +658,7 @@ Main function
"""
def main():
global kill_strings, current_path, gpus_threads, lock, syncro, wait_finish
global kill_strings, current_path, gpus_threads, lock, syncro, wait_finish,mode
signal.signal(cp.SIGNAL_STOP,receiveSignal);
signal.signal(cp.SIGNAL_EXIT,receiveEnd);
......@@ -707,7 +712,7 @@ def main():
'new_value', 'inj_site', 'fault_successful', 'hang', 'crash', 'masked', 'sdc', 'Exception','time',
'inj_time_location', 'bits_flipped', 'instruction', 'pc', 'user_defined']
if os.path.exists("tmpxxx_num_rounds.conf"):
if os.path.exists("tmpxxx/num_rounds.conf"):
mode='a'
else:
mode='w'
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment