from.base_row import BaseRow,Source,Traffic
from apm_helpers.justifiedbool import JustifiedBool
from apm_helpers.messages import Messages as msg
NonOffloadReasons=msg.NonOffload.Reasons
from debug import myprint
from.convert import memsize_string_to_bytes
from functools import reduce
from collections import defaultdict
import math
# Number of sampling intervals a kernel's average time is compared against in
# check_rows_time: kernels shorter than GPU_SAMPLES_REQUIRED intervals are reported.
GPU_SAMPLES_REQUIRED=10
# Threshold the computed desired sampling interval is compared against in
# check_rows_time to pick which message is printed.
# NOTE(review): presumably seconds (values are scaled by 1e3 before being reported) - confirm.
MIN_GPU_SAMPLING_INTERVAL=1e-4
def get_sum_metrics(row, type_, *metrics):
    """Sum the named metrics of ``row`` after converting each with ``type_``.

    Metrics that are absent from ``row`` or whose value is falsy (for example
    an empty string) are ignored.

    Args:
        row: Mapping-like object supporting ``in`` and item access.
        type_: Callable used to convert each raw value (e.g. ``float``).
        *metrics: Names of the metrics to add up.

    Returns:
        The sum of the converted values; ``0`` when no metric qualifies and
        ``0.0`` when ``row`` itself is falsy.
    """
    if not row:
        return 0.0
    total = 0
    for name in metrics:
        if name in row and row[name]:
            total += type_(row[name])
    return total
class GpuRow(BaseRow):
    """Row describing a single GPU kernel ("computing task").

    The constructor copies and converts metrics of the incoming ``row`` into
    ``self._data`` and fills the ``'local'`` and ``'global'`` entries of
    ``self._traffic``. Instruction-mix records can be applied afterwards via
    :meth:`apply_instruction_mix`.
    """

    # Maps the 'operand_type' code of an instruction-mix record to a data type
    # name; unknown codes are reported as 'other' by the callers below.
    DATA_TYPE = {'2': 'float', '3': 'int'}

    # Maps the 'instruction_class' code of an instruction-mix record to
    # (operation kind, memory kind); unknown codes map to ('other', '').
    OP_TYPE = {
        '2': ('basic', ''),
        '3': ('fma', ''),
        '4': ('math', ''),
        '5': ('div', ''),
        '6': ('pow', ''),
        '7': ('send', 'global'),
        '8': ('send', 'global'),
        '9': ('send', 'local'),
        '10': ('send', 'local'),
        '11': ('mov', ''),
        '12': ('bitwise', ''),
        '15': ('atomic', 'global'),
        '16': ('atomic', 'local'),
    }

    @staticmethod
    def get_key(row):
        """Return the key of ``row``: its ``'computing_task_id'`` value."""
        return row['computing_task_id']

    def __init__(self, row, key, minor_rows):
        """Populate the row data and memory traffic from ``row``.

        Args:
            row: Mapping-like source row with the kernel metrics.
            key: Key of this row; stored as ``key_column`` and ``unique_index``.
            minor_rows: Forwarded unchanged to the base class constructor.
        """
        super().__init__(row, key, minor_rows)

        # Fixed attributes of every GPU kernel row.
        self._data['key_column'] = key
        self._data['unique_index'] = key
        self._data['type'] = 'GPU kernel'
        self._data['source_location'] = 'GPU ' + row['computing_task']
        self._data['module'] = 'JIT'
        self._data['loop_height'] = ''
        self._data['function_type'] = ''
        self._data['dependency_type'] = 'Parallel: GPU Kernel'
        self._data['dependency_key'] = 'parallel_gpu'

        # Values copied verbatim from the source row (mandatory keys).
        for t, s in (
            ('function_call_sites_and_loops', 'computing_task'),
            ('location', 'computing_task'),
            ('loop_function_id', 'computing_task'),
            ('module_id', 'module_module_id'),
        ):
            self._data[t] = row[s]

        # Converted values; a missing or unparsable source falls back to the default.
        for t, s, type_, default_value in (
            ('total_elapsed_time', 'computing_task_total_time', float, 0.0),
            ('self_elapsed_time', 'computing_task_total_time', float, 0.0),
            ('total_time', 'computing_task_total_time', float, 0.0),
            ('self_time', 'computing_task_total_time', float, 0.0),
            ('total_time_per_largest_thread', 'computing_task_total_time', float, 0.0),
            ('self_time_per_largest_thread', 'computing_task_total_time', float, 0.0),
            ('call_count', 'computing_task_instance_count', float, 1.0),
            ('instance_count', 'computing_task_instance_count', float, 1.0),
            ('kernel_average_time', 'computing_task_average_time', float, 0.0),
            ('total_gflop', 'gpu_compute_performance_gflop', float, 0.0),
            ('total_ai', 'gpu_compute_performance_fp_ai', float, 0.0),
            ('global_atomic_accesses', 'gpu_shader_atomics', int, 0),
            ('barrier_accesses', 'gpu_shader_barriers', int, 0),
            ('simd_width', 'computing_task_simd_width', int, 0),
        ):
            try:
                self._data[t] = type_(row[s])
            except (KeyError, ValueError):
                self._data[t] = default_value

        # total_time defaults to 0.0 above, so guard the division instead of
        # raising ZeroDivisionError for kernels without timing data.
        total_time = self._data['total_time']
        self._data['total_gflops'] = (
            self._data['total_gflop'] / total_time if total_time else 0.0
        )

        # Work sizes are only parsed when a single global size ("AxBxC") is given.
        # NOTE(review): when 'work_size_global' holds several ';'-separated
        # sizes, 'global_size'/'local_size' and the iteration counts are not
        # assigned here; presumably the base class provides them - confirm.
        if len(row['work_size_global'].split(';')) == 1:
            work_global_list = tuple(int(x) for x in row['work_size_global'].split('x'))
            work_items = reduce(lambda x, y: x * y, work_global_list, 1)
            executions = work_items * self._data['instance_count']
            self._data['trip_count_total'] = executions
            self._data['execution_count'] = executions
            self._data['source_execution_count'] = executions
            self._data['trip_count_average'] = work_items
            self._data['source_iterations'] = work_items
            # Left-pad with 1s and keep exactly three dimensions.
            self._data['global_size'] = ((1, 1, 1) + work_global_list)[-3:]
            if len(row['work_size_local']) > 0:
                local_dims = tuple(int(x) for x in row['work_size_local'].split('x'))
                self._data['local_size'] = ((1, 1, 1) + local_dims)[-3:]
            else:
                self._data['local_size'] = None

        # Drop leading dimensions that are 1 in both the global and local size.
        while (self._data['global_size'] and self._data['local_size']
               and self._data['global_size'][0] == 1
               and self._data['local_size'][0] == 1):
            self._data['global_size'] = self._data['global_size'][1:]
            self._data['local_size'] = self._data['local_size'][1:]
        self._data['vector_length'] = 1

        # Local-memory traffic. The '*_gb_*' metrics are scaled by 1e9
        # (presumably gigabytes to bytes - confirm).
        self._traffic['local'] = local_traffic = Traffic()
        local_traffic.carm = (
            get_sum_metrics(row, float, 'shared_local_memory_data_transferred_gb_read') * 1e9,
            get_sum_metrics(row, float, 'shared_local_memory_data_transferred_gb_write') * 1e9,
        )
        try:
            local_traffic.footprint = (int(row['computing_task_local_memory_size']), 0)
        except (KeyError, ValueError):
            # Same tolerance as the generic metric loop above: a missing or
            # unparsable size means "no local footprint".
            local_traffic.footprint = (0, 0)
        local_traffic.by_level.append(local_traffic.carm)

        # GPU-memory (GTI) traffic and its read/write split.
        gti_traffic = (
            get_sum_metrics(row, float, 'gpu_memory_data_transferred_gb_read') * 1e9,
            get_sum_metrics(row, float, 'gpu_memory_data_transferred_gb_write') * 1e9,
        )
        gti_traffic_total = sum(gti_traffic)
        if gti_traffic_total > 0:
            gti_rw_ratio = tuple(t / gti_traffic_total for t in gti_traffic)
        else:
            gti_rw_ratio = (0.5, 0.5)

        # L3 traffic: typed + untyped transfers minus the local-memory part.
        l3_gb_read = get_sum_metrics(
            row, float,
            'typed_memory_data_transferred_gb_read',
            'untyped_memory_data_transferred_gb_read',
        )
        l3_gb_write = get_sum_metrics(
            row, float,
            'typed_memory_data_transferred_gb_write',
            'untyped_memory_data_transferred_gb_write',
        )
        l3_gb_total = get_sum_metrics(row, float, 'l3_shader_data_transferred_gb')
        l3_rd = max(l3_gb_read * 1e+9 - local_traffic.carm[0], 0)
        l3_wr = max(l3_gb_write * 1e+9 - local_traffic.carm[1], 0)
        l3_total = l3_gb_total * 1e+9
        if l3_rd == 0 and l3_wr == 0:
            # No read/write breakdown available: split the total like GTI traffic.
            l3_rd = l3_total * gti_rw_ratio[0]
            l3_wr = l3_total * gti_rw_ratio[1]
            l3_rw_ratio = gti_rw_ratio
        else:
            # Here l3_rd + l3_wr > 0, so l3_total is positive after the max().
            l3_total = max(l3_total, l3_rd + l3_wr)
            if l3_rd + l3_wr != l3_total:
                # Attribute the unexplained remainder to reads.
                l3_rd = l3_total - l3_wr
            l3_rw_ratio = (l3_rd / l3_total, l3_wr / l3_total)

        self._traffic['global'] = global_traffic = Traffic()
        global_traffic.by_level.append((l3_rd, l3_wr))
        global_traffic.by_level.append(gti_traffic)
        carm_traffic = get_sum_metrics(row, float, 'carm_traffic_gb') * 1e9
        if l3_total:
            global_traffic.carm = tuple(carm_traffic * r for r in l3_rw_ratio)
        else:
            global_traffic.carm = (0, 0)
        self._data['total_l3_traffic'] = sum(global_traffic.by_level[0])
        self._data['total_gti_traffic'] = sum(global_traffic.by_level[1])

        self._is_bottomup = False
        self._basic_blocks = []
        self._source = Source()
        self._is_executed = True
        self._is_loop = True
        self._is_offload_candidate = JustifiedBool(True)
        self._lazy_items = {
            'memory_footprint': {
                'together': ['memory_footprint'],
                'getter': self._get_memory_footprint,
            },
        }

    def set_aggregated_subitems(self, subitems):
        """Delegate to the base class and mark the row as a non-offload candidate."""
        super().set_aggregated_subitems(subitems)
        self._is_offload_candidate = JustifiedBool(False, NonOffloadReasons.SOURCE_TASK)

    def apply_instruction_mix(self, instruction_mix):
        """Apply ``instruction_mix`` records to the mix, counters and atomics."""
        self.set_instruction_mix(instruction_mix)
        self.set_instruction_counters(instruction_mix)
        self.set_atomic_counters(instruction_mix)

    def set_instruction_mix(self, instruction_mix):
        """Accumulate normalized instruction counts into ``self._ins_mix``.

        For every count source a mapping keyed by
        ``(operation kind, data type, operand size, memory kind)`` is filled.
        'send'/'atomic' records with a non-zero call count are normalized by
        the number of issued threads; everything else is normalized by the
        source execution count. Records with missing or unparsable fields are
        skipped.

        Args:
            instruction_mix: Iterable of mapping-like instruction records.
        """
        instruction_count_source_list = [
            'operations_count_gop',
            'dynamic_instruction_count',
            'executed_instruction_count',
            'callcount',
        ]
        for x in instruction_count_source_list:
            if x not in self._ins_mix:
                self._ins_mix[x] = defaultdict(lambda: 0.0)

        # simd_width defaults to 0 in __init__; treat that like "no threads
        # issued" instead of dividing by zero.
        simd_width = self._data['simd_width']
        source_execution_count = self._data['source_execution_count']
        if not simd_width or not source_execution_count:
            return
        threads_issued = (
            math.ceil(self._data['source_iterations'] / simd_width)
            * self._data['instance_count']
        )
        if not threads_issued:
            return

        for ins in instruction_mix:
            op_type = GpuRow.OP_TYPE.get(ins['instruction_class'], ('other', ''))
            try:
                key = (
                    op_type[0],
                    GpuRow.DATA_TYPE.get(ins['operand_type'], 'other'),
                    int(ins['operand_size']) if ins['operand_size'] else None,
                    op_type[1],
                )
                if op_type[0] in ('send', 'atomic') and int(ins['callcount']):
                    # Memory operations use the same per-thread count for every source.
                    count = float(ins['callcount']) / threads_issued
                    for source_key in instruction_count_source_list:
                        self._ins_mix[source_key][key] += count
                else:
                    for source_key in instruction_count_source_list:
                        count = float(ins[source_key]) / source_execution_count
                        if source_key == 'operations_count_gop':
                            count *= 1e+9
                        if source_key == 'operations_count_gop' and op_type[0] == 'fma':
                            count /= 2
                        if count > 0:
                            self._ins_mix[source_key][key] += count
            except (KeyError, ValueError):
                # Best effort: ignore records with missing/invalid fields.
                pass

    def set_instruction_counters(self, instruction_mix):
        """Add the call counts of selected operation kinds to ``self._data``.

        The target counter for each operation kind is given by
        ``set_instruction_counters.mapping``; counters that do not exist yet
        are initialized to ``0.0``.
        """
        for k in GpuRow.set_instruction_counters.mapping.values():
            if k not in self._data:
                self._data[k] = 0.0
        for item in instruction_mix:
            op_type = GpuRow.OP_TYPE.get(item['instruction_class'], ('other', ''))
            ins_class = GpuRow.set_instruction_counters.mapping.get(op_type[0])
            if ins_class:
                count = item['callcount']
                if count:
                    self._data[ins_class] += float(count)

    # Operation kind -> name of the counter in self._data it contributes to.
    set_instruction_counters.mapping = {
        'fma': 'total_fma_count',
        'math': 'total_4c_compute',
        'div': 'total_8c_compute',
        'pow': 'total_8c_compute',
    }

    def set_atomic_counters(self, instruction_mix):
        """Collect per-record statistics of atomic instructions.

        Each atomic record is appended to ``self._atomic_counts`` as a dict;
        records whose fields cannot be converted are skipped. Afterwards
        ``atomic_send_count`` is set to the sum of all collected call counts.
        """
        for ins in instruction_mix:
            op_type = GpuRow.OP_TYPE.get(ins['instruction_class'], ('other', ''))
            if op_type[0] != 'atomic':
                continue
            try:
                res = {
                    'data_type': GpuRow.DATA_TYPE.get(ins['operand_type'], 'other'),
                    'mem_type': op_type[1],
                }
                for dst, src, type_, default in (
                    ('call_count', 'callcount', int, 0),
                    ('op_count', 'dynamic_instruction_count', int, 0),
                    ('exec_count', 'executed_instruction_count', int, 0),
                    ('data_size', 'operand_size', int, 32),
                ):
                    try:
                        res[dst] = type_(ins[src])
                    except KeyError:
                        # Only a missing field falls back to the default; an
                        # unparsable one drops the whole record (outer except).
                        res[dst] = default
                if res['exec_count'] and res['call_count']:
                    res['exec_size'] = res['exec_count'] / res['call_count']
                    res['static_count'] = 1
                else:
                    res['exec_size'] = None
                    res['static_count'] = 0
                self._atomic_counts.append(res)
            except (ValueError, TypeError):
                pass
        self._data['atomic_send_count'] = sum(x['call_count'] for x in self._atomic_counts)

    def _get_memory_footprint(self):
        """Return ``{'memory_footprint': total}`` summed over ``self.memory_objects``."""
        memory_footprint = sum(int(memory_obj['size']) for memory_obj in self.memory_objects)
        myprint(msg.DEBUG_ESTIMATED_FOOTPRINT.format(
            self._data['location'], self._data['key_column'], memory_footprint))
        return {'memory_footprint': memory_footprint}

    def _fill_call_stack_based_metrics(self, row):
        """Intentionally does nothing for GPU rows."""
        pass

    @staticmethod
    def filter_children(row):
        """Return every child of ``row`` paired with an empty list."""
        return [(x, []) for x in row.children]

    def select_instruction_mix(self, compute_time_source):
        """Select the instruction mix for ``compute_time_source``.

        When no mix is available for the requested source, a diagnostic is
        recorded and ``'operations_count_gop'`` is used instead.

        Raises:
            KeyError: If ``self._ins_mix`` has no entry for the selected source.
        """
        if not self._ins_mix[compute_time_source] and compute_time_source != 'operations_count_gop':
            self._diagnostics.append(msg.Diagnostics.DYNAMIC_OP_COUNT_USED)
            myprint(msg.DEBUG_DETAILED_INSTRUCTION_MIX_NOT_FOUND.format(self, compute_time_source))
            compute_time_source = 'operations_count_gop'
        self._instruction_mix = self._ins_mix[compute_time_source]
def get_baseline_cache_sizes(surv_cfg_info, gpu_device_props):
    """Build the per-memory-type tuples of baseline cache level sizes.

    Args:
        surv_cfg_info: Mapping with the context variables ``gpuEuCount`` (or
            ``gpuVectorEngineTotalCount``), ``gpuPciDeviceId``, ``gpuL3Size``
            and ``gpuLLCSize``.
        gpu_device_props: Mapping from a device id string formatted as
            ``0xXXXX`` to a properties mapping (``'l3_size'``, optional
            ``'tiles'``).

    Returns:
        ``{'local': (virtual,), 'global': (virtual, l3[, llc])}`` where
        ``virtual`` is the EU count times 32 and the LLC entry is present only
        when its size is non-zero.

    Raises:
        KeyError: If a required context variable could not be converted
            (a message is printed first).
    """
    def _int_or_zero(raw):
        return int(raw or 0)

    lookup_plan = (
        ('gpuEuCount', ('gpuEuCount', 'gpuVectorEngineTotalCount'), int),
        ('gpuPciDeviceId', ('gpuPciDeviceId',), int),
        ('gpuL3Size', ('gpuL3Size',), _int_or_zero),
        ('gpuLLCSize', ('gpuLLCSize',), _int_or_zero),
    )

    context_vars = {}
    for target, candidates, convert in lookup_plan:
        try:
            raw_value = None
            # First candidate key with a truthy value wins.
            for candidate in candidates:
                raw_value = surv_cfg_info.get(candidate)
                if raw_value:
                    break
            context_vars[target] = convert(raw_value)
        except TypeError:
            # Reports the last candidate key that was tried.
            myprint(msg.ERROR_NO_VALUE_IN_CONTEXT_VARIABLE.format(candidate), severity=4)

    virtual_size = context_vars['gpuEuCount'] * 32
    l3_size = context_vars['gpuL3Size']
    llc_size = context_vars['gpuLLCSize']
    device_id = '0x{0:04X}'.format(context_vars['gpuPciDeviceId'])

    device_props = gpu_device_props.get(device_id)
    if device_props:
        if l3_size == 0:
            # Fall back to the size from the device property table.
            l3_size = int(memsize_string_to_bytes(device_props['l3_size']))
        # True division: the L3 size becomes a float for known devices.
        l3_size /= device_props.get('tiles', 1)

    global_levels = [virtual_size, l3_size]
    if llc_size != 0:
        global_levels.append(llc_size)
    return {
        'local': (virtual_size,),
        'global': tuple(global_levels),
    }
def check_rows_time(rows, surv_cfg_info, filter_threshold, gpu_sampling_interval):
    """Report kernels that are too short for the current GPU sampling interval.

    A row is considered when its ``'total_time'`` exceeds ``filter_threshold``
    and its ``'kernel_average_time'`` is below ``GPU_SAMPLES_REQUIRED``
    sampling intervals. For each such row a debug message is printed; if the
    smallest desired interval is below ``gpu_sampling_interval`` a final info
    message is printed as well.

    Args:
        rows: Iterable of mapping-like rows providing ``'total_time'``,
            ``'kernel_average_time'`` and ``'function_call_sites_and_loops'``.
        surv_cfg_info: Unused; kept for interface compatibility.
        filter_threshold: Rows at or below this total time are ignored.
        gpu_sampling_interval: Sampling interval currently in use.

    Returns:
        None.
    """
    desired_sampling_interval = float('inf')
    for row in rows:
        if not row['total_time'] > filter_threshold:
            continue
        average_time = row['kernel_average_time']
        if average_time <= 0:
            # No usable timing for this kernel (0.0 is the default for a
            # missing value); log10 would raise a math domain error.
            continue
        if not average_time < GPU_SAMPLES_REQUIRED * gpu_sampling_interval:
            continue

        # Largest power of ten that still yields GPU_SAMPLES_REQUIRED samples.
        row_desired_sampling_interval = 10 ** math.floor(
            math.log10(average_time / GPU_SAMPLES_REQUIRED))
        if row_desired_sampling_interval < desired_sampling_interval:
            desired_sampling_interval = row_desired_sampling_interval
        params = {
            'kernel': row['function_call_sites_and_loops'],
            'time': round(average_time * 1e3, 3),
            't_sample': round(gpu_sampling_interval * 1e3, 3),
            'expected_time': round(GPU_SAMPLES_REQUIRED * MIN_GPU_SAMPLING_INTERVAL * 1e3, 3),
            'expected_t_sample': round(MIN_GPU_SAMPLING_INTERVAL * 1e3, 3),
            'min_samples': GPU_SAMPLES_REQUIRED,
        }
        if desired_sampling_interval >= MIN_GPU_SAMPLING_INTERVAL:
            myprint(msg.DEBUG_DECREASE_SAMPLING_INTERVAL.format(**params))
        else:
            myprint(msg.DEBUG_N_GPU_SAMPLES_REQUIRED.format(**params))

    # desired_sampling_interval is finite only if at least one row was
    # reported above, so params is always bound here (it describes the last
    # reported row).
    if desired_sampling_interval < gpu_sampling_interval:
        myprint(msg.INFO_CHANGE_GPU_SAMPLING_INTERVAL.format(**params), severity=2)
def get_compute_task_weight(rows, metric):
    """Compute each row's share of ``metric`` within its ``'module_id'`` group.

    Rows lacking ``'module_id'`` or ``metric``, or whose metric value cannot
    be converted to ``float``, are skipped entirely.

    Args:
        rows: Iterable of hashable, mapping-like rows.
        metric: Name of the metric to weight by.

    Returns:
        Dict mapping each usable row to ``value / group_total``; when the
        group total is zero the value is divided by the number of rows in the
        group instead.
    """
    kernel_id_name = 'module_id'
    total_vals = {}
    counts = {}
    # Parsed once and reused, so a row skipped while accumulating totals is
    # also skipped when weights are assigned (previously a row without
    # 'module_id' raised KeyError in the second pass).
    parsed = []
    for curr_row in rows:
        try:
            module_id = curr_row[kernel_id_name]
            curr_val = float(curr_row[metric])
        except (KeyError, ValueError):
            continue
        total_vals[module_id] = total_vals.get(module_id, 0.0) + curr_val
        counts[module_id] = counts.get(module_id, 0) + 1
        parsed.append((curr_row, module_id, curr_val))

    res = {}
    for curr_row, module_id, curr_val in parsed:
        total_val = total_vals[module_id]
        if total_val:
            res[curr_row] = curr_val / total_val
        else:
            res[curr_row] = curr_val / counts[module_id]
    return res
