# mypy: allow-untyped-defs
import bisect
import itertools
import math
from collections import defaultdict, namedtuple
from operator import attrgetter
from typing import Any, Dict, List, Optional, Tuple
from typing_extensions import deprecated

import torch
from torch.autograd import DeviceType


__all__ = [
    "EventList",
    "FormattedTimesMixin",
    "Interval",
    "Kernel",
    "FunctionEvent",
    "FunctionEventAvg",
    "StringTable",
    "MemRecordsAcc",
]


class EventList(list):
    """A list of Events (for pretty printing)."""

    def __init__(self, *args, **kwargs):
        # Profiler-specific options are passed as keywords and must be removed
        # before the remaining arguments are handed to ``list.__init__``.
        use_device = kwargs.pop("use_device", None)
        profile_memory = kwargs.pop("profile_memory", False)
        with_flops = kwargs.pop("with_flops", False)
        super().__init__(*args, **kwargs)
        self._use_device = use_device
        self._profile_memory = profile_memory
        self._tree_built = False
        self._with_flops = with_flops

    def _build_tree(self):
        """Link parents/children, collapse duplicate nodes and fix backward stacks."""
        self._populate_cpu_children()
        self._remove_dup_nodes()
        self._set_backward_stacktraces()
        self._tree_built = True

    def __str__(self):
        return self.table()

    def _remove_dup_nodes(self):
        """Collapse an event into its parent when both share a name and it is the only child.

        The pass is repeated until no event is removed, because deleting one
        level can expose another parent/only-child pair with the same name.
        """
        while True:
            to_delete = set()
            for idx, evt in enumerate(self):
                parent = evt.cpu_parent
                if (
                    parent is not None
                    and parent.name == evt.name
                    and len(parent.cpu_children) == 1
                ):
                    parent.cpu_children = evt.cpu_children
                    parent.kernels = evt.kernels  # lift kernels up
                    for ch in evt.cpu_children:
                        ch.cpu_parent = parent
                    to_delete.add(idx)

            if len(to_delete) == 0:
                break
            new_evts = [ev for ind, ev in enumerate(self) if ind not in to_delete]
            self.clear()
            self.extend(new_evts)

    def _populate_cpu_children(self):
        """Populate child events into each underlying FunctionEvent object.

        One event is a child of another if [s1, e1) is inside [s2, e2). Where
        s1 and e1 would be start and end of the child event's interval. And
        s2 and e2 start and end of the parent event's interval.

        Example: In event list [[0, 10], [1, 3], [3, 4]] would have make [0, 10]
        be a parent of two other intervals.

        If for any reason two intervals intersect only partially, this function
        will not record a parent child relationship between them.
        """
        # Some events can be async (i.e. start and end on different threads),
        # since it's generally undefined how to attribute children ranges to
        # async ranges, we do not use them when calculating nested ranges and stats
        sync_events = [
            evt
            for evt in self
            if not evt.is_async and evt.device_type == DeviceType.CPU
        ]
        # Group by both thread and node_id, so that events that happen to have
        # the same thread_id but are from different nodes aren't incorrectly
        # grouped together. ``itertools.groupby`` only merges *adjacent* items,
        # so the sort key must be the same as the grouping key.
        group_key = attrgetter("thread", "node_id")
        events = sorted(sync_events, key=group_key)
        threads = itertools.groupby(events, key=group_key)

        # For each thread we keep a stack of current nested parents.
        # We maintain the invariant that each interval is a subset of all other
        # intervals lower in the stack.
        #
        # First we sort the intervals by their start time. Then we iterate over them.
        # Every time we see a new interval we remove several parents from
        # the top until we restore the invariant. Then parent child relationship
        # is recorded if the stack is not empty.
        # Finally we add new interval to the list
        #
        # Algorithm has O(N * log(N)) complexity where N is number of
        # intervals
        for _, thread_events in threads:
            thread_events_ = sorted(
                thread_events,
                key=lambda event: [event.time_range.start, -event.time_range.end],
            )
            current_events: List[FunctionEvent] = []
            for event in thread_events_:
                while len(current_events) > 0:
                    parent = current_events[-1]
                    if (
                        event.time_range.start >= parent.time_range.end
                        or event.time_range.end > parent.time_range.end
                    ):
                        # this can't be a parent
                        current_events.pop()
                    else:
                        parent.append_cpu_child(event)
                        assert (
                            event.cpu_parent is None
                        ), f"There is already a CPU parent event for {event.key}"
                        event.set_cpu_parent(parent)
                        break

                current_events.append(event)

    def _set_backward_stacktraces(self):
        """Give events under a backward function the stack of the matching forward op.

        Forward stacks are keyed by ``(sequence_nr, thread)``; an event whose
        ancestor chain contains a backward-scope event looks its stack up with
        ``(sequence_nr, fwd_thread)`` of that ancestor, or gets ``[]``.
        """

        def bw_parent(evt):
            if evt is None:
                return None
            elif evt.scope == 1:  # BACKWARD_FUNCTION
                return evt
            else:
                return bw_parent(evt.cpu_parent)

        fwd_stacks = {}
        for evt in self:
            if bw_parent(evt) is None and evt.stack is not None:
                t = (evt.sequence_nr, evt.thread)
                if t not in fwd_stacks:
                    fwd_stacks[t] = evt.stack

        for evt in self:
            p = bw_parent(evt)
            if p is not None:
                assert p.fwd_thread is not None
                t = (p.sequence_nr, p.fwd_thread)
                if t in fwd_stacks:
                    evt.stack = fwd_stacks[t]
                else:
                    evt.stack = []

    @property
    def self_cpu_time_total(self):
        return sum(event.self_cpu_time_total for event in self)

    def table(
        self,
        sort_by=None,
        row_limit=100,
        max_src_column_width=75,
        max_name_column_width=55,
        max_shapes_column_width=80,
        header=None,
        top_level_events_only=False,
    ):
        """Print an EventList as a nicely formatted table.

        Args:
            sort_by (str, optional): Attribute used to sort entries. By default
                they are printed in the same order as they were registered.
                Valid keys include: ``cpu_time``, ``cuda_time``, ``xpu_time``,
                ``cpu_time_total``, ``cuda_time_total``, ``xpu_time_total``,
                ``cpu_memory_usage``, ``cuda_memory_usage``, ``xpu_memory_usage``,
                ``self_cpu_memory_usage``, ``self_cuda_memory_usage``,
                ``self_xpu_memory_usage``, ``count``.
            top_level_events_only(bool, optional): Boolean flag to determine the
                selection of events to display. If true, the profiler will only
                display events at top level like top-level invocation of python
                `lstm`, python `add` or other functions, nested events like low-level
                cpu/cuda/xpu ops events are omitted for profiler result readability.

        Returns:
            A string containing the table.
        """
        return _build_table(
            self,
            sort_by=sort_by,
            row_limit=row_limit,
            max_src_column_width=max_src_column_width,
            max_name_column_width=max_name_column_width,
            max_shapes_column_width=max_shapes_column_width,
            header=header,
            profile_memory=self._profile_memory,
            with_flops=self._with_flops,
            top_level_events_only=top_level_events_only,
        )

    def export_chrome_trace(self, path):
        """Export an EventList as a Chrome tracing tools file.

        The checkpoint can be later loaded and inspected under ``chrome://tracing`` URL.

        Args:
            path (str): Path where the trace will be written.
        """
        device_name = "cuda" if not self._use_device else self._use_device
        with open(path, "w") as f:
            next_id = 0
            # Use file IO over using json.dump since JSON dumping is very slow and
            # this technique is proven to give a 4x speedup.
            f.write("[")
            # The separator is written *before* every entry except the first, so
            # no trailing ", " ever has to be trimmed. (Seeking back two bytes
            # fails when the list is non-empty but every event is skipped.)
            wrote_entry = False
            for evt in self:
                if evt.trace_name is None:
                    continue
                if wrote_entry:
                    f.write(", ")
                f.write(
                    '{{"name": "{}", '
                    '"ph": "X", '
                    '"ts": {}, '
                    '"dur": {}, '
                    '"tid": {}, '
                    '"pid": "CPU functions", '
                    '"args": {{}}}}'.format(
                        evt.trace_name,
                        evt.time_range.start,
                        evt.time_range.elapsed_us(),
                        evt.thread
                        if not evt.is_remote
                        else f'" node_id:{evt.node_id}, thread_id:{evt.thread} "',
                    )
                )
                wrote_entry = True
                for _ in evt.kernels:
                    # 's' and 'f' draw Flow arrows from
                    # the CPU launch to the GPU kernel
                    f.write(
                        f', {{"name": "{evt.trace_name}", '
                        '"ph": "s", '
                        f'"ts": {evt.time_range.start}, '
                        f'"tid": {evt.thread}, '
                        '"pid": "CPU functions", '
                        f'"id": {next_id}, '
                        f'"cat": "cpu_to_{device_name}", '
                        '"args": {}}'
                    )
                    # Note: use torch.profiler to get device kernel trace
                    next_id += 1
            f.write("]")

    def supported_export_stacks_metrics(self):
        """Return the metric names accepted by :meth:`export_stacks`."""
        return [
            "self_cpu_time_total",
            "self_cuda_time_total",
            "self_xpu_time_total",
            "self_privateuse1_time_total",
        ]

    def export_stacks(self, path: str, metric: str):
        """Write one ``frame;frame;... value`` line per event that has a stack.

        Args:
            path: File to write.
            metric: One of :meth:`supported_export_stacks_metrics`.

        Raises:
            ValueError: If ``metric`` is not supported.
        """
        if metric not in self.supported_export_stacks_metrics():
            raise ValueError(
                "metric should be one of: "
                + str(self.supported_export_stacks_metrics())
            )
        # Device-specific metric names all map onto the generic "device" attribute.
        metric_attr = (
            metric.replace("cuda", "device")
            .replace("xpu", "device")
            .replace("privateuse1", "device")
        )
        # Characters that would break the "a;b;c value" line format.
        translate_table = str.maketrans(" ;\t\n", "____")
        with open(path, "w") as f:
            for evt in self:
                if evt.stack and len(evt.stack) > 0:
                    metric_value = getattr(evt, metric_attr)
                    if int(metric_value) > 0:
                        stack_str = ";".join(
                            entry.translate(translate_table)
                            for entry in reversed(evt.stack)
                        )
                        f.write(stack_str + " " + str(int(metric_value)) + "\n")

    def key_averages(self, group_by_input_shapes=False, group_by_stack_n=0):
        """Averages all function events over their keys.

        Args:
            group_by_input_shapes: group entries by
                (event name, input shapes) rather than just event name.
                This is useful to see which input shapes contribute to the runtime
                the most and may help with size-specific optimizations or
                choosing the best candidates for quantization (aka fitting a roof line)

            group_by_stack_n: group by top n stack trace entries

        Returns:
            An EventList containing FunctionEventAvg objects.
        """
        assert self._tree_built
        stats: Dict[Tuple[str, ...], FunctionEventAvg] = defaultdict(FunctionEventAvg)

        def get_key(event, group_by_input_shapes, group_by_stack_n) -> Tuple[str, ...]:
            key = [
                str(event.key),
                str(event.node_id),
                str(event.device_type),
                str(event.is_legacy),
                str(event.is_user_annotation),
            ]
            if group_by_input_shapes:
                key.append(str(event.input_shapes))
            if group_by_stack_n > 0:
                key += event.stack[:group_by_stack_n]
            return tuple(key)

        for evt in self:
            stats[get_key(evt, group_by_input_shapes, group_by_stack_n)].add(evt)

        avg_list = EventList(
            stats.values(),
            use_device=self._use_device,
            profile_memory=self._profile_memory,
            with_flops=self._with_flops,
        )
        for evt in avg_list:
            evt.stack = evt.stack[:group_by_stack_n]
            if not group_by_input_shapes:
                evt.input_shapes = ""
        return avg_list

    def total_average(self):
        """Averages all events.

        Returns:
            A FunctionEventAvg object.
        """
        total_stat = FunctionEventAvg()
        for evt in self:
            total_stat += evt
            # Reset the key so that the next ``add`` does not trip its
            # ``other.key == self.key`` assertion on differently named events.
            total_stat.key = None
        total_stat.key = "Total"
        return total_stat


def _format_time(time_us):
    """Define how to format time in FunctionEvent."""
    US_IN_SECOND = 1000.0 * 1000.0
    US_IN_MS = 1000.0
    if time_us >= US_IN_SECOND:
        return f"{time_us / US_IN_SECOND:.3f}s"
    if time_us >= US_IN_MS:
        return f"{time_us / US_IN_MS:.3f}ms"
    return f"{time_us:.3f}us"


def _format_time_share(time_us, total_time_us):
    """Define how to format time in FunctionEvent."""
    if total_time_us == 0:
        assert time_us == 0, f"Expected time_us == 0 but got {time_us}"
        return "NaN"
    return f"{time_us * 100.0 / total_time_us:.2f}%"


def _format_memory(nbytes):
    """Return a formatted memory size string."""
    KB = 1024
    MB = 1024 * KB
    GB = 1024 * MB
    if abs(nbytes) >= GB:
        return f"{nbytes * 1.0 / GB:.2f} Gb"
    elif abs(nbytes) >= MB:
        return f"{nbytes * 1.0 / MB:.2f} Mb"
    elif abs(nbytes) >= KB:
        return f"{nbytes * 1.0 / KB:.2f} Kb"
    else:
        return str(nbytes) + " b"


def _attr_formatter(name):
    """Return a property that formats attribute ``name`` with :func:`_format_time`."""
    return property(lambda self: _format_time(getattr(self, name)))


class FormattedTimesMixin:
    """Helpers for FunctionEvent and FunctionEventAvg.

    The subclass should define `*_time_total` and `count` attributes.
    """

    cpu_time_str = _attr_formatter("cpu_time")
    device_time_str = _attr_formatter("device_time")
    cpu_time_total_str = _attr_formatter("cpu_time_total")
    device_time_total_str = _attr_formatter("device_time_total")
    self_cpu_time_total_str = _attr_formatter("self_cpu_time_total")
    self_device_time_total_str = _attr_formatter("self_device_time_total")

    @property
    def cpu_time(self):
        return 0.0 if self.count == 0 else 1.0 * self.cpu_time_total / self.count  # type: ignore[attr-defined]

    @property
    def device_time(self):
        return 0.0 if self.count == 0 else 1.0 * self.device_time_total / self.count  # type: ignore[attr-defined]

    @property
    @deprecated(
        "`cuda_time` is deprecated, please use `device_time` instead.",
        category=FutureWarning,
    )
    def cuda_time(self):  # To be deprecated
        return self.device_time
class Interval:
    """A time interval delimited by ``start`` and ``end``.

    Both bounds are stored as given; elsewhere in this file the class is
    constructed as ``Interval(start_us, end_us)``.
    """

    def __init__(self, start, end):
        self.start = start
        self.end = end

    def elapsed_us(self):
        r"""
        Returns the length of the interval
        """
        return self.end - self.start
Kernel = namedtuple("Kernel", ["name", "device", "duration"])


class FunctionEvent(FormattedTimesMixin):
    """Profiling information about a single function."""

    def __init__(
        self,
        id,
        name,
        thread,
        start_us,
        end_us,
        fwd_thread=None,
        input_shapes=None,
        stack=None,
        scope=0,
        use_device=None,
        cpu_memory_usage=0,
        device_memory_usage=0,
        is_async=False,
        is_remote=False,
        sequence_nr=-1,
        node_id=-1,
        device_type=DeviceType.CPU,
        device_index=0,
        device_resource_id=None,
        is_legacy=False,
        flops=None,
        trace_name=None,
        concrete_inputs=None,
        kwinputs=None,
        is_user_annotation=False,
    ):
        self.id: int = id
        self.node_id: int = node_id
        self.name: str = name
        self.trace_name: str = trace_name
        self.time_range: Interval = Interval(start_us, end_us)
        self.thread: int = thread
        self.fwd_thread: Optional[int] = fwd_thread
        self.kernels: List[Kernel] = []
        self.count: int = 1
        self.cpu_children: List[FunctionEvent] = []
        self.cpu_parent: Optional[FunctionEvent] = None
        self.input_shapes: Tuple[int, ...] = input_shapes
        self.concrete_inputs: List[Any] = concrete_inputs
        self.kwinputs: Dict[str, Any] = kwinputs
        self.stack: List = stack
        self.scope: int = scope
        self.use_device: Optional[str] = use_device
        self.cpu_memory_usage: int = cpu_memory_usage
        self.device_memory_usage: int = device_memory_usage
        self.is_async: bool = is_async
        self.is_remote: bool = is_remote
        self.sequence_nr: int = sequence_nr
        self.device_type: DeviceType = device_type
        self.device_index: int = device_index
        # Falls back to the thread id when no explicit resource id is given.
        self.device_resource_id: int = (
            thread if device_resource_id is None else device_resource_id
        )
        self.is_legacy: bool = is_legacy
        self.flops: Optional[int] = flops
        self.is_user_annotation: Optional[bool] = is_user_annotation
        # Filled in by the table builder; -1 means "not computed yet".
        self.self_cpu_percent = -1
        self.total_cpu_percent = -1
        self.total_device_percent = -1

    def append_kernel(self, name, device, duration):
        """Record a device kernel launched by this (CPU) event."""
        assert self.device_type == DeviceType.CPU
        self.kernels.append(Kernel(name, device, duration))

    def append_cpu_child(self, child):
        """Append a CPU child of type FunctionEvent.

        One is supposed to append only direct children to the event to have
        correct self cpu time being reported.
        """
        assert self.device_type == DeviceType.CPU
        assert isinstance(child, FunctionEvent)
        assert child.device_type == DeviceType.CPU
        self.cpu_children.append(child)

    def set_cpu_parent(self, parent):
        """Set the immediate CPU parent of type FunctionEvent.

        One profiling FunctionEvent should have only one CPU parent such that
        the child's range interval is completely inside the parent's. We use
        this connection to determine the event is from top-level op or not.
        """
        assert self.device_type == DeviceType.CPU
        assert isinstance(parent, FunctionEvent)
        assert parent.device_type == DeviceType.CPU
        self.cpu_parent = parent

    # Note: async events don't have children, are not used when computing 'self'
    #       metrics of other events, have only total cpu time
    @property
    def self_cpu_memory_usage(self):
        if self.is_async or self.device_type != DeviceType.CPU:
            return 0
        return self.cpu_memory_usage - sum(
            child.cpu_memory_usage for child in self.cpu_children
        )

    @property
    def self_device_memory_usage(self):
        if self.is_async or self.device_type != DeviceType.CPU:
            return 0
        return self.device_memory_usage - sum(
            child.device_memory_usage for child in self.cpu_children
        )

    @property
    @deprecated(
        "`self_cuda_memory_usage` is deprecated. Use `self_device_memory_usage` instead.",
        category=FutureWarning,
    )
    def self_cuda_memory_usage(self):  # To be deprecated
        return self.self_device_memory_usage

    @property
    def cpu_time_total(self):
        if self.device_type == DeviceType.CPU:
            return self.time_range.elapsed_us()
        else:
            return 0

    @property
    def self_cpu_time_total(self):
        if self.is_async or self.device_type != DeviceType.CPU:
            return 0
        return self.cpu_time_total - sum(
            child.cpu_time_total for child in self.cpu_children
        )

    @property
    def device_time_total(self):
        if self.is_async or not self.use_device:
            return 0
        if self.device_type == DeviceType.CPU:
            if not self.is_legacy:
                # account for the kernels in the children ops
                return sum(kinfo.duration for kinfo in self.kernels) + sum(
                    ch.device_time_total for ch in self.cpu_children
                )
            else:
                # each legacy cpu events has a single (fake) kernel
                return sum(kinfo.duration for kinfo in self.kernels)
        else:
            assert self.device_type in [
                DeviceType.CUDA,
                DeviceType.PrivateUse1,
                DeviceType.MTIA,
            ]
            return self.time_range.elapsed_us()

    @property
    @deprecated(
        "`cuda_time_total` is deprecated. Use `device_time_total` instead.",
        category=FutureWarning,
    )
    def cuda_time_total(self):  # To be deprecated
        return self.device_time_total

    @property
    def self_device_time_total(self):
        if self.is_async or not self.use_device:
            return 0
        if self.device_type == DeviceType.CPU:
            return self.device_time_total - sum(
                child.device_time_total for child in self.cpu_children
            )
        else:
            assert self.device_type in [
                DeviceType.CUDA,
                DeviceType.PrivateUse1,
                DeviceType.MTIA,
            ]
            return self.device_time_total

    @property
    @deprecated(
        "`self_cuda_time_total` is deprecated. Use `self_device_time_total` instead.",
        category=FutureWarning,
    )
    def self_cuda_time_total(self):  # To be deprecated
        return self.self_device_time_total

    @property
    def key(self):
        return self.name

    def __repr__(self):
        device_name = self.use_device
        device_time = self.device_time_str
        device_memory_usage = self.device_memory_usage
        return (
            f"<FunctionEvent id={self.id} name={self.name} device_type={self.device_type} node_id={self.node_id} "
            f"cpu_time={self.cpu_time_str} start_us={self.time_range.start} end_us={self.time_range.end} "
            f"cpu_children={str([child.id for child in self.cpu_children])} {device_name}_time={device_time} "
            f"name={self.name} thread={self.thread} input_shapes={str(self.input_shapes)} "
            f"cpu_memory_usage={self.cpu_memory_usage} {device_name}_memory_usage={device_memory_usage} "
            f"is_async={self.is_async} is_remote={self.is_remote} seq_nr={self.sequence_nr} is_legacy={self.is_legacy}>"
        )


class FunctionEventAvg(FormattedTimesMixin):
    """Used to average stats over multiple FunctionEvent objects."""

    def __init__(self) -> None:
        self.key: Optional[str] = None
        self.count: int = 0
        self.node_id: int = 0
        self.is_async: bool = False
        self.is_remote: bool = False
        self.use_device: Optional[str] = None
        self.cpu_time_total: int = 0
        self.device_time_total: int = 0
        self.self_cpu_time_total: int = 0
        self.self_device_time_total: int = 0
        self.input_shapes: Optional[List[List[int]]] = None
        self.stack: Optional[List] = None
        self.scope: Optional[int] = None
        self.cpu_memory_usage: int = 0
        self.device_memory_usage: int = 0
        self.self_cpu_memory_usage: int = 0
        self.self_device_memory_usage: int = 0
        self.cpu_children: Optional[List[FunctionEvent]] = None
        self.cpu_parent: Optional[FunctionEvent] = None
        self.device_type: DeviceType = DeviceType.CPU
        self.is_legacy: bool = False
        self.flops: int = 0
        # ``add`` overwrites this from the first event; initialise it here so
        # an average that never received an event still has the attribute.
        self.is_user_annotation: Optional[bool] = False

    def add(self, other):
        """Accumulate ``other`` (a FunctionEvent or FunctionEventAvg) into this average.

        Returns:
            ``self``, so that ``+=`` can delegate here.
        """
        if self.key is None:
            # First function being recorded as part of FunctionEventAvg, propagate
            # fields.
            self.key = other.key
            self.node_id = other.node_id
            self.is_async = other.is_async
            self.is_remote = other.is_remote
            self.cpu_parent = other.cpu_parent
            self.cpu_children = other.cpu_children

            self.input_shapes = other.input_shapes
            self.stack = other.stack
            self.scope = other.scope
            self.device_type = other.device_type
            self.is_legacy = other.is_legacy
            self.use_device = other.use_device
            self.is_user_annotation = other.is_user_annotation

        assert isinstance(other, (FunctionEvent, FunctionEventAvg))
        assert other.key == self.key
        self.cpu_time_total += other.cpu_time_total
        self.device_time_total += other.device_time_total
        self.self_cpu_time_total += other.self_cpu_time_total
        self.self_device_time_total += other.self_device_time_total
        self.cpu_memory_usage += other.cpu_memory_usage
        self.device_memory_usage += other.device_memory_usage
        self.self_cpu_memory_usage += other.self_cpu_memory_usage
        self.self_device_memory_usage += other.self_device_memory_usage
        self.count += other.count
        if self.flops is None:
            self.flops = other.flops
        elif other.flops is not None:
            self.flops += other.flops
        return self

    def __iadd__(self, other):
        return self.add(other)

    def __repr__(self):
        device_name = "cuda" if not self.use_device else self.use_device
        self_device_time = self.self_device_time_total_str
        device_time = self.device_time_str
        device_memory = self.device_memory_usage
        return (
            f"<FunctionEventAvg key={self.key} self_cpu_time={self.self_cpu_time_total_str} cpu_time={self.cpu_time_str} "
            f" self_{device_name}_time={self_device_time} {device_name}_time={device_time} input_shapes={str(self.input_shapes)} "
            f"cpu_memory_usage={self.cpu_memory_usage} {device_name}_memory_usage={device_memory}>"
        )
class StringTable(defaultdict):
    """A dict that demangles a key the first time it is looked up and caches the result."""

    def __missing__(self, key):
        # manage cases like 't' (demangled to 'unsigned short') separately,
        # for now simply check the length to avoid unexpected results for
        # the short sequences
        value = torch._C._demangle(key) if len(key) > 1 else key
        self[key] = value
        return value
class MemRecordsAcc:
    """Acceleration structure for accessing mem_records in interval."""

    def __init__(self, mem_records):
        self._mem_records = mem_records
        self._start_nses: List[int] = []
        self._indices: List[int] = []
        if len(mem_records) > 0:
            # Sort (start time, original index) pairs so that lookups can
            # binary-search on start time and map back to ``mem_records``.
            tmp = sorted([(r[0].start_ns(), i) for i, r in enumerate(mem_records)])
            self._start_nses, self._indices = zip(*tmp)  # type: ignore[assignment]

    def in_interval(self, start_us, end_us):
        r"""
        Return all records in the given interval
        To maintain backward compatibility, convert us to ns in function
        """
        start_idx = bisect.bisect_left(self._start_nses, start_us * 1000)
        end_idx = bisect.bisect_right(self._start_nses, end_us * 1000)
        for i in range(start_idx, end_idx):
            yield self._mem_records[self._indices[i]]
def _filter_stack_entry(entry):
    """Return True if a stack-trace entry should be kept (it is not profiler/autograd plumbing)."""
    filtered_entries = [
        ("autograd/__init__", "_make_grads"),
        ("autograd/__init__", "backward"),
        ("torch/tensor", "backward"),
        ("_internal/common_utils", "prof_callable"),
        ("_internal/common_utils", "prof_func_call"),
        ("_internal/common_utils", "prof_meth_call"),
    ]
    return all(not (f[0] in entry and f[1] in entry) for f in filtered_entries)


MEMORY_EVENT_NAME = "[memory]"
OUT_OF_MEMORY_EVENT_NAME = "[OutOfMemory]"


def _filter_name(name):
    """Return True if ``name`` is one of the utility ops that should be ignored."""
    # ignoring the following utility ops
    filtered_out_names = [
        MEMORY_EVENT_NAME,  # used only for the top-level memory events
        OUT_OF_MEMORY_EVENT_NAME,
        "profiler::_record_function_enter",
        "profiler::_record_function_enter_new",
        "profiler::_record_function_exit",
        "aten::is_leaf",
        "aten::output_nr",
        "aten::_version",
    ]
    return name in filtered_out_names


# Demangles and optionally rewrites the provided event name,
# with_wildcard - whether to replace certain numbered event names
# with a wildcard name to aggregate them together in the profiler table
# output
def _rewrite_name(name, with_wildcard=False):
    string_table = StringTable()
    name = string_table[name]
    if with_wildcard:
        if name.startswith("ProfilerStep#"):
            name = "ProfilerStep*"
    return name


def _build_table(
    events,
    sort_by=None,
    header=None,
    row_limit=100,
    max_src_column_width=75,
    max_name_column_width=55,
    max_shapes_column_width=80,
    with_flops=False,
    profile_memory=False,
    top_level_events_only=False,
):
    """Print a summary of events (which can be a list of FunctionEvent or FunctionEventAvg).

    Args:
        events: Sequence of events to tabulate.
        sort_by: Optional attribute name to sort by, in descending order.
            ``cuda``/``xpu``/``privateuse1`` in the name are mapped to ``device``.
        header: Optional title printed above the table.
        row_limit: Maximum number of event rows to print.
        max_src_column_width: Cap for the "Source Location" column, or None.
        max_name_column_width: Cap for the "Name" column, or None.
        max_shapes_column_width: Cap for the "Input Shapes" column, or None.
        with_flops: Add a FLOPs column if any event reports a positive value.
        profile_memory: Add the memory columns.
        top_level_events_only: Skip events that have a CPU parent.

    Returns:
        The table as a single string; ``""`` when ``events`` is empty.

    Raises:
        RuntimeError: If device time was recorded but ``use_device`` is unset.
    """
    if len(events) == 0:
        return ""

    has_device_time = any(event.self_device_time_total > 0 for event in events)
    has_device_mem = any(event.self_device_memory_usage > 0 for event in events)
    use_device = events[0].use_device
    # Running on PrivateUse1 device with profiler but not enable
    # ProfilerActivity.PrivateUse1 can also catch privateuse1 memory usage.
    # Here only need to check has_privateuse1_time if not use_device.
    if not use_device and has_device_time:
        raise RuntimeError("use_device is None, but there is device performance data.")

    has_input_shapes = any(
        (event.input_shapes is not None and len(event.input_shapes) > 0)
        for event in events
    )

    if sort_by is not None:
        # Resolve the attribute name once instead of on every key() call.
        sort_attr = (
            sort_by.replace("cuda", "device")
            .replace("xpu", "device")
            .replace("privateuse1", "device")
        )
        events = EventList(
            sorted(
                events,
                key=lambda evt: getattr(evt, sort_attr),
                reverse=True,
            ),
            use_device=use_device,
            profile_memory=profile_memory,
            with_flops=with_flops,
        )

    name_column_width = max(len(evt.key) for evt in events) + 4
    if max_name_column_width is not None:
        name_column_width = min(name_column_width, max_name_column_width)

    shapes_column_width = max(len(str(evt.input_shapes)) for evt in events) + 4
    if max_shapes_column_width is not None:
        shapes_column_width = min(shapes_column_width, max_shapes_column_width)

    DEFAULT_COLUMN_WIDTH = 12
    flops_column_width = DEFAULT_COLUMN_WIDTH

    src_column_width = None
    stacks = [
        evt.stack for evt in events if evt.stack is not None and len(evt.stack) > 0
    ]
    has_stack = len(stacks) > 0
    if has_stack:
        src_column_width = (
            max(max(len(entry) for entry in stack) for stack in stacks) + 4
        )
        if max_src_column_width is not None:
            src_column_width = min(src_column_width, max_src_column_width)

    headers = [
        "Name",
        "Self CPU %",
        "Self CPU",
        "CPU total %",
        "CPU total",
        "CPU time avg",
    ]
    device_name = use_device.upper() if use_device is not None else "None"
    if has_device_time:
        headers.extend(
            [
                f"Self {device_name}",
                f"Self {device_name} %",
                f"{device_name} total",
                f"{device_name} time avg",
            ]
        )
    if profile_memory:
        headers.extend(
            [
                "CPU Mem",
                "Self CPU Mem",
            ]
        )
        if use_device and has_device_mem:
            headers.extend(
                [
                    f"{device_name} Mem",
                    f"Self {device_name} Mem",
                ]
            )
    headers.append("# of Calls")
    # Only append Node ID if any event has a valid (>= 0) Node ID
    append_node_id = any(evt.node_id != -1 for evt in events)
    if append_node_id:
        headers.append("Node ID")

    SPACING_SIZE = 2
    row_format = ""
    header_sep = ""
    line_length = -SPACING_SIZE

    def add_column(padding, text_dir=">"):
        nonlocal row_format, header_sep, line_length
        row_format += "{: " + text_dir + str(padding) + "}" + (" " * SPACING_SIZE)
        header_sep += "-" * padding + (" " * SPACING_SIZE)
        line_length += padding + SPACING_SIZE

    def auto_scale_flops(flops):
        flop_headers = [
            "FLOPs",
            "KFLOPs",
            "MFLOPs",
            "GFLOPs",
            "TFLOPs",
            "PFLOPs",
        ]
        assert flops > 0
        log_flops = max(0, min(math.log10(flops) / 3, float(len(flop_headers) - 1)))
        assert log_flops >= 0 and log_flops < len(flop_headers)
        return (pow(10, (math.floor(log_flops) * -3.0)), flop_headers[int(log_flops)])

    add_column(name_column_width)
    for _ in headers[1:]:
        add_column(DEFAULT_COLUMN_WIDTH)

    if has_input_shapes:
        headers.append("Input Shapes")
        add_column(shapes_column_width)

    # The FLOPs column is registered *before* "Source Location": row values are
    # emitted in that order below, and stack continuation lines are written
    # into the last column, which therefore has to be the source column.
    if with_flops:
        # Auto-scaling of flops header
        raw_flops = [
            evt.flops for evt in events if evt.flops is not None and evt.flops > 0
        ]
        if len(raw_flops) != 0:
            (flops_scale, flops_header) = auto_scale_flops(min(raw_flops))
            headers.append(f"Total {flops_header}")
            add_column(flops_column_width)
        else:
            with_flops = False  # can't find any valid flops

    if has_stack:
        headers.append("Source Location")
        add_column(src_column_width, text_dir="<")

    result = []

    def append(s):
        result.append(s)
        result.append("\n")  # Yes, newline after the end as well

    sum_self_cpu_time_total = 0
    sum_self_device_time_total = 0
    for evt in events:
        sum_self_cpu_time_total += evt.self_cpu_time_total
        if evt.device_type == DeviceType.CPU and evt.is_legacy:
            # in legacy profiler, kernel info is stored in cpu events
            sum_self_device_time_total += evt.self_device_time_total
        elif (
            evt.device_type
            in [
                DeviceType.CUDA,
                DeviceType.PrivateUse1,
                DeviceType.MTIA,
            ]
            and not evt.is_user_annotation
        ):
            # in kineto profiler, there're events with the correct device type (e.g. CUDA)
            sum_self_device_time_total += evt.self_device_time_total

    # Actual printing
    if header is not None:
        append("=" * line_length)
        append(header)
    if top_level_events_only:
        append("=" * line_length)
        append("This report only display top-level ops statistics")
    append(header_sep)
    append(row_format.format(*headers))

    append(header_sep)

    def trim_path(path, src_column_width):
        # Keep the tail of an over-long path and mark the cut with "...".
        if len(path) > src_column_width:
            offset = len(path) - src_column_width
            path = path[offset:]
            if len(path) > 3:
                path = "..." + path[3:]
        return path

    event_limit = 0
    for evt in events:
        if event_limit == row_limit:
            break
        if top_level_events_only and evt.cpu_parent is not None:
            continue
        else:
            event_limit += 1
        name = evt.key
        if max_name_column_width is not None and len(name) >= max_name_column_width - 3:
            name = name[: (max_name_column_width - 3)] + "..."
        evt.self_cpu_percent = _format_time_share(
            evt.self_cpu_time_total, sum_self_cpu_time_total
        )
        evt.total_cpu_percent = (
            _format_time_share(evt.cpu_time_total, sum_self_cpu_time_total)
            if not evt.is_async
            else 0
        )
        row_values = [
            name,
            # Self CPU total %, 0 for async events.
            evt.self_cpu_percent,
            evt.self_cpu_time_total_str,  # Self CPU total
            # CPU total %, 0 for async events.
            evt.total_cpu_percent,
            evt.cpu_time_total_str,  # CPU total
            evt.cpu_time_str,  # CPU time avg
        ]
        if has_device_time:
            evt.total_device_percent = _format_time_share(
                evt.self_device_time_total, sum_self_device_time_total
            )
            row_values.extend(
                [
                    evt.self_device_time_total_str,
                    # device time total %
                    evt.total_device_percent,
                    evt.device_time_total_str,
                    evt.device_time_str,  # device time avg
                ]
            )
        if profile_memory:
            row_values.extend(
                [
                    # CPU Mem Total
                    _format_memory(evt.cpu_memory_usage),
                    # Self CPU Mem Total
                    _format_memory(evt.self_cpu_memory_usage),
                ]
            )
            if use_device and has_device_mem:
                row_values.extend(
                    [
                        # Device Mem Total
                        _format_memory(evt.device_memory_usage),
                        # Self Device Mem Total
                        _format_memory(evt.self_device_memory_usage),
                    ]
                )
        row_values.append(
            evt.count,  # Number of calls
        )

        if append_node_id:
            row_values.append(evt.node_id)
        if has_input_shapes:
            row_values.append(str(evt.input_shapes)[:shapes_column_width])
        if with_flops:
            # ``flops`` may be None on events that do not report it.
            if evt.flops is None or evt.flops <= 0:
                row_values.append("--")
            else:
                row_values.append(f"{evt.flops * flops_scale:8.3f}")  # type: ignore[possibly-undefined]
        if has_stack:
            # An individual event may have no stack even when others do.
            evt_stack = evt.stack if evt.stack is not None else []
            src_field = ""
            if len(evt_stack) > 0:
                src_field = trim_path(evt_stack[0], src_column_width)
            row_values.append(src_field)
        append(row_format.format(*row_values))

        if has_stack:
            empty_headers = [""] * (len(headers) - 1)
            for entry in evt_stack[1:]:
                append(
                    row_format.format(
                        *(empty_headers + [trim_path(entry, src_column_width)])
                    )
                )
            empty_headers.append("")
            append(row_format.format(*empty_headers))

    append(header_sep)
    append(f"Self CPU time total: {_format_time(sum_self_cpu_time_total)}")
    if has_device_time:
        append(
            f"Self {use_device.upper() if use_device is not None else 'None'} "
            f"time total: {_format_time(sum_self_device_time_total)}"
        )
    return "".join(result)
Docs
Access comprehensive developer documentation for PyTorch
To analyze traffic and optimize your experience, we serve cookies on this site. By clicking or navigating, you agree to allow our usage of cookies. As the current maintainers of this site, Facebook’s Cookies Policy applies. Learn more, including about available controls: Cookies Policy.