import warnings
from typing import Any, Dict, List

import torch

from ignite.engine import Engine
from ignite.metrics import Metric


class GpuInfo(Metric):
    """Provides GPU information: a) used memory percentage, b) GPU utilization percentage values
    as Metric on each iteration.

    .. note::

        If GPU utilization reports "N/A" on a given GPU, the corresponding metric value is not set.

    Examples:
        .. code-block:: python

            # Default GPU measurements
            GpuInfo().attach(trainer, name='gpu')  # metric names are 'gpu:X mem(%)', 'gpu:X util(%)'

            # Logging with TQDM
            ProgressBar(persist=True).attach(trainer, metric_names=['gpu:0 mem(%)', 'gpu:0 util(%)'])
            # Progress bar will look like
            # Epoch [2/10]: [12/24]  50%|█████     , gpu:0 mem(%)=79, gpu:0 util(%)=59 [00:17<1:23]

            # Logging with Tensorboard
            tb_logger.attach(
                trainer,
                log_handler=OutputHandler(tag="training", metric_names='all'),
                event_name=Events.ITERATION_COMPLETED,
            )
    """

    def __init__(self) -> None:
        try:
            from pynvml.smi import nvidia_smi
        except ImportError:
            raise ModuleNotFoundError(
                "This contrib module requires pynvml to be installed. "
                "Please install it with command: \n pip install pynvml"
            )
        # Check that CUDA devices are available
        if not torch.cuda.is_available():
            raise RuntimeError("This contrib module requires available GPU")

        # Let it fail if no libnvidia drivers or NVML library are found
        self.nvsmi = nvidia_smi.getInstance()
        super(GpuInfo, self).__init__()
    def compute(self) -> List[Dict[str, Any]]:
        data: Dict[str, List[Dict[str, Any]]] = self.nvsmi.DeviceQuery("memory.used, memory.total, utilization.gpu")
        if len(data) == 0 or ("gpu" not in data):
            warnings.warn("No GPU information available")
            return []
        return data["gpu"]
    def completed(self, engine: Engine, name: str) -> None:
        data = self.compute()
        if len(data) < 1:
            warnings.warn("No GPU information available")
            return

        for i, data_by_rank in enumerate(data):
            mem_name = f"{name}:{i} mem(%)"

            if "fb_memory_usage" not in data_by_rank:
                warnings.warn(f"No GPU memory usage information available in {data_by_rank}")
                continue
            mem_report = data_by_rank["fb_memory_usage"]
            if not ("used" in mem_report and "total" in mem_report):
                warnings.warn(
                    "GPU memory usage information does not provide used/total "
                    f"memory consumption information in {mem_report}"
                )
                continue

            engine.state.metrics[mem_name] = int(mem_report["used"] * 100.0 / mem_report["total"])

        for i, data_by_rank in enumerate(data):
            util_name = f"{name}:{i} util(%)"

            if "utilization" not in data_by_rank:
                warnings.warn(f"No GPU utilization information available in {data_by_rank}")
                continue
            util_report = data_by_rank["utilization"]
            if not ("gpu_util" in util_report):
                warnings.warn(f"GPU utilization information does not provide 'gpu_util' information in {util_report}")
                continue

            try:
                engine.state.metrics[util_name] = int(util_report["gpu_util"])
            except ValueError:
                # Do not set GPU utilization information
                pass
# TODO: see issue https://github.com/pytorch/ignite/issues/1405
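
# A minimal usage sketch (not part of the module API): build a trivial Engine, attach
# GpuInfo and print the metrics it writes into ``engine.state.metrics``. It assumes a
# CUDA-capable machine with pynvml installed, and that ``GpuInfo().attach(trainer, name="gpu")``
# registers ``completed`` on each iteration as described in the docstring above. The
# training step and data below are placeholders.
if __name__ == "__main__":
    from ignite.engine import Events

    def train_step(engine, batch):
        # Placeholder step: a real step would run a forward/backward pass on the GPU.
        return 0.0

    trainer = Engine(train_step)
    GpuInfo().attach(trainer, name="gpu")

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_gpu_info(engine):
        # Metric keys follow the "gpu:<index> mem(%)" / "gpu:<index> util(%)" pattern.
        mem = engine.state.metrics.get("gpu:0 mem(%)", "n/a")
        util = engine.state.metrics.get("gpu:0 util(%)", "n/a")
        print(f"iter {engine.state.iteration}: mem={mem}%, util={util}%")

    trainer.run([0] * 4, max_epochs=1)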