diff --git a/dask-metrics/README.md b/dask-metrics/README.md
index 535cc54..1f5148b 100644
--- a/dask-metrics/README.md
+++ b/dask-metrics/README.md
@@ -177,21 +177,21 @@ Additionally, the box plot has the `freq` parameter (default 15), which controls
 
 In the case that you might want to track some metric that is not provided by default, you can use custom metric functions to get the job done.
 
-Each custom metric function must take 2 arguments, even if they are unused: a `worker` object and a pynvml device `handle` (or handles).
+Each custom metric function must take 2 arguments, even if they are unused: a `worker` object and a `cuda.core.system.Device` object named `device` (or a list of devices).
 
-An example custom metric function that tracks power usage across GPUs in a cluster might look like this:
+An example custom metric function that tracks fan speed across GPUs in a cluster might look like this:
 
 ```python
 from dask_metrics import custom_metric
 
-@custom_metric('power')
-def power_usage(worker, handle):
-    return pynvml.nvmlDeviceGetPowerUsage(handle)
+@custom_metric('fan-speed')
+def fan_speed(worker, device):
+    return device.get_fan(0).speed
 ```
 
 You use the `custom_metric` decorator to indicate the name of this metric (what will show up as the column name) and whether this metric is run for all devices on a worker separately or all together at once (by setting the kwarg `per_device` in the decorator to `False`).
 
-The previous example is a case of the former and `handle` respresents a single device handle the function is run for. In the case of the latter, `handle` is actually passed a list of the pynvml device handles instead and allows you to make comparisons between all the values for each GPU at a point in time.
+The previous example is a case of the former, and `device` represents the single `Device` object the function is run for. In the case of the latter, `device` is instead passed a list of `cuda.core.system.Device` objects, letting you make comparisons between the values for each GPU at a point in time.
 
-Note that custom metric functions are also passed a reference to the `Worker` object representing each worker, giving you access to track information about task states and and anything else Dask is up to.
+Note that custom metric functions are also passed a reference to the `Worker` object representing each worker, giving you access to track information about task states and anything else Dask is up to.
 
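For the aggregate case the README describes (`per_device=False`), the metric function receives the whole device list at once. A minimal sketch of that case, assuming the `cuda.core.system.Device` attributes this PR uses elsewhere; the `mem-spread` name and the metric itself are hypothetical:

```python
from dask_metrics import custom_metric

# Hypothetical aggregate metric: the gap between the most- and least-used
# GPU memory on a worker. With per_device=False, the second argument is the
# full list of cuda.core.system.Device objects rather than a single device.
@custom_metric('mem-spread', per_device=False)
def mem_spread(worker, devices):
    used = [d.memory_info.used for d in devices]  # same attribute monitor.py reads
    return max(used) - min(used)
```

Per the README text above, the returned value would be logged under the `mem-spread` column, just like a built-in metric.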
diff --git a/dask-metrics/dask_metrics/monitor.py b/dask-metrics/dask_metrics/monitor.py
index fb6fb0a..a8ac4a8 100644
--- a/dask-metrics/dask_metrics/monitor.py
+++ b/dask-metrics/dask_metrics/monitor.py
@@ -11,10 +11,12 @@
 import cudf
 import asyncio
 import time
-import pynvml
 import re
 
+from cuda.core import system
+
+
 def custom_metric(name, per_device=True):
     def metric(func):
         return {"op": func, "name": name, "per_device": per_device}
 
@@ -595,7 +597,6 @@ def setup(self, worker):
     def start_recording(self):
         self.start = time.time()
         self.stop = 0
-        pynvml.nvmlInit()
         # start loop
         self.loop = asyncio.ensure_future(self.log_metrics())
 
@@ -604,7 +605,6 @@ def stop_recording(self):
         self.stop = time.time()
         if not self.loop.cancelled():
             self.loop.cancel()  # shut down the loop
-        pynvml.nvmlShutdown()
 
     async def shutdown(self):
         self.stop_recording()
@@ -666,16 +666,16 @@ def op(func):
             return op
 
         @operation("total-mem")
-        def total_mem(worker, handle):
-            return pynvml.nvmlDeviceGetMemoryInfo(handle).used
+        def total_mem(worker, device):
+            return device.memory_info.used
 
         @operation("mem-util")
-        def mem_util(worker, handle):
-            return pynvml.nvmlDeviceGetUtilizationRates(handle).memory
+        def mem_util(worker, device):
+            return device.utilization.memory
 
         @operation("compute-util")
-        def compute_util(worker, handle):
-            return pynvml.nvmlDeviceGetUtilizationRates(handle).gpu
+        def compute_util(worker, device):
+            return device.utilization.gpu
 
         self.tracking_list = [o for o in operations if o["name"] in tracking]
         self.tracking_list += custom
@@ -698,7 +698,7 @@ def dump_partial(self):
         self.metrics_on_disk = True
 
     async def log_metrics(self):
-        ## the loop that polls the gpu for metrics using pynvml
+        ## the loop that polls the gpu for metrics using cuda.core.system
         while self.stop == 0:
             # universal metrics always tracked
             self.metrics["job"].append(self.job_number)
@@ -716,13 +716,12 @@
 
     def device_info(self, operation):
-        ## applies operation over all device handles and returns the results
-        device_count = pynvml.nvmlDeviceGetCount()
-        handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in range(device_count)]
+        ## applies operation over all devices and returns the results
+        devices = list(system.Device.get_all_devices())
         if operation["per_device"]:
-            results = [str(operation["op"](self.worker, handle)) for handle in handles]
+            results = [str(operation["op"](self.worker, device)) for device in devices]
             return ", ".join(results)  # join together with commas
         else:
-            return operation["op"](self.worker, devices)
+            return operation["op"](self.worker, devices)
 
 
 class MetricState(Enum):
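To sanity-check the migrated code path outside a running Dask worker, the operation functions can be exercised by hand. A minimal sketch, assuming the `cuda.core.system` calls used in `device_info` above behave as this PR expects (the `worker` argument is unused by the built-in operations, so `None` stands in):

```python
from cuda.core import system  # import style mirrors this PR


def compute_util(worker, device):
    # same body as the "compute-util" operation in monitor.py
    return device.utilization.gpu


# per-device path, as device_info implements it: apply the op to every
# device and join the stringified results with commas
devices = list(system.Device.get_all_devices())
print(", ".join(str(compute_util(None, d)) for d in devices))
```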