Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 13 additions & 24 deletions gpustack_runtime/detector/amd.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,19 +177,8 @@ def detect(self) -> Devices | None:
dev_mem_status = DeviceMemoryStatusEnum.HEALTHY
try:
dev_gpu_vram_usage = pyamdsmi.amdsmi_get_gpu_vram_usage(dev)
dev_mem = dev_gpu_vram_usage.get("vram_total")
dev_mem_used = dev_gpu_vram_usage.get("vram_used")
# On APUs with unified memory (e.g., AMD Strix Halo), VRAM
# reports only the BIOS carveout (~512 MiB); VIS_VRAM reports
# the full usable system memory. Use VIS_VRAM when larger.
with contextlib.suppress(pyrocmsmi.ROCMSMIError):
dev_mem_vis_vram = byte_to_mebibyte(
pyrocmsmi.rsmi_dev_memory_total_get(
dev_idx,
pyrocmsmi.RSMI_MEM_TYPE_VIS_VRAM,
),
)
dev_mem = max(dev_mem, dev_mem_vis_vram)
dev_mem = dev_gpu_vram_usage.get("vram_total") or 0
dev_mem_used = dev_gpu_vram_usage.get("vram_used") or 0
dev_ecc_count = pyamdsmi.amdsmi_get_gpu_ecc_count(
dev,
pyamdsmi.AmdSmiGpuBlock.UMC,
Expand All @@ -200,17 +189,6 @@ def detect(self) -> Devices | None:
dev_mem = byte_to_mebibyte( # byte to MiB
pyrocmsmi.rsmi_dev_memory_total_get(dev_idx),
)
# On APUs with unified memory (e.g., AMD Strix Halo), VRAM
# reports only the BIOS carveout (~512 MiB); VIS_VRAM reports
# the full usable system memory. Use VIS_VRAM when larger.
with contextlib.suppress(pyrocmsmi.ROCMSMIError):
dev_mem_vis_vram = byte_to_mebibyte(
pyrocmsmi.rsmi_dev_memory_total_get(
dev_idx,
pyrocmsmi.RSMI_MEM_TYPE_VIS_VRAM,
),
)
dev_mem = max(dev_mem, dev_mem_vis_vram)
dev_mem_used = byte_to_mebibyte( # byte to MiB
pyrocmsmi.rsmi_dev_memory_usage_get(dev_idx),
)
Expand All @@ -220,6 +198,17 @@ def detect(self) -> Devices | None:
)
if dev_ecc_count.uncorrectable_err > 0:
dev_mem_status = DeviceMemoryStatusEnum.UNHEALTHY
# On APUs with unified memory (e.g., AMD Strix Halo), VRAM
# reports only the BIOS carveout (~512 MiB); VIS_VRAM reports
# the full usable system memory. Use VIS_VRAM when larger.
with contextlib.suppress(pyrocmsmi.ROCMSMIError):
dev_mem_vis_vram = byte_to_mebibyte(
pyrocmsmi.rsmi_dev_memory_total_get(
dev_idx,
pyrocmsmi.RSMI_MEM_TYPE_VIS_VRAM,
),
)
dev_mem = max(dev_mem, dev_mem_vis_vram)

dev_power = None
dev_power_used = None
Expand Down
8 changes: 4 additions & 4 deletions gpustack_runtime/detector/pyrocmsmi/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,8 @@

## Memory Types ##
RSMI_MEM_TYPE_VRAM = 0
RSMI_MEM_TYPE_GTT = 1
RSMI_MEM_TYPE_VIS_VRAM = 2
RSMI_MEM_TYPE_VIS_VRAM = 1
RSMI_MEM_TYPE_GTT = 2

## Error Codes ##
ROCMSMI_ERROR_UNINITIALIZED = -99997
Expand Down Expand Up @@ -231,7 +231,7 @@ def rsmi_dev_busy_percent_get(device=0):

def rsmi_dev_memory_usage_get(device=0, memory_type=None):
if memory_type is None:
memory_type = rsmi_memory_type_t.RSMI_MEM_TYPE_VRAM
memory_type = RSMI_MEM_TYPE_VRAM
c_used = c_uint64()
fn = _rocmsmiGetFunctionPointer("rsmi_dev_memory_usage_get")
ret = fn(device, memory_type, byref(c_used))
Expand All @@ -241,7 +241,7 @@ def rsmi_dev_memory_usage_get(device=0, memory_type=None):

def rsmi_dev_memory_total_get(device=0, memory_type=None):
if memory_type is None:
memory_type = rsmi_memory_type_t.RSMI_MEM_TYPE_VRAM
memory_type = RSMI_MEM_TYPE_VRAM
c_total = c_uint64()
fn = _rocmsmiGetFunctionPointer("rsmi_dev_memory_total_get")
ret = fn(device, memory_type, byref(c_total))
Expand Down
Loading