Skip to content
12 changes: 12 additions & 0 deletions frontend/packages/console-app/locales/en/console-app.json
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,18 @@
"Reason": "Reason",
"Updated": "Updated",
"Changed": "Changed",
"GPU metrics": "GPU metrics",
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Line 419: Confirming that "GPU metrics" is intended as a section heading. If so, that is fine. I do want to confirm, though, that it is visually distinct from the "GPU metrics per device" label (line 424) in the UI, so users don't conflate the two.

"GPU count": "GPU count",
"GPU model": "GPU model",
"GPU capacity": "GPU capacity",
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Line 422: "GPU capacity" is ambiguous. Does it refer to memory, compute, or something else? Suggested revision (for example): "GPU memory capacity".

If it refers to something other than memory, name it accordingly.

"Allocatable GPUs": "Allocatable GPUs",
"Per-device GPU metrics": "Per-device GPU metrics",
"GPU device": "GPU device",
"Temperature": "Temperature",
"Power usage": "Power usage",
"Framebuffer memory used": "Framebuffer memory used",
"Framebuffer memory free": "Framebuffer memory free",
"GPU metrics are not available. Make sure the NVIDIA DCGM exporter is scraping metrics and labeling them with the node name.": "GPU metrics are not available. Make sure the NVIDIA DCGM exporter is scraping metrics and labeling them with the node name.",
"Node details": "Node details",
"External ID": "External ID",
"Labels": "Labels",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import type { FC } from 'react';
import { PROMETHEUS_BASE_PATH } from '@console/internal/components/graphs/consts';
import type { NodeKind } from '@console/internal/module/k8s';
import NodeDetailsConditions from './NodeDetailsConditions';
import NodeDetailsGpuMetrics from './NodeDetailsGpuMetrics';
import NodeDetailsImages from './NodeDetailsImages';
import NodeDetailsOverview from './NodeDetailsOverview';

Expand All @@ -11,6 +13,7 @@ type NodeDetailsProps = {
const NodeDetails: FC<NodeDetailsProps> = ({ obj: node }) => (
<>
<NodeDetailsOverview node={node} />
{PROMETHEUS_BASE_PATH && <NodeDetailsGpuMetrics node={node} />}
<NodeDetailsConditions node={node} />
<NodeDetailsImages node={node} />
</>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,289 @@
import type { FC } from 'react';
import { useMemo } from 'react';
import {
Bullseye,
DescriptionList,
DescriptionListDescription,
DescriptionListGroup,
DescriptionListTerm,
Grid,
GridItem,
Spinner,
} from '@patternfly/react-core';
import { useTranslation } from 'react-i18next';
import type { PrometheusResponse, PrometheusResult } from '@console/internal/components/graphs';
import { PrometheusEndpoint } from '@console/internal/components/graphs/helpers';
import { usePrometheusPoll } from '@console/internal/components/graphs/prometheus-poll-hook';
import { SectionHeading } from '@console/internal/components/utils/headings';
import type { NodeKind } from '@console/internal/module/k8s';
import PaneBody from '@console/shared/src/components/layout/PaneBody';
import {
GpuMetricQuery,
getGpuMetricQueries,
nodeHasGpuCapacity,
GPU_RESOURCE_KEYS,
} from './nodeGpuMetricsQueries';

/**
 * A single Prometheus sample reduced to the fields this component displays,
 * as produced by `resultsByGpu`.
 */
type GpuMetricResult = {
/** Sample value as the string returned by Prometheus; `''` when the series carried no value. */
value: string;
/** Copied from the series' `modelName` label, when present. */
modelName?: string;
/** Copied from the series' `device` label, when present. */
device?: string;
};

/**
 * One display-ready row of the per-device GPU table. Every metric field is an
 * already formatted string; the formatters in this file yield `'-'` when the
 * underlying sample is missing or not numeric.
 */
type GpuDeviceRow = {
/** GPU identifier; also used as the React key of the table row. */
id: string;
/** Human-readable device name built by `gpuDeviceLabel`. */
label: string;
/** Utilization, formatted with a `%` suffix. */
utilization: string;
/** Temperature, formatted with a `°C` suffix. */
temperature: string;
/** Power usage, formatted with a `W` suffix. */
power: string;
/** Framebuffer memory used, formatted by `formatMemMiB`. */
fbUsed: string;
/** Framebuffer memory free, formatted by `formatMemMiB`. */
fbFree: string;
};

/**
 * Indexes the series of a Prometheus response by GPU identifier.
 *
 * The identifier is the first defined label among `gpu`, `GPU_I_ID`, `UUID`
 * and `device`. Series without a usable identifier are dropped, and when
 * several series resolve to the same identifier the last one wins.
 *
 * @param response - Prometheus response, or `undefined` while none is available.
 * @returns A map from GPU identifier to the sample value and its descriptive labels;
 *   empty when the response is missing or has no results.
 */
const resultsByGpu = (
  response: PrometheusResponse | undefined,
): Record<string, GpuMetricResult> => {
  const byGpu: Record<string, GpuMetricResult> = {};
  const series: PrometheusResult[] = response?.data?.result ?? [];

  for (const sample of series) {
    const labels = sample.metric;
    const gpuId = labels?.gpu ?? labels?.GPU_I_ID ?? labels?.UUID ?? labels?.device ?? '';
    if (gpuId) {
      byGpu[gpuId] = {
        // value is a [timestamp, value] pair; only the value part is kept.
        value: sample.value?.[1] ?? '',
        modelName: labels?.modelName,
        device: labels?.device,
      };
    }
  }

  return byGpu;
};
Comment thread
coderabbitai[bot] marked this conversation as resolved.

/**
 * Orders GPU identifiers for display: purely numeric IDs come first in numeric
 * order (so "2" precedes "10"), followed by all other IDs in code-unit order.
 */
const compareGpuIds = (a: string, b: string): number => {
  const byCodeUnit = a < b ? -1 : a > b ? 1 : 0;
  const aIsIndex = /^\d+$/.test(a);
  const bIsIndex = /^\d+$/.test(b);
  if (aIsIndex && bIsIndex) {
    // Fall back to code-unit order for numeric ties such as "1" vs "01".
    return Number(a) - Number(b) || byCodeUnit;
  }
  if (aIsIndex !== bIsIndex) {
    return aIsIndex ? -1 : 1;
  }
  return byCodeUnit;
};

/**
 * Returns the de-duplicated union of the GPU identifiers used as keys in the
 * given maps, sorted for display.
 *
 * A plain `.sort()` compares strings lexicographically, which lists a node with
 * ten or more GPUs as 0, 1, 10, 11, 2, …; `compareGpuIds` orders integer IDs
 * numerically instead.
 *
 * @param maps - Per-metric maps keyed by GPU identifier.
 * @returns The unique identifiers in display order.
 */
const collectGpuIds = (...maps: Record<string, GpuMetricResult>[]): string[] => {
  const ids = new Set<string>();
  for (const m of maps) {
    for (const id of Object.keys(m)) {
      ids.add(id);
    }
  }
  return [...ids].sort(compareGpuIds);
};

/**
 * Builds the device label shown in the first column of the per-device table.
 *
 * The label always starts with `GPU <id>`. The model name is appended after an
 * em dash when known; otherwise the device name is appended in parentheses
 * when known.
 *
 * @param gpuId - GPU identifier.
 * @param meta - Sample carrying the descriptive labels for this GPU, if any.
 * @returns The label text.
 */
const gpuDeviceLabel = (gpuId: string, meta: GpuMetricResult | undefined): string => {
  const base = `GPU ${gpuId}`;
  if (meta?.modelName) {
    return `${base} \u2014 ${meta.modelName}`;
  }
  if (meta?.device) {
    return `${base} (${meta.device})`;
  }
  return base;
};

/**
 * Picks a representative sample to describe the node's GPUs.
 *
 * Maps are scanned in argument order and the first entry carrying a
 * `modelName` is preferred. If no entry has one, the first entry found in any
 * map is returned, so an empty first map no longer hides entries that exist in
 * later maps.
 *
 * @param maps - Per-metric maps keyed by GPU identifier.
 * @returns The chosen entry, or `undefined` when every map is empty.
 */
const findFirstMeta = (...maps: Record<string, GpuMetricResult>[]): GpuMetricResult | undefined => {
  const entries = maps.flatMap((m) => Object.values(m));
  return entries.find((entry) => entry.modelName) ?? entries[0];
};

/**
 * Formats a raw sample value as a number rounded to one decimal place,
 * followed by a space and the given unit suffix.
 *
 * @param val - Raw sample value as a string, if any.
 * @param suffix - Unit text appended after the number.
 * @returns The formatted text, or `'-'` when the value is missing, empty or not numeric.
 */
const formatValue = (val: string | undefined, suffix: string): string => {
  const parsed = val === undefined || val === '' ? NaN : parseFloat(val);
  if (Number.isNaN(parsed)) {
    return '-';
  }
  const rounded = Math.round(parsed * 10) / 10;
  return `${rounded} ${suffix}`;
};

/**
 * Formats a memory amount given in MiB.
 *
 * Values below 1024 are shown as whole MiB; values of 1024 and above are
 * converted to GiB with one decimal place.
 *
 * @param val - Raw sample value in MiB as a string, if any.
 * @returns The formatted text, or `'-'` when the value is missing, empty or not numeric.
 */
const formatMemMiB = (val: string | undefined): string => {
  const parsed = val === undefined || val === '' ? NaN : parseFloat(val);
  if (Number.isNaN(parsed)) {
    return '-';
  }
  return parsed < 1024 ? `${Math.round(parsed)} MiB` : `${(parsed / 1024).toFixed(1)} GiB`;
};

/** Props accepted by `NodeDetailsGpuMetrics`. */
type NodeDetailsGpuMetricsProps = {
/** Node whose name, capacity and allocatable resources are read to build the GPU section. */
node: NodeKind;
};

const NodeDetailsGpuMetrics: FC<NodeDetailsGpuMetricsProps> = ({ node }) => {
const { t } = useTranslation();
const nodeName = node.metadata.name;

const hasCapacity = nodeHasGpuCapacity(node.status?.capacity);

const queries = useMemo(() => getGpuMetricQueries(nodeName), [nodeName]);

const [countResponse, , countLoading] = usePrometheusPoll({
endpoint: PrometheusEndpoint.QUERY,
query: queries[GpuMetricQuery.GPU_COUNT],
});
const [utilResponse, , utilLoading] = usePrometheusPoll({
endpoint: PrometheusEndpoint.QUERY,
query: queries[GpuMetricQuery.GPU_UTILIZATION],
});
const [tempResponse, , tempLoading] = usePrometheusPoll({
endpoint: PrometheusEndpoint.QUERY,
query: queries[GpuMetricQuery.GPU_TEMPERATURE],
});
const [powerResponse, , powerLoading] = usePrometheusPoll({
endpoint: PrometheusEndpoint.QUERY,
query: queries[GpuMetricQuery.GPU_POWER_USAGE],
});
const [fbUsedResponse, , fbUsedLoading] = usePrometheusPoll({
endpoint: PrometheusEndpoint.QUERY,
query: queries[GpuMetricQuery.GPU_FB_USED],
});
const [fbFreeResponse, , fbFreeLoading] = usePrometheusPoll({
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The component makes 6 separate usePrometheusPoll calls (one per DCGM metric). Was consolidating the 6 usePrometheusPoll calls into a single query considered? Would that reduce Prometheus load, or is keeping them separate intentional for clearer response handling?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Considered this as well. The reason for 6 separate calls is that each metric (DCGM_FI_DEV_GPU_UTIL, DCGM_FI_DEV_GPU_TEMP, etc.) is a distinct Prometheus metric name — there's no single PromQL instant query that can return multiple different metric names in one call.

We could technically use a wrapper function that fires all 6 fetches concurrently via Promise.all, but that would mean moving away from usePrometheusPoll (which the console standardizes on for metric polling, caching, and lifecycle management).

This pattern is consistent with other console components that fetch multiple Prometheus metrics (e.g., node overview, project dashboards). Each usePrometheusPoll call is lightweight — a single instant query with no range — and the browser's HTTP/2 multiplexing handles the concurrent requests efficiently.

Happy to consolidate if there's a preferred console pattern for batching, but for now this aligns with the existing codebase conventions.

endpoint: PrometheusEndpoint.QUERY,
query: queries[GpuMetricQuery.GPU_FB_FREE],
});

const isLoading =
countLoading || utilLoading || tempLoading || powerLoading || fbUsedLoading || fbFreeLoading;

const utilMap = useMemo(() => resultsByGpu(utilResponse), [utilResponse]);
const tempMap = useMemo(() => resultsByGpu(tempResponse), [tempResponse]);
const powerMap = useMemo(() => resultsByGpu(powerResponse), [powerResponse]);
const fbUsedMap = useMemo(() => resultsByGpu(fbUsedResponse), [fbUsedResponse]);
const fbFreeMap = useMemo(() => resultsByGpu(fbFreeResponse), [fbFreeResponse]);

const gpuIds = useMemo(() => collectGpuIds(utilMap, tempMap, powerMap, fbUsedMap, fbFreeMap), [
utilMap,
tempMap,
powerMap,
fbUsedMap,
fbFreeMap,
]);

const hasMetrics = gpuIds.length > 0;

if (!hasCapacity && !isLoading && !hasMetrics) {
return null;
}

const gpuCountValue = countResponse?.data?.result?.[0]?.value?.[1];
const gpuCountStr = (() => {
if (gpuCountValue === undefined || gpuCountValue === '') return undefined;
const parsed = parseFloat(gpuCountValue);
return Number.isNaN(parsed) ? undefined : String(Math.round(parsed));
})();

const gpuCapacityStr = GPU_RESOURCE_KEYS.map((key) => node.status?.capacity?.[key])
.filter(Boolean)
.join(', ');
const gpuAllocatableStr = GPU_RESOURCE_KEYS.map((key) => node.status?.allocatable?.[key])
.filter(Boolean)
.join(', ');

const firstMeta = findFirstMeta(utilMap, tempMap, powerMap, fbUsedMap, fbFreeMap);
const gpuModelStr = firstMeta?.modelName;

const rows: GpuDeviceRow[] = gpuIds.map((id) => {
const meta = utilMap[id] ?? tempMap[id] ?? powerMap[id] ?? fbUsedMap[id] ?? fbFreeMap[id];
return {
id,
label: gpuDeviceLabel(id, meta),
utilization: formatValue(utilMap[id]?.value, '%'),
temperature: formatValue(tempMap[id]?.value, '°C'),
power: formatValue(powerMap[id]?.value, 'W'),
fbUsed: formatMemMiB(fbUsedMap[id]?.value),
fbFree: formatMemMiB(fbFreeMap[id]?.value),
};
});

return (
<PaneBody>
<SectionHeading text={t('console-app~GPU metrics')} />

{(gpuCountStr || gpuCapacityStr || gpuAllocatableStr || gpuModelStr) && (
<Grid hasGutter>
<GridItem md={6}>
<DescriptionList isHorizontal>
{gpuCountStr && (
<DescriptionListGroup>
<DescriptionListTerm>{t('console-app~GPU count')}</DescriptionListTerm>
<DescriptionListDescription>{gpuCountStr}</DescriptionListDescription>
</DescriptionListGroup>
)}
{gpuModelStr && (
<DescriptionListGroup>
<DescriptionListTerm>{t('console-app~GPU model')}</DescriptionListTerm>
<DescriptionListDescription>{gpuModelStr}</DescriptionListDescription>
</DescriptionListGroup>
)}
</DescriptionList>
</GridItem>
<GridItem md={6}>
<DescriptionList isHorizontal>
{gpuCapacityStr && (
<DescriptionListGroup>
<DescriptionListTerm>{t('console-app~GPU capacity')}</DescriptionListTerm>
<DescriptionListDescription>{gpuCapacityStr}</DescriptionListDescription>
</DescriptionListGroup>
)}
{gpuAllocatableStr && (
<DescriptionListGroup>
<DescriptionListTerm>{t('console-app~Allocatable GPUs')}</DescriptionListTerm>
<DescriptionListDescription>{gpuAllocatableStr}</DescriptionListDescription>
</DescriptionListGroup>
)}
</DescriptionList>
</GridItem>
</Grid>
)}

{isLoading && (
<Bullseye>
<Spinner size="lg" />
</Bullseye>
)}

{!isLoading && hasMetrics && (
<div className="co-table-container pf-v6-u-mt-md">
<table
className="pf-v6-c-table pf-m-compact pf-m-border-rows"
aria-label={t('console-app~Per-device GPU metrics')}
>
<thead className="pf-v6-c-table__thead">
<tr className="pf-v6-c-table__tr">
<th className="pf-v6-c-table__th">{t('console-app~GPU device')}</th>
<th className="pf-v6-c-table__th">{t('console-app~Utilization')}</th>
<th className="pf-v6-c-table__th">{t('console-app~Temperature')}</th>
<th className="pf-v6-c-table__th">{t('console-app~Power usage')}</th>
<th className="pf-v6-c-table__th">{t('console-app~Framebuffer memory used')}</th>
<th className="pf-v6-c-table__th">{t('console-app~Framebuffer memory free')}</th>
</tr>
</thead>
<tbody className="pf-v6-c-table__tbody">
{rows.map((row) => (
<tr className="pf-v6-c-table__tr" key={row.id}>
<td className="pf-v6-c-table__td">{row.label}</td>
<td className="pf-v6-c-table__td">{row.utilization}</td>
<td className="pf-v6-c-table__td">{row.temperature}</td>
<td className="pf-v6-c-table__td">{row.power}</td>
<td className="pf-v6-c-table__td">{row.fbUsed}</td>
<td className="pf-v6-c-table__td">{row.fbFree}</td>
</tr>
))}
</tbody>
</table>
</div>
Comment on lines +247 to +275
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not PF react components instead of classnames?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point! I'm wondering whether switching only the GPU table to PF `<Table>` would make it look slightly different from the Node conditions and Images tables, which still use native `<table>` elements with PF classes. Refactoring those as well would be out of scope for this change, but I can open a follow-up story to migrate all three tables to PF Table.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Created a story for the change - CONSOLE-5309.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@TheRealJon @cajieh shall I work on this change real quick ? Or how do you suggest to proceed particularly for this PR ?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Better to work on that separately since it touches other tables, unless @TheRealJon thinks otherwise.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@cajieh Agreed. For this PR, at least, the objective of the JIRA is almost achieved for the GPU details.
@TheRealJon please suggest!

I can work on CONSOLE-5309 too (later) if that is ok.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@swshende-cmd @cajieh Yeah, no worries. This can be follow-on work, just wanted to get the question out there.

)}

{!isLoading && !hasMetrics && hasCapacity && (
<p className="text-secondary">
{t(
'console-app~GPU metrics are not available. Make sure the NVIDIA DCGM exporter is scraping metrics and labeling them with the node name.',
)}
</p>
)}
</PaneBody>
);
};

export default NodeDetailsGpuMetrics;
Loading