-
Notifications
You must be signed in to change notification settings - Fork 705
CONSOLE-5297: Display GPU metrics on the Node Details page #16456
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
6e19f8c
cf2c07d
c794438
2bd33e1
081f503
1ec1138
625b8fa
af4ed83
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -416,6 +416,18 @@ | |
| "Reason": "Reason", | ||
| "Updated": "Updated", | ||
| "Changed": "Changed", | ||
| "GPU metrics": "GPU metrics", | ||
| "GPU count": "GPU count", | ||
| "GPU model": "GPU model", | ||
| "GPU capacity": "GPU capacity", | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Line 422: "GPU capacity" is ambiguous. Does it refer to memory, compute, or something else? SR (for example): "GPU memory capacity". If it refers to something other than memory, name it accordingly. |
||
| "Allocatable GPUs": "Allocatable GPUs", | ||
| "Per-device GPU metrics": "Per-device GPU metrics", | ||
| "GPU device": "GPU device", | ||
| "Temperature": "Temperature", | ||
| "Power usage": "Power usage", | ||
| "Framebuffer memory used": "Framebuffer memory used", | ||
| "Framebuffer memory free": "Framebuffer memory free", | ||
| "GPU metrics are not available. Make sure the NVIDIA DCGM exporter is scraping metrics and labeling them with the node name.": "GPU metrics are not available. Make sure the NVIDIA DCGM exporter is scraping metrics and labeling them with the node name.", | ||
| "Node details": "Node details", | ||
| "External ID": "External ID", | ||
| "Labels": "Labels", | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,289 @@ | ||
| import type { FC } from 'react'; | ||
| import { useMemo } from 'react'; | ||
| import { | ||
| Bullseye, | ||
| DescriptionList, | ||
| DescriptionListDescription, | ||
| DescriptionListGroup, | ||
| DescriptionListTerm, | ||
| Grid, | ||
| GridItem, | ||
| Spinner, | ||
| } from '@patternfly/react-core'; | ||
| import { useTranslation } from 'react-i18next'; | ||
| import type { PrometheusResponse, PrometheusResult } from '@console/internal/components/graphs'; | ||
| import { PrometheusEndpoint } from '@console/internal/components/graphs/helpers'; | ||
| import { usePrometheusPoll } from '@console/internal/components/graphs/prometheus-poll-hook'; | ||
| import { SectionHeading } from '@console/internal/components/utils/headings'; | ||
| import type { NodeKind } from '@console/internal/module/k8s'; | ||
| import PaneBody from '@console/shared/src/components/layout/PaneBody'; | ||
| import { | ||
| GpuMetricQuery, | ||
| getGpuMetricQueries, | ||
| nodeHasGpuCapacity, | ||
| GPU_RESOURCE_KEYS, | ||
| } from './nodeGpuMetricsQueries'; | ||
|
|
||
/**
 * One Prometheus sample attributed to a single GPU, as produced by `resultsByGpu`.
 *
 * - `value`: the sample value string read from the result's `value[1]` ('' when absent).
 * - `modelName`: the series' `modelName` label, when present.
 * - `device`: the series' `device` label, when present.
 */
| type GpuMetricResult = { | ||
| value: string; | ||
| modelName?: string; | ||
| device?: string; | ||
| }; | ||
|
|
||
/**
 * Display-ready row of the per-device GPU table. Every metric field is already a
 * formatted string ('-' when the metric is missing or unparsable).
 *
 * - `id`: GPU identifier; also used as the React key of the table row.
 * - `label`: human-readable device label built by `gpuDeviceLabel`.
 * - `utilization`, `temperature`, `power`: values formatted with the '%', '°C' and 'W' suffixes.
 * - `fbUsed`, `fbFree`: framebuffer memory formatted by `formatMemMiB`.
 */
| type GpuDeviceRow = { | ||
| id: string; | ||
| label: string; | ||
| utilization: string; | ||
| temperature: string; | ||
| power: string; | ||
| fbUsed: string; | ||
| fbFree: string; | ||
| }; | ||
|
|
||
/**
 * Indexes a Prometheus response by GPU identifier.
 *
 * The identifier of a series is the first NON-EMPTY label among `gpu`, `GPU_I_ID`,
 * `UUID` and `device`. Series carrying none of these labels are skipped. When several
 * series resolve to the same identifier, the last one in the response wins.
 *
 * @param response - Prometheus response to index; `undefined` or an empty result yields `{}`.
 * @returns Map from GPU identifier to the sample value and the `modelName` / `device` labels.
 */
const resultsByGpu = (
  response: PrometheusResponse | undefined,
): Record<string, GpuMetricResult> => {
  const byGpu: Record<string, GpuMetricResult> = {};
  const series: PrometheusResult[] = response?.data?.result ?? [];
  for (const r of series) {
    const labels = r.metric;
    // `||` (not `??`) on purpose: an empty label value must fall through to the next
    // candidate label instead of discarding a series that is otherwise identifiable.
    const gpu = labels?.gpu || labels?.GPU_I_ID || labels?.UUID || labels?.device;
    if (gpu) {
      byGpu[gpu] = {
        value: r.value?.[1] ?? '',
        modelName: labels?.modelName,
        device: labels?.device,
      };
    }
  }
  return byGpu;
};
|
coderabbitai[bot] marked this conversation as resolved.
|
||
|
|
||
/**
 * Collects the distinct GPU identifiers that appear as keys in any of the given maps.
 *
 * Ordering: purely numeric identifiers come first in ascending NUMERIC order
 * (0, 1, 2, …, 10, 11 — not the lexicographic 0, 1, 10, 11, 2), followed by all other
 * identifiers (e.g. UUIDs) in ascending code-unit order.
 *
 * @param maps - Any number of per-metric maps keyed by GPU identifier.
 * @returns De-duplicated, ordered list of GPU identifiers.
 */
const collectGpuIds = (...maps: Record<string, GpuMetricResult>[]): string[] => {
  const ids = new Set<string>();
  for (const m of maps) {
    for (const k of Object.keys(m)) {
      ids.add(k);
    }
  }

  const isIndex = (s: string): boolean => /^\d+$/.test(s);
  const byCodeUnit = (a: string, b: string): number => {
    if (a < b) return -1;
    return a > b ? 1 : 0;
  };

  return [...ids].sort((a, b) => {
    const aIsIndex = isIndex(a);
    const bIsIndex = isIndex(b);
    if (aIsIndex && bIsIndex) {
      const diff = Number(a) - Number(b);
      if (diff !== 0) return diff;
    } else if (aIsIndex !== bIsIndex) {
      // Keep numeric indices ahead of non-numeric ids, as the default sort did.
      return aIsIndex ? -1 : 1;
    }
    // Non-numeric ids, or numerically equal ones such as "01" vs "1".
    return byCodeUnit(a, b);
  });
};
|
|
||
/**
 * Builds the text shown in the "GPU device" column.
 *
 * Preference order: "GPU <id> — <modelName>" when a model name is known,
 * otherwise "GPU <id> (<device>)" when a device label is known, otherwise "GPU <id>".
 *
 * @param gpuId - GPU identifier.
 * @param meta - Any metric entry of that GPU that may carry the model/device labels.
 * @returns The device label.
 */
const gpuDeviceLabel = (gpuId: string, meta: GpuMetricResult | undefined): string => {
  const base = `GPU ${gpuId}`;
  if (meta?.modelName) return `${base} \u2014 ${meta.modelName}`;
  if (meta?.device) return `${base} (${meta.device})`;
  return base;
};
|
|
||
/**
 * Picks a representative metric entry to read node-wide GPU metadata from.
 *
 * Returns the first entry (maps in argument order, entries in key order) that has a
 * `modelName`. If no entry has one, falls back to the first entry found in ANY map —
 * not only the first map — so an empty leading map does not hide entries of later maps.
 *
 * @param maps - Per-metric maps keyed by GPU identifier.
 * @returns The chosen entry, or `undefined` when every map is empty.
 */
const findFirstMeta = (...maps: Record<string, GpuMetricResult>[]): GpuMetricResult | undefined => {
  const entries = maps.flatMap((m) => Object.values(m));
  return entries.find((entry) => entry.modelName) ?? entries[0];
};
|
|
||
/**
 * Formats a numeric sample string as "<value rounded to one decimal> <suffix>".
 *
 * @param val - Raw sample value; `undefined`, '' or an unparsable string yields '-'.
 * @param suffix - Unit appended after a single space (e.g. '%', 'W').
 * @returns The formatted value, or '-' when there is nothing to show.
 */
const formatValue = (val: string | undefined, suffix: string): string => {
  const parsed = val === undefined || val === '' ? NaN : parseFloat(val);
  return Number.isNaN(parsed) ? '-' : `${Math.round(parsed * 10) / 10} ${suffix}`;
};
|
|
||
/**
 * Formats a memory amount given in MiB (as the function name indicates) for display.
 *
 * Amounts that round to 1024 MiB or more are shown in GiB with one decimal; smaller
 * amounts are shown as whole MiB. The unit is chosen AFTER rounding so that values
 * just below the boundary (e.g. 1023.7) read "1.0 GiB" rather than "1024 MiB".
 *
 * @param val - Raw sample value; `undefined`, '' or an unparsable string yields '-'.
 * @returns The formatted amount, or '-' when there is nothing to show.
 */
const formatMemMiB = (val: string | undefined): string => {
  if (val === undefined || val === '') return '-';
  const mib = parseFloat(val);
  if (Number.isNaN(mib)) return '-';
  const wholeMib = Math.round(mib);
  if (wholeMib >= 1024) return `${(mib / 1024).toFixed(1)} GiB`;
  return `${wholeMib} MiB`;
};
|
|
||
/**
 * Props of `NodeDetailsGpuMetrics`.
 *
 * - `node`: the node whose name is used to build the GPU queries and whose
 *   `status.capacity` / `status.allocatable` are read for the summary.
 */
| type NodeDetailsGpuMetricsProps = { | ||
| node: NodeKind; | ||
| }; | ||
|
|
||
/**
 * "GPU metrics" section of the node details page.
 *
 * Data sources:
 * - `node.status.capacity` / `node.status.allocatable`, read for every key in `GPU_RESOURCE_KEYS`.
 * - Six Prometheus polls (count, utilization, temperature, power, framebuffer used/free)
 *   whose queries are built from the node name by `getGpuMetricQueries`.
 *
 * Rendering:
 * - Returns `null` only when the node reports no GPU capacity, nothing is loading and no
 *   per-device metric came back. All hooks run before that early return.
 * - A summary (count, model, capacity, allocatable) is shown when at least one of those values exists.
 *   The model is the `modelName` of the first metric entry that carries one.
 * - A spinner is shown while any poll is loading; afterwards either the per-device table
 *   or, for nodes with GPU capacity but no metrics, a hint about the DCGM exporter.
 *
 * NOTE(review): while the polls are loading, the heading and spinner are rendered even when
 * `hasCapacity` is false, and the section then disappears — confirm this is intended.
 */
| const NodeDetailsGpuMetrics: FC<NodeDetailsGpuMetricsProps> = ({ node }) => { | ||
| const { t } = useTranslation(); | ||
| const nodeName = node.metadata.name; | ||
|
|
||
| const hasCapacity = nodeHasGpuCapacity(node.status?.capacity); | ||
|
|
||
| const queries = useMemo(() => getGpuMetricQueries(nodeName), [nodeName]); | ||
|
|
||
// One poll per metric. Only the first (response) and third (loading) tuple elements are used.
// NOTE(review): the skipped second element is presumably the poll error — if so, a failed
// query is indistinguishable from an empty result here; confirm and consider surfacing it.
| const [countResponse, , countLoading] = usePrometheusPoll({ | ||
| endpoint: PrometheusEndpoint.QUERY, | ||
| query: queries[GpuMetricQuery.GPU_COUNT], | ||
| }); | ||
| const [utilResponse, , utilLoading] = usePrometheusPoll({ | ||
| endpoint: PrometheusEndpoint.QUERY, | ||
| query: queries[GpuMetricQuery.GPU_UTILIZATION], | ||
| }); | ||
| const [tempResponse, , tempLoading] = usePrometheusPoll({ | ||
| endpoint: PrometheusEndpoint.QUERY, | ||
| query: queries[GpuMetricQuery.GPU_TEMPERATURE], | ||
| }); | ||
| const [powerResponse, , powerLoading] = usePrometheusPoll({ | ||
| endpoint: PrometheusEndpoint.QUERY, | ||
| query: queries[GpuMetricQuery.GPU_POWER_USAGE], | ||
| }); | ||
| const [fbUsedResponse, , fbUsedLoading] = usePrometheusPoll({ | ||
| endpoint: PrometheusEndpoint.QUERY, | ||
| query: queries[GpuMetricQuery.GPU_FB_USED], | ||
| }); | ||
| const [fbFreeResponse, , fbFreeLoading] = usePrometheusPoll({ | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The component makes 6 separate
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Considered this as well. The reason for 6 separate calls is that each metric ( We could technically use a wrapper function that fires all 6 fetches concurrently via This pattern is consistent with other console components that fetch multiple Prometheus metrics (e.g., node overview, project dashboards). Each Happy to consolidate if there's a preferred console pattern for batching, but for now this aligns with the existing codebase conventions. |
||
| endpoint: PrometheusEndpoint.QUERY, | ||
| query: queries[GpuMetricQuery.GPU_FB_FREE], | ||
| }); | ||
|
|
||
| const isLoading = | ||
| countLoading || utilLoading || tempLoading || powerLoading || fbUsedLoading || fbFreeLoading; | ||
|
|
||
| const utilMap = useMemo(() => resultsByGpu(utilResponse), [utilResponse]); | ||
| const tempMap = useMemo(() => resultsByGpu(tempResponse), [tempResponse]); | ||
| const powerMap = useMemo(() => resultsByGpu(powerResponse), [powerResponse]); | ||
| const fbUsedMap = useMemo(() => resultsByGpu(fbUsedResponse), [fbUsedResponse]); | ||
| const fbFreeMap = useMemo(() => resultsByGpu(fbFreeResponse), [fbFreeResponse]); | ||
|
|
||
| const gpuIds = useMemo(() => collectGpuIds(utilMap, tempMap, powerMap, fbUsedMap, fbFreeMap), [ | ||
| utilMap, | ||
| tempMap, | ||
| powerMap, | ||
| fbUsedMap, | ||
| fbFreeMap, | ||
| ]); | ||
|
|
||
| const hasMetrics = gpuIds.length > 0; | ||
|
|
||
| if (!hasCapacity && !isLoading && !hasMetrics) { | ||
| return null; | ||
| } | ||
|
|
||
| const gpuCountValue = countResponse?.data?.result?.[0]?.value?.[1]; | ||
| const gpuCountStr = (() => { | ||
| if (gpuCountValue === undefined || gpuCountValue === '') return undefined; | ||
| const parsed = parseFloat(gpuCountValue); | ||
| return Number.isNaN(parsed) ? undefined : String(Math.round(parsed)); | ||
| })(); | ||
|
|
||
| const gpuCapacityStr = GPU_RESOURCE_KEYS.map((key) => node.status?.capacity?.[key]) | ||
| .filter(Boolean) | ||
| .join(', '); | ||
| const gpuAllocatableStr = GPU_RESOURCE_KEYS.map((key) => node.status?.allocatable?.[key]) | ||
| .filter(Boolean) | ||
| .join(', '); | ||
|
|
||
| const firstMeta = findFirstMeta(utilMap, tempMap, powerMap, fbUsedMap, fbFreeMap); | ||
| const gpuModelStr = firstMeta?.modelName; | ||
|
|
||
| const rows: GpuDeviceRow[] = gpuIds.map((id) => { | ||
| const meta = utilMap[id] ?? tempMap[id] ?? powerMap[id] ?? fbUsedMap[id] ?? fbFreeMap[id]; | ||
| return { | ||
| id, | ||
| label: gpuDeviceLabel(id, meta), | ||
| utilization: formatValue(utilMap[id]?.value, '%'), | ||
| temperature: formatValue(tempMap[id]?.value, '°C'), | ||
| power: formatValue(powerMap[id]?.value, 'W'), | ||
| fbUsed: formatMemMiB(fbUsedMap[id]?.value), | ||
| fbFree: formatMemMiB(fbFreeMap[id]?.value), | ||
| }; | ||
| }); | ||
|
|
||
| return ( | ||
| <PaneBody> | ||
| <SectionHeading text={t('console-app~GPU metrics')} /> | ||
|
|
||
| {(gpuCountStr || gpuCapacityStr || gpuAllocatableStr || gpuModelStr) && ( | ||
| <Grid hasGutter> | ||
| <GridItem md={6}> | ||
| <DescriptionList isHorizontal> | ||
| {gpuCountStr && ( | ||
| <DescriptionListGroup> | ||
| <DescriptionListTerm>{t('console-app~GPU count')}</DescriptionListTerm> | ||
| <DescriptionListDescription>{gpuCountStr}</DescriptionListDescription> | ||
| </DescriptionListGroup> | ||
| )} | ||
| {gpuModelStr && ( | ||
| <DescriptionListGroup> | ||
| <DescriptionListTerm>{t('console-app~GPU model')}</DescriptionListTerm> | ||
| <DescriptionListDescription>{gpuModelStr}</DescriptionListDescription> | ||
| </DescriptionListGroup> | ||
| )} | ||
| </DescriptionList> | ||
| </GridItem> | ||
| <GridItem md={6}> | ||
| <DescriptionList isHorizontal> | ||
| {gpuCapacityStr && ( | ||
| <DescriptionListGroup> | ||
| <DescriptionListTerm>{t('console-app~GPU capacity')}</DescriptionListTerm> | ||
| <DescriptionListDescription>{gpuCapacityStr}</DescriptionListDescription> | ||
| </DescriptionListGroup> | ||
| )} | ||
| {gpuAllocatableStr && ( | ||
| <DescriptionListGroup> | ||
| <DescriptionListTerm>{t('console-app~Allocatable GPUs')}</DescriptionListTerm> | ||
| <DescriptionListDescription>{gpuAllocatableStr}</DescriptionListDescription> | ||
| </DescriptionListGroup> | ||
| )} | ||
| </DescriptionList> | ||
| </GridItem> | ||
| </Grid> | ||
| )} | ||
|
|
||
| {isLoading && ( | ||
| <Bullseye> | ||
| <Spinner size="lg" /> | ||
| </Bullseye> | ||
| )} | ||
|
|
||
| {!isLoading && hasMetrics && ( | ||
| <div className="co-table-container pf-v6-u-mt-md"> | ||
| <table | ||
| className="pf-v6-c-table pf-m-compact pf-m-border-rows" | ||
| aria-label={t('console-app~Per-device GPU metrics')} | ||
| > | ||
| <thead className="pf-v6-c-table__thead"> | ||
| <tr className="pf-v6-c-table__tr"> | ||
| <th className="pf-v6-c-table__th">{t('console-app~GPU device')}</th> | ||
| <th className="pf-v6-c-table__th">{t('console-app~Utilization')}</th> | ||
| <th className="pf-v6-c-table__th">{t('console-app~Temperature')}</th> | ||
| <th className="pf-v6-c-table__th">{t('console-app~Power usage')}</th> | ||
| <th className="pf-v6-c-table__th">{t('console-app~Framebuffer memory used')}</th> | ||
| <th className="pf-v6-c-table__th">{t('console-app~Framebuffer memory free')}</th> | ||
| </tr> | ||
| </thead> | ||
| <tbody className="pf-v6-c-table__tbody"> | ||
| {rows.map((row) => ( | ||
| <tr className="pf-v6-c-table__tr" key={row.id}> | ||
| <td className="pf-v6-c-table__td">{row.label}</td> | ||
| <td className="pf-v6-c-table__td">{row.utilization}</td> | ||
| <td className="pf-v6-c-table__td">{row.temperature}</td> | ||
| <td className="pf-v6-c-table__td">{row.power}</td> | ||
| <td className="pf-v6-c-table__td">{row.fbUsed}</td> | ||
| <td className="pf-v6-c-table__td">{row.fbFree}</td> | ||
| </tr> | ||
| ))} | ||
| </tbody> | ||
| </table> | ||
| </div> | ||
|
Comment on lines
+247
to
+275
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not PF react components instead of classnames?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good point! Wondering whether switching only the GPU table to PF
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Created a story for the change - CONSOLE-5309.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @TheRealJon @cajieh shall I work on this change real quick ? Or how do you suggest to proceed particularly for this PR ?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Better to work on that separately since it touches other tables, unless @TheRealJon thinks otherwise.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @cajieh agree, for this atleast the objective of this PR / JIRA is almost achieved for GPU details. I can work on CONSOLE-5309 too (later) if that is ok.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @swshende-cmd @cajieh Yeah, no worries. This can be follow-on work, just wanted to get the question out there. |
||
| )} | ||
|
|
||
| {!isLoading && !hasMetrics && hasCapacity && ( | ||
| <p className="text-secondary"> | ||
| {t( | ||
| 'console-app~GPU metrics are not available. Make sure the NVIDIA DCGM exporter is scraping metrics and labeling them with the node name.', | ||
| )} | ||
| </p> | ||
| )} | ||
| </PaneBody> | ||
| ); | ||
| }; | ||
|
|
||
| export default NodeDetailsGpuMetrics; | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Line 419: Confirming that "GPU metrics" is intended as the section heading. If so, that is fine. However, please also confirm that it is visually distinct from the "Per-device GPU metrics" label (line 424) in the UI, so that users don't conflate the two.