Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions api/nvidia/v1/clusterpolicy_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -949,6 +949,13 @@ type DCGMExporterSpec struct {
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="HPC Job Mapping Configuration"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced"
HPCJobMapping *DCGMExporterHPCJobMappingConfig `json:"hpcJobMapping,omitempty"`

// Optional: Pod metrics enrichment configuration for DCGM Exporter
// +kubebuilder:validation:Optional
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Pod Metrics Configuration"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced"
PodMetrics *DCGMExporterPodMetricsConfig `json:"podMetrics,omitempty"`
}

// DCGMExporterHPCJobMappingConfig defines HPC job mapping configuration for NVIDIA DCGM Exporter
Expand All @@ -969,6 +976,18 @@ type DCGMExporterHPCJobMappingConfig struct {
Directory string `json:"directory,omitempty"`
}

// DCGMExporterPodMetricsConfig defines pod metrics enrichment configuration for NVIDIA DCGM Exporter.
type DCGMExporterPodMetricsConfig struct {
// EnablePodLabels enables Kubernetes pod labels in metrics.
// When enabled, metrics will include labels from pods using GPUs.
// This requires cluster-wide read permissions on pods.
// An unset (nil) value is treated as disabled.
// +kubebuilder:validation:Optional
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable Pod Labels in Metrics"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
EnablePodLabels *bool `json:"enablePodLabels,omitempty"`
}

// DCGMExporterMetricsConfig defines metrics to be collected by NVIDIA DCGM Exporter
type DCGMExporterMetricsConfig struct {
// ConfigMap name with file dcgm-metrics.csv for metrics to be collected by NVIDIA DCGM Exporter
Expand Down Expand Up @@ -2015,6 +2034,14 @@ func (e *DCGMExporterSpec) GetHPCJobMappingDirectory() string {
return e.HPCJobMapping.Directory
}

// IsPodLabelsEnabled reports whether pod label enrichment is enabled for DCGM Exporter.
// It returns false when PodMetrics, or its EnablePodLabels field, is unset.
func (e *DCGMExporterSpec) IsPodLabelsEnabled() bool {
	if cfg := e.PodMetrics; cfg != nil && cfg.EnablePodLabels != nil {
		return *cfg.EnablePodLabels
	}
	return false
}

// IsEnabled returns true if gpu-feature-discovery is enabled(default) through gpu-operator
func (g *GPUFeatureDiscoverySpec) IsEnabled() bool {
if g.Enabled == nil {
Expand Down
15 changes: 15 additions & 0 deletions assets/state-dcgm-exporter/0210_clusterrole.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# ClusterRole giving nvidia-dcgm-exporter read-only access to pods across the cluster.
# The operator's ClusterRole control deletes this object for the dcgm-exporter state
# when pod label enrichment is not enabled.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: nvidia-dcgm-exporter
labels:
app: nvidia-dcgm-exporter
rules:
# "" is the core API group, which is where pods live.
- apiGroups:
- ""
resources:
- pods
# Read-only verbs: no create/update/delete is granted.
verbs:
- get
- list
- watch
14 changes: 14 additions & 0 deletions assets/state-dcgm-exporter/0220_clusterrolebinding.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Binds the nvidia-dcgm-exporter ClusterRole to the nvidia-dcgm-exporter ServiceAccount.
# The operator's ClusterRoleBinding control deletes this object for the dcgm-exporter
# state when pod label enrichment is not enabled.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: nvidia-dcgm-exporter
labels:
app: nvidia-dcgm-exporter
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: nvidia-dcgm-exporter
subjects:
- kind: ServiceAccount
name: nvidia-dcgm-exporter
# Placeholder: the operator overwrites every subject's namespace with its own namespace.
namespace: "FILLED BY THE OPERATOR"
26 changes: 26 additions & 0 deletions controllers/object_controls.go
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,16 @@ func ClusterRole(n ClusterPolicyController) (gpuv1.State, error) {
return gpuv1.Disabled, nil
}

// For dcgm-exporter, ClusterRole is only needed when pod labels are enabled
if n.stateNames[n.idx] == "state-dcgm-exporter" && !n.singleton.Spec.DCGMExporter.IsPodLabelsEnabled() {
err := n.client.Delete(ctx, obj)
if err != nil && !apierrors.IsNotFound(err) {
logger.Info("Couldn't delete", "Error", err)
return gpuv1.NotReady, err
}
return gpuv1.Disabled, nil
}

if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil {
return gpuv1.NotReady, err
}
Expand Down Expand Up @@ -495,6 +505,16 @@ func ClusterRoleBinding(n ClusterPolicyController) (gpuv1.State, error) {
return gpuv1.Disabled, nil
}

// For dcgm-exporter, ClusterRoleBinding is only needed when pod labels are enabled
if n.stateNames[n.idx] == "state-dcgm-exporter" && !n.singleton.Spec.DCGMExporter.IsPodLabelsEnabled() {
err := n.client.Delete(ctx, obj)
if err != nil && !apierrors.IsNotFound(err) {
logger.Info("Couldn't delete", "Error", err)
return gpuv1.NotReady, err
}
return gpuv1.Disabled, nil
}

for idx := range obj.Subjects {
obj.Subjects[idx].Namespace = n.operatorNamespace
}
Expand Down Expand Up @@ -1850,6 +1870,12 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe
setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), "DCGM_EXPORTER_COLLECTORS", MetricsConfigMountPath)
}

// configure Kubernetes pod label enrichment if enabled
if config.DCGMExporter.IsPodLabelsEnabled() {
setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), "DCGM_EXPORTER_KUBERNETES_ENABLE_POD_LABELS", "true")
obj.Spec.Template.Spec.AutomountServiceAccountToken = ptr.To(true)
}

for _, env := range config.DCGMExporter.Env {
setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value)
}
Expand Down
76 changes: 76 additions & 0 deletions controllers/transforms_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,11 @@ func (d Daemonset) WithVolume(volume corev1.Volume) Daemonset {
return d
}

// WithAutomountServiceAccountToken sets AutomountServiceAccountToken on the
// DaemonSet's pod template spec and returns the wrapper so calls can be chained.
func (d Daemonset) WithAutomountServiceAccountToken(enabled bool) Daemonset {
	podSpec := &d.Spec.Template.Spec
	podSpec.AutomountServiceAccountToken = &enabled
	return d
}

// Pod is a Pod wrapper used for testing
type Pod struct {
*corev1.Pod
Expand Down Expand Up @@ -1447,6 +1452,77 @@ func TestTransformDCGMExporter(t *testing.T) {
WithRuntimeClassName("nvidia").
WithHostPathVolume("hpc-job-mapping", "/run/nvidia/dcgm-job-mapping", ptr.To(corev1.HostPathDirectoryOrCreate)),
},
{
description: "transform dcgm exporter with kubernetes pod labels enabled",
ds: NewDaemonset().
WithContainer(corev1.Container{Name: "dcgm-exporter"}).
WithContainer(corev1.Container{Name: "dummy"}),
cpSpec: &gpuv1.ClusterPolicySpec{
DCGMExporter: gpuv1.DCGMExporterSpec{
Repository: "nvcr.io/nvidia/cloud-native",
Image: "dcgm-exporter",
Version: "v1.0.0",
ImagePullPolicy: "IfNotPresent",
ImagePullSecrets: []string{"pull-secret"},
Args: []string{"--fail-on-init-error=false"},
Env: []gpuv1.EnvVar{
{Name: "foo", Value: "bar"},
},
PodMetrics: &gpuv1.DCGMExporterPodMetricsConfig{
EnablePodLabels: newBoolPtr(true),
},
},
DCGM: gpuv1.DCGMSpec{
Enabled: newBoolPtr(true),
},
},
expectedDs: NewDaemonset().WithContainer(corev1.Container{
Name: "dcgm-exporter",
Image: "nvcr.io/nvidia/cloud-native/dcgm-exporter:v1.0.0",
ImagePullPolicy: corev1.PullIfNotPresent,
Args: []string{"--fail-on-init-error=false"},
Env: []corev1.EnvVar{
{Name: "DCGM_REMOTE_HOSTENGINE_INFO", Value: "nvidia-dcgm:5555"},
{Name: "DCGM_EXPORTER_KUBERNETES_ENABLE_POD_LABELS", Value: "true"},
{Name: "foo", Value: "bar"},
},
}).WithContainer(corev1.Container{Name: "dummy"}).WithPullSecret("pull-secret").WithRuntimeClassName("nvidia").WithAutomountServiceAccountToken(true),
},
{
description: "transform dcgm exporter with kubernetes pod labels disabled",
ds: NewDaemonset().
WithContainer(corev1.Container{Name: "dcgm-exporter"}).
WithContainer(corev1.Container{Name: "dummy"}),
cpSpec: &gpuv1.ClusterPolicySpec{
DCGMExporter: gpuv1.DCGMExporterSpec{
Repository: "nvcr.io/nvidia/cloud-native",
Image: "dcgm-exporter",
Version: "v1.0.0",
ImagePullPolicy: "IfNotPresent",
ImagePullSecrets: []string{"pull-secret"},
Args: []string{"--fail-on-init-error=false"},
Env: []gpuv1.EnvVar{
{Name: "foo", Value: "bar"},
},
PodMetrics: &gpuv1.DCGMExporterPodMetricsConfig{
EnablePodLabels: newBoolPtr(false),
},
},
DCGM: gpuv1.DCGMSpec{
Enabled: newBoolPtr(true),
},
},
expectedDs: NewDaemonset().WithContainer(corev1.Container{
Name: "dcgm-exporter",
Image: "nvcr.io/nvidia/cloud-native/dcgm-exporter:v1.0.0",
ImagePullPolicy: corev1.PullIfNotPresent,
Args: []string{"--fail-on-init-error=false"},
Env: []corev1.EnvVar{
{Name: "DCGM_REMOTE_HOSTENGINE_INFO", Value: "nvidia-dcgm:5555"},
{Name: "foo", Value: "bar"},
},
}).WithContainer(corev1.Container{Name: "dummy"}).WithPullSecret("pull-secret").WithRuntimeClassName("nvidia"),
},
}

for _, tc := range testCases {
Expand Down
3 changes: 3 additions & 0 deletions deployments/gpu-operator/templates/clusterpolicy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -551,6 +551,9 @@ spec:
{{- if .Values.dcgmExporter.hpcJobMapping }}
hpcJobMapping: {{ toYaml .Values.dcgmExporter.hpcJobMapping | nindent 6 }}
{{- end }}
{{- if .Values.dcgmExporter.podMetrics }}
podMetrics: {{ toYaml .Values.dcgmExporter.podMetrics | nindent 6 }}
{{- end }}
gfd:
enabled: {{ .Values.gfd.enabled }}
{{- if .Values.gfd.repository }}
Expand Down
5 changes: 5 additions & 0 deletions deployments/gpu-operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,11 @@ dcgmExporter:
# hpcJobMapping:
# enabled: true
# directory: /var/lib/dcgm-exporter/job-mapping
# Pod metrics enrichment settings
podMetrics:
# Enable pod labels in metrics. When enabled, metrics will include labels from pods using GPUs.
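# This requires cluster-wide read permissions (get/list/watch) on pods. The operator
# manages a ClusterRole and ClusterRoleBinding for dcgm-exporter and removes them when this is disabled.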
enablePodLabels: false
service:
internalTrafficPolicy: Cluster
serviceMonitor:
Expand Down
Loading