diff --git a/api/nvidia/v1/clusterpolicy_types.go b/api/nvidia/v1/clusterpolicy_types.go index b17533c0f..b236321a8 100644 --- a/api/nvidia/v1/clusterpolicy_types.go +++ b/api/nvidia/v1/clusterpolicy_types.go @@ -949,6 +949,13 @@ type DCGMExporterSpec struct { // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="HPC Job Mapping Configuration" // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced" HPCJobMapping *DCGMExporterHPCJobMappingConfig `json:"hpcJobMapping,omitempty"` + + // Optional: Pod metrics enrichment configuration for DCGM Exporter + // +kubebuilder:validation:Optional + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Pod Metrics Configuration" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced" + PodMetrics *DCGMExporterPodMetricsConfig `json:"podMetrics,omitempty"` } // DCGMExporterHPCJobMappingConfig defines HPC job mapping configuration for NVIDIA DCGM Exporter @@ -969,6 +976,18 @@ type DCGMExporterHPCJobMappingConfig struct { Directory string `json:"directory,omitempty"` } +// DCGMExporterPodMetricsConfig defines pod metrics enrichment configuration for DCGM Exporter +type DCGMExporterPodMetricsConfig struct { + // EnablePodLabels enables Kubernetes pod labels in metrics. + // When enabled, metrics will include labels from pods using GPUs. + // This requires cluster-wide read permissions to pods. 
+ // +kubebuilder:validation:Optional + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable Pod Labels in Metrics" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch" + EnablePodLabels *bool `json:"enablePodLabels,omitempty"` +} + // DCGMExporterMetricsConfig defines metrics to be collected by NVIDIA DCGM Exporter type DCGMExporterMetricsConfig struct { // ConfigMap name with file dcgm-metrics.csv for metrics to be collected by NVIDIA DCGM Exporter @@ -2015,6 +2034,14 @@ func (e *DCGMExporterSpec) GetHPCJobMappingDirectory() string { return e.HPCJobMapping.Directory } +// IsPodLabelsEnabled returns true if pod label enrichment is enabled for DCGM Exporter +func (e *DCGMExporterSpec) IsPodLabelsEnabled() bool { + if e.PodMetrics == nil || e.PodMetrics.EnablePodLabels == nil { + return false + } + return *e.PodMetrics.EnablePodLabels +} + // IsEnabled returns true if gpu-feature-discovery is enabled(default) through gpu-operator func (g *GPUFeatureDiscoverySpec) IsEnabled() bool { if g.Enabled == nil { diff --git a/assets/state-dcgm-exporter/0210_clusterrole.yaml b/assets/state-dcgm-exporter/0210_clusterrole.yaml new file mode 100644 index 000000000..d2f90ea74 --- /dev/null +++ b/assets/state-dcgm-exporter/0210_clusterrole.yaml @@ -0,0 +1,15 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: nvidia-dcgm-exporter + labels: + app: nvidia-dcgm-exporter +rules: +- apiGroups: + - "" + resources: + - pods + verbs: + - get + - list + - watch diff --git a/assets/state-dcgm-exporter/0220_clusterrolebinding.yaml b/assets/state-dcgm-exporter/0220_clusterrolebinding.yaml new file mode 100644 index 000000000..b2b32553a --- /dev/null +++ b/assets/state-dcgm-exporter/0220_clusterrolebinding.yaml @@ -0,0 +1,14 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: 
ClusterRoleBinding +metadata: + name: nvidia-dcgm-exporter + labels: + app: nvidia-dcgm-exporter +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: nvidia-dcgm-exporter +subjects: +- kind: ServiceAccount + name: nvidia-dcgm-exporter + namespace: "FILLED BY THE OPERATOR" diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 63e9f03c9..87285048a 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -454,6 +454,16 @@ func ClusterRole(n ClusterPolicyController) (gpuv1.State, error) { return gpuv1.Disabled, nil } + // For dcgm-exporter, ClusterRole is only needed when pod labels are enabled + if n.stateNames[n.idx] == "state-dcgm-exporter" && !n.singleton.Spec.DCGMExporter.IsPodLabelsEnabled() { + err := n.client.Delete(ctx, obj) + if err != nil && !apierrors.IsNotFound(err) { + logger.Info("Couldn't delete", "Error", err) + return gpuv1.NotReady, err + } + return gpuv1.Disabled, nil + } + if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil { return gpuv1.NotReady, err } @@ -495,6 +505,16 @@ func ClusterRoleBinding(n ClusterPolicyController) (gpuv1.State, error) { return gpuv1.Disabled, nil } + // For dcgm-exporter, ClusterRoleBinding is only needed when pod labels are enabled + if n.stateNames[n.idx] == "state-dcgm-exporter" && !n.singleton.Spec.DCGMExporter.IsPodLabelsEnabled() { + err := n.client.Delete(ctx, obj) + if err != nil && !apierrors.IsNotFound(err) { + logger.Info("Couldn't delete", "Error", err) + return gpuv1.NotReady, err + } + return gpuv1.Disabled, nil + } + for idx := range obj.Subjects { obj.Subjects[idx].Namespace = n.operatorNamespace } @@ -1850,6 +1870,12 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), "DCGM_EXPORTER_COLLECTORS", MetricsConfigMountPath) } + // configure Kubernetes pod label enrichment if enabled + if 
config.DCGMExporter.IsPodLabelsEnabled() { + setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), "DCGM_EXPORTER_KUBERNETES_ENABLE_POD_LABELS", "true") + obj.Spec.Template.Spec.AutomountServiceAccountToken = ptr.To(true) + } + for _, env := range config.DCGMExporter.Env { setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value) } diff --git a/controllers/transforms_test.go b/controllers/transforms_test.go index 73255b17d..590411edd 100644 --- a/controllers/transforms_test.go +++ b/controllers/transforms_test.go @@ -183,6 +183,11 @@ func (d Daemonset) WithVolume(volume corev1.Volume) Daemonset { return d } +func (d Daemonset) WithAutomountServiceAccountToken(enabled bool) Daemonset { + d.Spec.Template.Spec.AutomountServiceAccountToken = &enabled + return d +} + // Pod is a Pod wrapper used for testing type Pod struct { *corev1.Pod @@ -1447,6 +1452,77 @@ func TestTransformDCGMExporter(t *testing.T) { WithRuntimeClassName("nvidia"). WithHostPathVolume("hpc-job-mapping", "/run/nvidia/dcgm-job-mapping", ptr.To(corev1.HostPathDirectoryOrCreate)), }, + { + description: "transform dcgm exporter with kubernetes pod labels enabled", + ds: NewDaemonset(). + WithContainer(corev1.Container{Name: "dcgm-exporter"}). 
+ WithContainer(corev1.Container{Name: "dummy"}), + cpSpec: &gpuv1.ClusterPolicySpec{ + DCGMExporter: gpuv1.DCGMExporterSpec{ + Repository: "nvcr.io/nvidia/cloud-native", + Image: "dcgm-exporter", + Version: "v1.0.0", + ImagePullPolicy: "IfNotPresent", + ImagePullSecrets: []string{"pull-secret"}, + Args: []string{"--fail-on-init-error=false"}, + Env: []gpuv1.EnvVar{ + {Name: "foo", Value: "bar"}, + }, + PodMetrics: &gpuv1.DCGMExporterPodMetricsConfig{ + EnablePodLabels: newBoolPtr(true), + }, + }, + DCGM: gpuv1.DCGMSpec{ + Enabled: newBoolPtr(true), + }, + }, + expectedDs: NewDaemonset().WithContainer(corev1.Container{ + Name: "dcgm-exporter", + Image: "nvcr.io/nvidia/cloud-native/dcgm-exporter:v1.0.0", + ImagePullPolicy: corev1.PullIfNotPresent, + Args: []string{"--fail-on-init-error=false"}, + Env: []corev1.EnvVar{ + {Name: "DCGM_REMOTE_HOSTENGINE_INFO", Value: "nvidia-dcgm:5555"}, + {Name: "DCGM_EXPORTER_KUBERNETES_ENABLE_POD_LABELS", Value: "true"}, + {Name: "foo", Value: "bar"}, + }, + }).WithContainer(corev1.Container{Name: "dummy"}).WithPullSecret("pull-secret").WithRuntimeClassName("nvidia").WithAutomountServiceAccountToken(true), + }, + { + description: "transform dcgm exporter with kubernetes pod labels disabled", + ds: NewDaemonset(). + WithContainer(corev1.Container{Name: "dcgm-exporter"}). 
+ WithContainer(corev1.Container{Name: "dummy"}), + cpSpec: &gpuv1.ClusterPolicySpec{ + DCGMExporter: gpuv1.DCGMExporterSpec{ + Repository: "nvcr.io/nvidia/cloud-native", + Image: "dcgm-exporter", + Version: "v1.0.0", + ImagePullPolicy: "IfNotPresent", + ImagePullSecrets: []string{"pull-secret"}, + Args: []string{"--fail-on-init-error=false"}, + Env: []gpuv1.EnvVar{ + {Name: "foo", Value: "bar"}, + }, + PodMetrics: &gpuv1.DCGMExporterPodMetricsConfig{ + EnablePodLabels: newBoolPtr(false), + }, + }, + DCGM: gpuv1.DCGMSpec{ + Enabled: newBoolPtr(true), + }, + }, + expectedDs: NewDaemonset().WithContainer(corev1.Container{ + Name: "dcgm-exporter", + Image: "nvcr.io/nvidia/cloud-native/dcgm-exporter:v1.0.0", + ImagePullPolicy: corev1.PullIfNotPresent, + Args: []string{"--fail-on-init-error=false"}, + Env: []corev1.EnvVar{ + {Name: "DCGM_REMOTE_HOSTENGINE_INFO", Value: "nvidia-dcgm:5555"}, + {Name: "foo", Value: "bar"}, + }, + }).WithContainer(corev1.Container{Name: "dummy"}).WithPullSecret("pull-secret").WithRuntimeClassName("nvidia"), + }, } for _, tc := range testCases { diff --git a/deployments/gpu-operator/templates/clusterpolicy.yaml b/deployments/gpu-operator/templates/clusterpolicy.yaml index 543089e17..7b68ff64d 100644 --- a/deployments/gpu-operator/templates/clusterpolicy.yaml +++ b/deployments/gpu-operator/templates/clusterpolicy.yaml @@ -551,6 +551,9 @@ spec: {{- if .Values.dcgmExporter.hpcJobMapping }} hpcJobMapping: {{ toYaml .Values.dcgmExporter.hpcJobMapping | nindent 6 }} {{- end }} + {{- if .Values.dcgmExporter.podMetrics }} + podMetrics: {{ toYaml .Values.dcgmExporter.podMetrics | nindent 6 }} + {{- end }} gfd: enabled: {{ .Values.gfd.enabled }} {{- if .Values.gfd.repository }} diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml index 68ed8eb23..b8114349f 100644 --- a/deployments/gpu-operator/values.yaml +++ b/deployments/gpu-operator/values.yaml @@ -285,6 +285,11 @@ dcgmExporter: # hpcJobMapping: # enabled: true # 
directory: /var/lib/dcgm-exporter/job-mapping + # Pod metrics enrichment settings + podMetrics: + # Enable pod labels in metrics. When enabled, metrics include the labels of the pods that are using GPUs. + # Enabling this grants dcgm-exporter cluster-wide read access (get/list/watch) to pods. + enablePodLabels: false service: internalTrafficPolicy: Cluster serviceMonitor: