From 2b3bf753a158380e61b98d537e39748b2509b0d2 Mon Sep 17 00:00:00 2001 From: David Caravello <119438707+dcaravel@users.noreply.github.com> Date: Fri, 6 Feb 2026 15:16:26 -0600 Subject: [PATCH 1/4] optimize temp DB --- scripts/ci/lib.sh | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/scripts/ci/lib.sh b/scripts/ci/lib.sh index 366bb31ba..af4703a62 100755 --- a/scripts/ci/lib.sh +++ b/scripts/ci/lib.sh @@ -911,6 +911,22 @@ generate_db_dump() { # The PATH is not completely preserved, so set the PATH here to ensure postgres-related commands can be found. runuser -l pg -c "PATH=$PATH $SCRIPTS_ROOT/scripts/ci/postgres.sh start_postgres" + # Configure PostgreSQL for bulk loading performance + # These settings are safe for CI because: + # - The database is temporary (destroyed after dump creation) + # - Transaction commits ensure data visibility regardless of disk sync + # - Any failure causes the entire CI job to fail + info "Configuring PostgreSQL for bulk loading" + psql -U postgres -h 127.0.0.1 -c "ALTER SYSTEM SET fsync = off;" + psql -U postgres -h 127.0.0.1 -c "ALTER SYSTEM SET synchronous_commit = off;" + psql -U postgres -h 127.0.0.1 -c "ALTER SYSTEM SET full_page_writes = off;" + psql -U postgres -h 127.0.0.1 -c "ALTER SYSTEM SET maintenance_work_mem = '1GB';" + psql -U postgres -h 127.0.0.1 -c "ALTER SYSTEM SET max_wal_size = '2GB';" + psql -U postgres -h 127.0.0.1 -c "ALTER SYSTEM SET checkpoint_timeout = '30min';" + psql -U postgres -h 127.0.0.1 -c "ALTER SYSTEM SET autovacuum = off;" + psql -U postgres -h 127.0.0.1 -c "SELECT pg_reload_conf();" + info "PostgreSQL configured for bulk loading" + bin/updater load-dump --postgres-host 127.0.0.1 --postgres-port 5432 --dump-file /tmp/genesis-dump/genesis-dump.zip mkdir /tmp/postgres From 6faa8c637d51b890508eb1dd31b9f43e5f9afcd4 Mon Sep 17 00:00:00 2001 From: David Caravello <119438707+dcaravel@users.noreply.github.com> Date: Fri, 6 Feb 2026 15:37:46 -0600 Subject: [PATCH 2/4] empty commit to 
trigger CI From 6bf5c9f287f80966084cae9556a439ea5429baf7 Mon Sep 17 00:00:00 2001 From: David Caravello <119438707+dcaravel@users.noreply.github.com> Date: Fri, 6 Feb 2026 17:59:34 -0600 Subject: [PATCH 3/4] insert via batches --- database/pgsql/vulnerability.go | 7 + database/pgsql/vulnerability_batch.go | 190 ++++++++++++++++++++++++++ scripts/ci/lib.sh | 6 +- 3 files changed, 202 insertions(+), 1 deletion(-) create mode 100644 database/pgsql/vulnerability_batch.go diff --git a/database/pgsql/vulnerability.go b/database/pgsql/vulnerability.go index 6ebadc5a9..4c4bb9cc7 100644 --- a/database/pgsql/vulnerability.go +++ b/database/pgsql/vulnerability.go @@ -164,6 +164,13 @@ func (pgSQL *pgSQL) InsertVulnerabilities(vulnerabilities []database.Vulnerabili if len(vulnerabilities) > printEveryWhenInsertingVulns { log.WithField("count", len(vulnerabilities)).Info("Inserting vulnerabilities") } + + // Use batched inserts in CI mode for better performance + if shouldUseBatchInsert() { + return pgSQL.insertVulnerabilitiesBatched(vulnerabilities) + } + + // Standard path: one transaction per vulnerability (production mode) for i, vulnerability := range vulnerabilities { err := pgSQL.insertVulnerability(vulnerability, false) if err != nil { diff --git a/database/pgsql/vulnerability_batch.go b/database/pgsql/vulnerability_batch.go new file mode 100644 index 000000000..694d987ab --- /dev/null +++ b/database/pgsql/vulnerability_batch.go @@ -0,0 +1,190 @@ +package pgsql + +import ( + "database/sql" + "os" + "reflect" + "strconv" + + log "github.com/sirupsen/logrus" + "github.com/stackrox/scanner/database" + "github.com/stackrox/scanner/ext/versionfmt" + "github.com/stackrox/scanner/pkg/commonerr" +) + +const ( + // batchInsertEnvVar Environment variable to enable batched vulnerability inserts (CI mode) + batchInsertEnvVar = "SCANNER_BATCH_INSERT" + // defaultBatchSize Default batch size for bulk inserts + defaultBatchSize = 1000 +) + +// shouldUseBatchInsert determines if batched 
inserts should be used based on environment variable. +// This is intended for CI environments where we're loading into an empty database. +func shouldUseBatchInsert() bool { + value := os.Getenv(batchInsertEnvVar) + return value == "true" || value == "1" +} + +// getBatchSize returns the batch size from environment or default. +func getBatchSize() int { + if sizeStr := os.Getenv("SCANNER_BATCH_SIZE"); sizeStr != "" { + if size, err := strconv.Atoi(sizeStr); err == nil && size > 0 { + return size + } + } + return defaultBatchSize +} + +// insertVulnerabilitiesBatched inserts vulnerabilities in batches using a single transaction per batch. +// This is significantly faster for bulk loading scenarios (like CI) where the database is empty +// and we don't need to handle complex merge logic. +// +// IMPORTANT: This optimization is only safe when: +// 1. Loading into an empty database (no existing vulnerabilities to merge) +// 2. All vulnerabilities are new inserts (not updates) +// 3. The SCANNER_BATCH_INSERT environment variable is explicitly set +func (pgSQL *pgSQL) insertVulnerabilitiesBatched(vulnerabilities []database.Vulnerability) error { + batchSize := getBatchSize() + totalVulns := len(vulnerabilities) + + log.WithFields(log.Fields{ + "total": totalVulns, + "batch_size": batchSize, + }).Info("Using batched vulnerability inserts (CI mode)") + + for batchStart := 0; batchStart < totalVulns; batchStart += batchSize { + batchEnd := batchStart + batchSize + if batchEnd > totalVulns { + batchEnd = totalVulns + } + + batch := vulnerabilities[batchStart:batchEnd] + + if err := pgSQL.insertVulnerabilityBatch(batch); err != nil { + return err + } + + if (batchEnd)%printEveryWhenInsertingVulns == 0 { + log.Infof("Inserted %d/%d vulns (batched)", batchEnd, totalVulns) + } + } + + return nil +} + +// insertVulnerabilityBatch inserts a batch of vulnerabilities within a single transaction. 
+// This reuses the existing insertVulnerability logic but amortizes transaction overhead +// across multiple vulnerabilities. +func (pgSQL *pgSQL) insertVulnerabilityBatch(batch []database.Vulnerability) error { + // Begin a single transaction for the entire batch + tx, err := pgSQL.Begin() + if err != nil { + return handleError("insertVulnerabilityBatch.Begin()", err) + } + + // Process each vulnerability using existing logic, but within the shared transaction + for i := range batch { + // We use the existing insertVulnerability logic, but we need to work around + // the fact that it creates its own transaction. The approach is to inline + // the core logic here. + if err := pgSQL.insertSingleVulnerabilityInTx(tx, batch[i]); err != nil { + tx.Rollback() + return err + } + } + + // Commit the entire batch at once + if err := tx.Commit(); err != nil { + tx.Rollback() + return handleError("insertVulnerabilityBatch.Commit()", err) + } + + return nil +} + +// insertSingleVulnerabilityInTx inserts a single vulnerability within an existing transaction. +// This is the core logic from insertVulnerability, but adapted to work with an externally-managed transaction. 
+func (pgSQL *pgSQL) insertSingleVulnerabilityInTx(tx *sql.Tx, vulnerability database.Vulnerability) error { + // Verify parameters + if vulnerability.Name == "" || vulnerability.Namespace.Name == "" { + return commonerr.NewBadRequestError("insertVulnerability needs at least the Name and the Namespace") + } + + // Validate FixedIn namespaces + for i := 0; i < len(vulnerability.FixedIn); i++ { + fifv := &vulnerability.FixedIn[i] + if fifv.Feature.Namespace.Name == "" { + fifv.Feature.Namespace = vulnerability.Namespace + } else if fifv.Feature.Namespace.Name != vulnerability.Namespace.Name { + msg := "could not insert an invalid vulnerability that contains FixedIn FeatureVersion that are not in the same namespace as the Vulnerability" + log.Warning(msg) + return commonerr.NewBadRequestError(msg) + } + } + + // Find existing vulnerability (for update case) + existingVulnerability, err := findVulnerability(tx, vulnerability.Namespace.Name, vulnerability.Name, true) + if err != nil && err != commonerr.ErrNotFound { + return err + } + + // Handle update case (merge with existing) + if existingVulnerability.ID != 0 { + updateMetadata := vulnerability.Description != existingVulnerability.Description || + vulnerability.Link != existingVulnerability.Link || + vulnerability.Severity != existingVulnerability.Severity || + !reflect.DeepEqual(castMetadata(vulnerability.Metadata), existingVulnerability.Metadata) + + var updateFixedIn bool + vulnerability.FixedIn, updateFixedIn = applyFixedInDiff(existingVulnerability.FixedIn, vulnerability.FixedIn) + + if !updateMetadata && !updateFixedIn { + // No changes needed + return nil + } + + // Mark old vulnerability as non-latest + _, err = tx.Exec(removeVulnerability, vulnerability.Namespace.Name, vulnerability.Name) + if err != nil { + return handleError("removeVulnerability", err) + } + } else { + // New vulnerability - remove MinVersion entries + var fixedIn []database.FeatureVersion + for _, fv := range vulnerability.FixedIn { + 
if fv.Version != versionfmt.MinVersion { + fixedIn = append(fixedIn, fv) + } + } + vulnerability.FixedIn = fixedIn + } + + // Find or insert namespace + namespaceID, err := pgSQL.insertNamespace(vulnerability.Namespace) + if err != nil { + return err + } + + // Insert vulnerability + err = tx.QueryRow( + insertVulnerability, + namespaceID, + vulnerability.Name, + vulnerability.Description, + vulnerability.Link, + &vulnerability.Severity, + &vulnerability.Metadata, + ).Scan(&vulnerability.ID) + if err != nil { + return handleError("insertVulnerability", err) + } + + // Insert FixedIn feature versions + err = pgSQL.insertVulnerabilityFixedInFeatureVersions(tx, vulnerability.ID, vulnerability.FixedIn) + if err != nil { + return err + } + + return nil +} diff --git a/scripts/ci/lib.sh b/scripts/ci/lib.sh index af4703a62..97637fee2 100755 --- a/scripts/ci/lib.sh +++ b/scripts/ci/lib.sh @@ -921,12 +921,16 @@ generate_db_dump() { psql -U postgres -h 127.0.0.1 -c "ALTER SYSTEM SET synchronous_commit = off;" psql -U postgres -h 127.0.0.1 -c "ALTER SYSTEM SET full_page_writes = off;" psql -U postgres -h 127.0.0.1 -c "ALTER SYSTEM SET maintenance_work_mem = '1GB';" - psql -U postgres -h 127.0.0.1 -c "ALTER SYSTEM SET max_wal_size = '2GB';" + psql -U postgres -h 127.0.0.1 -c "ALTER SYSTEM SET max_wal_size = '6GB';" psql -U postgres -h 127.0.0.1 -c "ALTER SYSTEM SET checkpoint_timeout = '30min';" psql -U postgres -h 127.0.0.1 -c "ALTER SYSTEM SET autovacuum = off;" psql -U postgres -h 127.0.0.1 -c "SELECT pg_reload_conf();" info "PostgreSQL configured for bulk loading" + # Enable batched inserts for CI performance (loads into empty database) + export SCANNER_BATCH_INSERT=true + export SCANNER_BATCH_SIZE=1000 + bin/updater load-dump --postgres-host 127.0.0.1 --postgres-port 5432 --dump-file /tmp/genesis-dump/genesis-dump.zip mkdir /tmp/postgres From b3c0f2de4f939f4570d09320e3e38e1182f515b1 Mon Sep 17 00:00:00 2001 From: David Caravello 
<119438707+dcaravel@users.noreply.github.com> Date: Fri, 6 Feb 2026 23:06:32 -0600 Subject: [PATCH 4/4] drop/recreate indexes --- scripts/ci/lib.sh | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/scripts/ci/lib.sh b/scripts/ci/lib.sh index 97637fee2..413e97b8e 100755 --- a/scripts/ci/lib.sh +++ b/scripts/ci/lib.sh @@ -927,12 +927,33 @@ generate_db_dump() { psql -U postgres -h 127.0.0.1 -c "SELECT pg_reload_conf();" info "PostgreSQL configured for bulk loading" + # Drop indexes before bulk loading to avoid index bloat during inserts + # This provides ~5-10x speedup for large vulnerability datasets (340k+) + # Indexes will be recreated after loading (faster to build in one pass) + info "Dropping indexes for bulk loading" + psql -U postgres -h 127.0.0.1 -c "DROP INDEX IF EXISTS vulnerability_name_idx;" + psql -U postgres -h 127.0.0.1 -c "DROP INDEX IF EXISTS vulnerability_namespace_id_name_idx;" + psql -U postgres -h 127.0.0.1 -c "DROP INDEX IF EXISTS vulnerability_fixedin_feature_feature_id_vulnerability_id_idx;" + psql -U postgres -h 127.0.0.1 -c "DROP INDEX IF EXISTS vulnerability_affects_featureversion_fixedin_id_idx;" + psql -U postgres -h 127.0.0.1 -c "DROP INDEX IF EXISTS vulnerability_affects_featureversion_featureversion_id_vulnera_idx;" + info "Indexes dropped" + # Enable batched inserts for CI performance (loads into empty database) export SCANNER_BATCH_INSERT=true export SCANNER_BATCH_SIZE=1000 bin/updater load-dump --postgres-host 127.0.0.1 --postgres-port 5432 --dump-file /tmp/genesis-dump/genesis-dump.zip + # Recreate indexes after bulk loading + # Building indexes on complete data is much faster than maintaining them during inserts + info "Recreating indexes after bulk load" + psql -U postgres -h 127.0.0.1 -c "CREATE INDEX vulnerability_name_idx ON Vulnerability (name);" + psql -U postgres -h 127.0.0.1 -c "CREATE INDEX vulnerability_namespace_id_name_idx ON Vulnerability (namespace_id, name);" + psql -U postgres -h 127.0.0.1 -c 
"CREATE INDEX ON Vulnerability_FixedIn_Feature (feature_id, vulnerability_id);" + psql -U postgres -h 127.0.0.1 -c "CREATE INDEX ON Vulnerability_Affects_FeatureVersion (fixedin_id);" + psql -U postgres -h 127.0.0.1 -c "CREATE INDEX ON Vulnerability_Affects_FeatureVersion (featureversion_id, vulnerability_id);" + info "Indexes recreated" + mkdir /tmp/postgres pg_dump -U postgres postgres://127.0.0.1:5432 > /tmp/postgres/pg-definitions.sql ls -lrt /tmp/postgres