From 6e29469f95208e98de9f7d95b4d4baa07bfc1ac4 Mon Sep 17 00:00:00 2001 From: Andrii Dema Date: Thu, 17 Apr 2025 17:18:46 +0300 Subject: [PATCH 1/5] K8SPG-703: add `internal.percona.com/keep-job` finalizer for backups https://perconadev.atlassian.net/browse/K8SPG-703 --- percona/controller/pgbackup/controller.go | 38 +++++++++++++++++++---- percona/controller/pgcluster/backup.go | 17 ++++++---- percona/naming/finalizers.go | 5 +++ 3 files changed, 48 insertions(+), 12 deletions(-) diff --git a/percona/controller/pgbackup/controller.go b/percona/controller/pgbackup/controller.go index 203b354ea..6d523b586 100644 --- a/percona/controller/pgbackup/controller.go +++ b/percona/controller/pgbackup/controller.go @@ -78,7 +78,21 @@ func (r *PGBackupReconciler) Reconcile(ctx context.Context, request reconcile.Re pgBackup.Default() - if !pgBackup.DeletionTimestamp.IsZero() { + if !pgBackup.DeletionTimestamp.IsZero() || pgBackup.Status.State == v2.BackupFailed || pgBackup.Status.State == v2.BackupSucceeded { + job, err := findBackupJob(ctx, r.Client, pgBackup) + if err == nil && len(job.Finalizers) > 0 { + if err := retry.RetryOnConflict(retry.DefaultBackoff, func() error { + j := new(batchv1.Job) + if err := r.Client.Get(ctx, client.ObjectKeyFromObject(job), j); err != nil { + return errors.Wrap(err, "get job") + } + j.Finalizers = []string{} + + return r.Client.Update(ctx, j) + }); err != nil { + return reconcile.Result{}, errors.Wrap(err, "update PGBackup status") + } + } if _, err := runFinalizers(ctx, r.Client, pgBackup); err != nil { return reconcile.Result{}, errors.Wrap(err, "failed to run finalizers") } @@ -123,7 +137,7 @@ func (r *PGBackupReconciler) Reconcile(ctx context.Context, request reconcile.Re } // start backup only if backup job doesn't exist - _, err = findBackupJob(ctx, r.Client, pgCluster, pgBackup) + _, err = findBackupJob(ctx, r.Client, pgBackup) if err != nil { if !errors.Is(err, ErrBackupJobNotFound) { return reconcile.Result{}, errors.Wrap(err, "find backup job") @@ -185,7 +199,7 @@ func (r *PGBackupReconciler) Reconcile(ctx context.Context, request reconcile.Re return reconcile.Result{}, errors.Wrap(err, "get PostgresCluster") } - job, err := findBackupJob(ctx, r.Client, pgCluster, pgBackup) + job, err := findBackupJob(ctx, r.Client, pgBackup) if err != nil { if errors.Is(err, ErrBackupJobNotFound) { log.Info("Waiting for backup to start") @@ -216,6 +230,18 @@ func (r *PGBackupReconciler) Reconcile(ctx context.Context, request reconcile.Re return reconcile.Result{}, errors.Wrap(err, "update PGBackup") } + if err := retry.RetryOnConflict(retry.DefaultBackoff, func() error { + j := new(batchv1.Job) + if err := r.Client.Get(ctx, client.ObjectKeyFromObject(job), j); err != nil { + return errors.Wrap(err, "get job") + } + j.Finalizers = append(j.Finalizers, pNaming.FinalizerKeepJob) + + return r.Client.Update(ctx, j) + }); err != nil { + return reconcile.Result{}, errors.Wrap(err, "update PGBackup status") + } + if err := retry.RetryOnConflict(retry.DefaultBackoff, func() error { bcp := new(v2.PerconaPGBackup) if err := r.Client.Get(ctx, types.NamespacedName{Name: pgBackup.Name, Namespace: pgBackup.Namespace}, bcp); err != nil { @@ -669,7 +695,7 @@ func startBackup(ctx context.Context, c client.Client, pb *v2.PerconaPGBackup) e return nil } -func findBackupJob(ctx context.Context, c client.Client, pg *v2.PerconaPGCluster, pb *v2.PerconaPGBackup) (*batchv1.Job, error) { +func findBackupJob(ctx context.Context, c client.Client, pb *v2.PerconaPGBackup) (*batchv1.Job, error) { if jobName := pb.GetAnnotations()[pNaming.AnnotationPGBackrestBackupJobName]; jobName != "" { job := new(batchv1.Job) err := c.Get(ctx, types.NamespacedName{Name: jobName, Namespace: pb.Namespace}, job) @@ -681,9 +707,9 @@ func findBackupJob(ctx context.Context, c client.Client, pg *v2.PerconaPGCluster jobList := &batchv1.JobList{} err := c.List(ctx, jobList, - client.InNamespace(pg.Namespace), + client.InNamespace(pb.Namespace), client.MatchingLabelsSelector{ - Selector: naming.PGBackRestBackupJobSelector(pg.Name, pb.Spec.RepoName, naming.BackupManual), + Selector: naming.PGBackRestBackupJobSelector(pb.Spec.PGCluster, pb.Spec.RepoName, naming.BackupManual), }) if err != nil { return nil, errors.Wrap(err, "get backup jobs") diff --git a/percona/controller/pgcluster/backup.go b/percona/controller/pgcluster/backup.go index 44639f1b1..5b432283e 100644 --- a/percona/controller/pgcluster/backup.go +++ b/percona/controller/pgcluster/backup.go @@ -5,6 +5,7 @@ import ( "github.com/pkg/errors" batchv1 "k8s.io/api/batch/v1" + k8serrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/util/retry" @@ -87,14 +88,18 @@ func (r *PGClusterReconciler) cleanupOutdatedBackups(ctx context.Context, cr *v2 // After the pg-backup is deleted, the job is not deleted immediately. // We need to set the DeletionTimestamp for a job so that `reconcileBackupJob` doesn't create a new pg-backup before the job deletion. job := new(batchv1.Job) - if err := r.Client.Get(ctx, types.NamespacedName{Name: pgBackup.Status.JobName, Namespace: pgBackup.Namespace}, job); err != nil { + err := r.Client.Get(ctx, types.NamespacedName{Name: pgBackup.Status.JobName, Namespace: pgBackup.Namespace}, job) + if client.IgnoreNotFound(err) != nil { return errors.Wrap(err, "get backup job") } - prop := metav1.DeletePropagationForeground - if err := r.Client.Delete(ctx, job, &client.DeleteOptions{ - PropagationPolicy: &prop, - }); err != nil { - return errors.Wrapf(err, "delete job %s/%s", job.Name, job.Namespace) + // The job may be deleted earlier due to ttlSecondsAfterFinished + if !k8serrors.IsNotFound(err) { + prop := metav1.DeletePropagationForeground + if err := r.Client.Delete(ctx, job, &client.DeleteOptions{ + PropagationPolicy: &prop, + }); err != nil { + return errors.Wrapf(err, "delete job %s/%s", job.Name, job.Namespace) + } } if err := r.Client.Delete(ctx, &pgBackup); err != nil { return errors.Wrapf(err, "delete backup %s/%s", pgBackup.Name, pgBackup.Namespace) diff --git a/percona/naming/finalizers.go b/percona/naming/finalizers.go index 604f809b3..d2fa2858b 100644 --- a/percona/naming/finalizers.go +++ b/percona/naming/finalizers.go @@ -19,3 +19,8 @@ const ( const ( FinalizerDeleteBackup = PrefixPerconaInternal + "delete-backup" //nolint:gosec ) + +// PerconaPGBackup job finalizers +const ( + FinalizerKeepJob = PrefixPerconaInternal + "keep-job" //nolint:gosec +) From c65991f84dce368235636341593ce2ab08a7003d Mon Sep 17 00:00:00 2001 From: Andrii Dema Date: Fri, 18 Apr 2025 17:28:49 +0300 Subject: [PATCH 2/5] fix --- percona/controller/pgbackup/controller.go | 49 +++++++++++++++-------- 1 file changed, 33 insertions(+), 16 deletions(-) diff --git a/percona/controller/pgbackup/controller.go b/percona/controller/pgbackup/controller.go index 6d523b586..c6a26e067 100644 --- a/percona/controller/pgbackup/controller.go +++ b/percona/controller/pgbackup/controller.go @@ -78,21 +78,7 @@ func (r *PGBackupReconciler) Reconcile(ctx context.Context, request reconcile.Re pgBackup.Default() - if !pgBackup.DeletionTimestamp.IsZero() || pgBackup.Status.State == v2.BackupFailed || pgBackup.Status.State == v2.BackupSucceeded { - job, err := findBackupJob(ctx, r.Client, pgBackup) - if err == nil && len(job.Finalizers) > 0 { - if err := retry.RetryOnConflict(retry.DefaultBackoff, func() error { - j := new(batchv1.Job) - if err := r.Client.Get(ctx, client.ObjectKeyFromObject(job), j); err != nil { - return errors.Wrap(err, "get job") - } - j.Finalizers = []string{} - - return r.Client.Update(ctx, j) - }); err != nil { - return reconcile.Result{}, errors.Wrap(err, "update PGBackup status") - } - } + if !pgBackup.DeletionTimestamp.IsZero() || pgBackup.Status.State == v2.BackupFailed { if _, err := runFinalizers(ctx, r.Client, pgBackup); err != nil { return reconcile.Result{}, errors.Wrap(err, "failed to run finalizers") } @@ -261,6 +247,22 @@ func (r *PGBackupReconciler) Reconcile(ctx context.Context, request reconcile.Re job := &batchv1.Job{} err := r.Client.Get(ctx, types.NamespacedName{Name: pgBackup.Status.JobName, Namespace: pgBackup.Namespace}, job) if err != nil { + // If something has deleted the job even with the finalizer, we should fail the backup. + if k8serrors.IsNotFound(err) { + if err := retry.RetryOnConflict(retry.DefaultBackoff, func() error { + bcp := new(v2.PerconaPGBackup) + if err := r.Client.Get(ctx, client.ObjectKeyFromObject(pgBackup), bcp); err != nil { + return errors.Wrap(err, "get PGBackup") + } + + bcp.Status.State = v2.BackupFailed + + return r.Client.Status().Update(ctx, bcp) + }); err != nil { + return reconcile.Result{}, errors.Wrap(err, "update PGBackup status") + } + return reconcile.Result{}, nil + } return reconcile.Result{}, errors.Wrap(err, "get backup job") } @@ -310,8 +312,23 @@ func (r *PGBackupReconciler) Reconcile(ctx context.Context, request reconcile.Re return reconcile.Result{}, nil case v2.BackupSucceeded: + job, err := findBackupJob(ctx, r.Client, pgBackup) + if err == nil && len(job.Finalizers) > 0 { + if err := retry.RetryOnConflict(retry.DefaultBackoff, func() error { + j := new(batchv1.Job) + if err := r.Client.Get(ctx, client.ObjectKeyFromObject(job), j); err != nil { + return errors.Wrap(err, "get job") + } + j.Finalizers = []string{} + + return r.Client.Update(ctx, j) + }); err != nil { + return reconcile.Result{}, errors.Wrap(err, "update PGBackup status") + } + } + pgCluster := &v2.PerconaPGCluster{} - err := r.Client.Get(ctx, types.NamespacedName{Name: pgBackup.Spec.PGCluster, Namespace: request.Namespace}, pgCluster) + err = r.Client.Get(ctx, types.NamespacedName{Name: pgBackup.Spec.PGCluster, Namespace: request.Namespace}, pgCluster) if err != nil { if k8serrors.IsNotFound(err) { return reconcile.Result{}, nil From 3cdef5964c8df81870bc797a3fbd603f559f4015 Mon Sep 17 00:00:00 2001 From: Andrii Dema Date: Mon, 21 Apr 2025 21:06:43 +0300 Subject: [PATCH 3/5] fix tests --- .../controller/postgrescluster/pgbackrest.go | 2 ++ percona/controller/pgbackup/controller.go | 17 ++++------------- 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/internal/controller/postgrescluster/pgbackrest.go b/internal/controller/postgrescluster/pgbackrest.go index 21dbcc84f..e9af9db9d 100644 --- a/internal/controller/postgrescluster/pgbackrest.go +++ b/internal/controller/postgrescluster/pgbackrest.go @@ -2491,6 +2491,8 @@ func (r *Reconciler) reconcileManualBackup(ctx context.Context, }) backupJob.ObjectMeta.Labels = labels backupJob.ObjectMeta.Annotations = annotations + // K8SPG-703 + backupJob.ObjectMeta.Finalizers = []string{pNaming.FinalizerKeepJob} // K8SPG-613 initImage, err := k8s.InitImage(ctx, r.Client, postgresCluster, &postgresCluster.Spec.Backups.PGBackRest) diff --git a/percona/controller/pgbackup/controller.go b/percona/controller/pgbackup/controller.go index 8754011d3..57c617f29 100644 --- a/percona/controller/pgbackup/controller.go +++ b/percona/controller/pgbackup/controller.go @@ -219,18 +219,6 @@ func (r *PGBackupReconciler) Reconcile(ctx context.Context, request reconcile.Re return reconcile.Result{}, errors.Wrap(err, "update PGBackup") } - if err := retry.RetryOnConflict(retry.DefaultBackoff, func() error { - j := new(batchv1.Job) - if err := r.Client.Get(ctx, client.ObjectKeyFromObject(job), j); err != nil { - return errors.Wrap(err, "get job") - } - j.Finalizers = append(j.Finalizers, pNaming.FinalizerKeepJob) - - return r.Client.Update(ctx, j) - }); err != nil { - return reconcile.Result{}, errors.Wrap(err, "update PGBackup status") - } - if err := updateStatus(ctx, r.Client, pgBackup, func(bcp *v2.PerconaPGBackup) { bcp.Status.State = v2.BackupRunning bcp.Status.JobName = job.Name @@ -673,7 +661,10 @@ func startBackup(ctx context.Context, c client.Client, pb *v2.PerconaPGBackup) e } func findBackupJob(ctx context.Context, c client.Client, pb *v2.PerconaPGBackup) (*batchv1.Job, error) { - if jobName := pb.GetAnnotations()[pNaming.AnnotationPGBackrestBackupJobName]; jobName != "" { + if jobName := pb.GetAnnotations()[pNaming.AnnotationPGBackrestBackupJobName]; jobName != "" || pb.Status.JobName != "" { + if jobName == "" { + jobName = pb.Status.JobName + } job := new(batchv1.Job) err := c.Get(ctx, types.NamespacedName{Name: jobName, Namespace: pb.Namespace}, job) if err != nil { From 94b3bad43f04a1293a3745cb4da04c1fa943c970 Mon Sep 17 00:00:00 2001 From: Andrii Dema Date: Tue, 22 Apr 2025 09:48:45 +0300 Subject: [PATCH 4/5] fix lint --- internal/controller/postgrescluster/pgbackrest.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/controller/postgrescluster/pgbackrest.go b/internal/controller/postgrescluster/pgbackrest.go index e9af9db9d..d470daab3 100644 --- a/internal/controller/postgrescluster/pgbackrest.go +++ b/internal/controller/postgrescluster/pgbackrest.go @@ -2492,7 +2492,7 @@ func (r *Reconciler) reconcileManualBackup(ctx context.Context, backupJob.ObjectMeta.Labels = labels backupJob.ObjectMeta.Annotations = annotations // K8SPG-703 - backupJob.ObjectMeta.Finalizers = []string{pNaming.FinalizerKeepJob} + backupJob.Finalizers = []string{pNaming.FinalizerKeepJob} // K8SPG-613 initImage, err := k8s.InitImage(ctx, r.Client, postgresCluster, &postgresCluster.Spec.Backups.PGBackRest) From 3e3831927501b9d6602d394ff1a8ca126c053968 Mon Sep 17 00:00:00 2001 From: Andrii Dema Date: Fri, 9 May 2025 13:21:12 +0300 Subject: [PATCH 5/5] fix --- percona/controller/pgbackup/controller.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/percona/controller/pgbackup/controller.go b/percona/controller/pgbackup/controller.go index 57c617f29..0661635c9 100644 --- a/percona/controller/pgbackup/controller.go +++ b/percona/controller/pgbackup/controller.go @@ -3,6 +3,7 @@ package pgbackup import ( "context" "path" + "slices" "strings" "time" @@ -275,13 +276,13 @@ func (r *PGBackupReconciler) Reconcile(ctx context.Context, request reconcile.Re return reconcile.Result{}, nil case v2.BackupSucceeded: job, err := findBackupJob(ctx, r.Client, pgBackup) - if err == nil && len(job.Finalizers) > 0 { + if err == nil && slices.Contains(job.Finalizers, pNaming.FinalizerKeepJob) { if err := retry.RetryOnConflict(retry.DefaultBackoff, func() error { j := new(batchv1.Job) if err := r.Client.Get(ctx, client.ObjectKeyFromObject(job), j); err != nil { return errors.Wrap(err, "get job") } - j.Finalizers = []string{} + j.Finalizers = slices.DeleteFunc(j.Finalizers, func(s string) bool { return s == pNaming.FinalizerKeepJob }) return r.Client.Update(ctx, j) }); err != nil {