Skip to content

Commit

Permalink
Proper shutdown
Browse files Browse the repository at this point in the history
  • Loading branch information
shayonj committed Oct 15, 2024
1 parent daf88f5 commit 185dd69
Show file tree
Hide file tree
Showing 6 changed files with 122 additions and 332 deletions.
71 changes: 29 additions & 42 deletions pkg/replicator/base_replicator.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,9 +123,10 @@ func (r *BaseReplicator) checkPublicationExists(publicationName string) (bool, e
}

// StartReplicationFromLSN initiates the replication process from a given LSN
func (r *BaseReplicator) StartReplicationFromLSN(ctx context.Context, startLSN pglogrepl.LSN) error {
func (r *BaseReplicator) StartReplicationFromLSN(ctx context.Context, startLSN pglogrepl.LSN, stopChan <-chan struct{}) error {
publicationName := GeneratePublicationName(r.Config.Group)
r.Logger.Info().Str("startLSN", startLSN.String()).Str("publication", publicationName).Msg("Starting replication")

err := r.ReplicationConn.StartReplication(ctx, publicationName, startLSN, pglogrepl.StartReplicationOptions{
PluginArgs: []string{
"proto_version '1'",
Expand All @@ -138,30 +139,21 @@ func (r *BaseReplicator) StartReplicationFromLSN(ctx context.Context, startLSN p

r.Logger.Info().Str("startLSN", startLSN.String()).Msg("Replication started successfully")

errChan := make(chan error, 1)
go func() {
errChan <- r.StreamChanges(ctx)
}()

select {
case <-ctx.Done():
return r.gracefulShutdown()
case err := <-errChan:
if err != nil && !errors.Is(err, context.Canceled) {
return err
}
return nil
}
return r.StreamChanges(ctx, stopChan)
}

// StreamChanges continuously processes replication messages
func (r *BaseReplicator) StreamChanges(ctx context.Context) error {
func (r *BaseReplicator) StreamChanges(ctx context.Context, stopChan <-chan struct{}) error {
lastStatusUpdate := time.Now()
standbyMessageTimeout := time.Second * 10

for {
select {
case <-ctx.Done():
r.Logger.Info().Msg("Context canceled, stopping StreamChanges")
return nil
case <-stopChan:
r.Logger.Info().Msg("Stop signal received, exiting StreamChanges")
return nil
default:
if err := r.ProcessNextMessage(ctx, &lastStatusUpdate, standbyMessageTimeout); err != nil {
Expand All @@ -178,10 +170,12 @@ func (r *BaseReplicator) StreamChanges(ctx context.Context) error {
func (r *BaseReplicator) ProcessNextMessage(ctx context.Context, lastStatusUpdate *time.Time, standbyMessageTimeout time.Duration) error {
msg, err := r.ReplicationConn.ReceiveMessage(ctx)
if err != nil {
if ctx.Err() != nil {
return fmt.Errorf("context error while receiving message: %w", ctx.Err())
if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
r.Logger.Info().Msg("Context canceled or deadline exceeded, stopping message processing")
return nil
}
return fmt.Errorf("failed to receive message: %w", err)
r.Logger.Error().Err(err).Msg("Error processing next message")
return err
}

switch msg := msg.(type) {
Expand Down Expand Up @@ -414,19 +408,6 @@ func (r *BaseReplicator) SendStandbyStatusUpdate(ctx context.Context) error {
return nil
}

// SendFinalStandbyStatusUpdate sends a final status update before shutting down
func (r *BaseReplicator) SendFinalStandbyStatusUpdate() error {
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()

if err := r.SendStandbyStatusUpdate(ctx); err != nil {
return fmt.Errorf("failed to send standby status update: %w", err)
}

r.Logger.Info().Msg("Sent final standby status update")
return nil
}

// CreateReplicationSlot ensures that a replication slot exists, creating one if necessary
func (r *BaseReplicator) CreateReplicationSlot(ctx context.Context) error {
publicationName := GeneratePublicationName(r.Config.Group)
Expand Down Expand Up @@ -463,28 +444,34 @@ func (r *BaseReplicator) CheckReplicationSlotExists(slotName string) (bool, erro
return exists, nil
}

// gracefulShutdown performs a graceful shutdown of the replicator
func (r *BaseReplicator) gracefulShutdown() error {
// GracefulShutdown performs a graceful shutdown of the replicator
func (r *BaseReplicator) GracefulShutdown(ctx context.Context) error {
r.Logger.Info().Msg("Initiating graceful shutdown")

if err := r.SendFinalStandbyStatusUpdate(); err != nil {
r.Logger.Error().Err(err).Msg("Failed to send final standby status update")
if err := r.SendStandbyStatusUpdate(ctx); err != nil {
r.Logger.Warn().Err(err).Msg("Failed to send final standby status update")
}

if err := r.closeConnections(); err != nil {
r.Logger.Error().Err(err).Msg("Failed to close connections")
if err := r.SaveState(ctx, r.LastLSN); err != nil {
r.Logger.Warn().Err(err).Msg("Failed to save final state")
}

r.Logger.Info().Msg("Graceful shutdown completed")
if err := r.closeConnections(ctx); err != nil {
r.Logger.Warn().Err(err).Msg("Failed to close connections")
}

r.Logger.Info().Msg("Base replicator shutdown completed")
return nil
}

// closeConnections closes all open database connections
func (r *BaseReplicator) closeConnections() error {
if err := r.ReplicationConn.Close(context.Background()); err != nil {
func (r *BaseReplicator) closeConnections(ctx context.Context) error {
r.Logger.Info().Msg("Closing database connections")

if err := r.ReplicationConn.Close(ctx); err != nil {
return fmt.Errorf("failed to close replication connection: %w", err)
}
if err := r.StandardConn.Close(context.Background()); err != nil {
if err := r.StandardConn.Close(ctx); err != nil {
return fmt.Errorf("failed to close standard connection: %w", err)
}
return nil
Expand Down
45 changes: 32 additions & 13 deletions pkg/replicator/copy_and_stream_replicator.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,11 @@ type CopyAndStreamReplicator struct {

// StartReplication begins the replication process.
func (r *CopyAndStreamReplicator) StartReplication() error {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
ctx := context.Background()

sigChan := make(chan os.Signal, 1)
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)

go r.handleShutdownSignal(sigChan, cancel)

if err := r.BaseReplicator.CreatePublication(); err != nil {
return fmt.Errorf("failed to create publication: %v", err)
}
Expand All @@ -53,7 +50,6 @@ func (r *CopyAndStreamReplicator) StartReplication() error {
ddlCtx, ddlCancel = context.WithCancel(ctx)
go r.DDLReplicator.StartDDLReplication(ddlCtx)
}

defer func() {
if r.Config.TrackDDL {
ddlCancel()
Expand All @@ -63,20 +59,43 @@ func (r *CopyAndStreamReplicator) StartReplication() error {
}
}()

if copyErr := r.ParallelCopy(context.Background()); copyErr != nil {
if copyErr := r.ParallelCopy(ctx); copyErr != nil {
return fmt.Errorf("failed to perform parallel copy: %v", copyErr)
}
startLSN := r.BaseReplicator.LastLSN

r.Logger.Info().Str("startLSN", startLSN.String()).Msg("Starting replication from LSN")
return r.BaseReplicator.StartReplicationFromLSN(ctx, startLSN)
}

// handleShutdownSignal waits for a shutdown signal and cancels the context.
func (r *CopyAndStreamReplicator) handleShutdownSignal(sigChan <-chan os.Signal, cancel context.CancelFunc) {
sig := <-sigChan
r.Logger.Info().Str("signal", sig.String()).Msg("Received shutdown signal")
cancel()
// Create a stop channel for graceful shutdown
stopChan := make(chan struct{})
errChan := make(chan error, 1)
go func() {
errChan <- r.BaseReplicator.StartReplicationFromLSN(ctx, startLSN, stopChan)
}()

select {
case <-sigChan:
r.Logger.Info().Msg("Received shutdown signal")
// Signal replication loop to stop
close(stopChan)
// Wait for replication loop to exit
<-errChan

// Proceed with graceful shutdown
shutdownCtx, cancelShutdown := context.WithTimeout(context.Background(), 10*time.Second)
defer cancelShutdown()
if err := r.BaseReplicator.GracefulShutdown(shutdownCtx); err != nil {
r.Logger.Error().Err(err).Msg("Error during graceful shutdown")
return err
}
case err := <-errChan:
if err != nil {
r.Logger.Error().Err(err).Msg("Replication ended with error")
return err
}
}

return nil
}

// ParallelCopy performs a parallel copy of all specified tables.
Expand Down
77 changes: 46 additions & 31 deletions pkg/replicator/stream_replicator.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"os"
"os/signal"
"syscall"
"time"

"github.com/jackc/pglogrepl"
)
Expand All @@ -17,15 +18,54 @@ type StreamReplicator struct {

// StartReplication begins the replication process.
func (r *StreamReplicator) StartReplication() error {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
ctx := context.Background()

// Set up signal handling for graceful shutdown
sigChan := make(chan os.Signal, 1)
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)

go r.handleShutdownSignal(sigChan, cancel)
if err := r.setup(ctx); err != nil {
return err
}

startLSN, err := r.getStartLSN(ctx)
if err != nil {
return err
}

r.Logger.Info().Str("startLSN", startLSN.String()).Msg("Starting replication from LSN")

stopChan := make(chan struct{})
errChan := make(chan error, 1)
go func() {
errChan <- r.BaseReplicator.StartReplicationFromLSN(ctx, startLSN, stopChan)
}()

select {
case <-sigChan:
r.Logger.Info().Msg("Received shutdown signal")
// Signal the replication loop to stop
close(stopChan)
// Wait for the replication loop to exit
<-errChan

// Proceed with graceful shutdown
shutdownCtx, cancelShutdown := context.WithTimeout(context.Background(), 10*time.Second)
defer cancelShutdown()
if err := r.BaseReplicator.GracefulShutdown(shutdownCtx); err != nil {
r.Logger.Error().Err(err).Msg("Error during graceful shutdown")
return err
}
case err := <-errChan:
if err != nil {
r.Logger.Error().Err(err).Msg("Replication ended with error")
return err
}
}

return nil
}

func (r *StreamReplicator) setup(ctx context.Context) error {
if err := r.BaseReplicator.CreatePublication(); err != nil {
return fmt.Errorf("failed to create publication: %v", err)
}
Expand All @@ -34,43 +74,18 @@ func (r *StreamReplicator) StartReplication() error {
return fmt.Errorf("failed to create replication slot: %v", err)
}

var ddlCancel context.CancelFunc
if r.Config.TrackDDL {
if err := r.DDLReplicator.SetupDDLTracking(ctx); err != nil {
return fmt.Errorf("failed to set up DDL tracking: %v", err)
}
var ddlCtx context.Context
ddlCtx, ddlCancel = context.WithCancel(ctx)
go r.DDLReplicator.StartDDLReplication(ddlCtx)
go r.DDLReplicator.StartDDLReplication(ctx)
}

defer func() {
if r.Config.TrackDDL {
ddlCancel()
if err := r.DDLReplicator.Shutdown(context.Background()); err != nil {
r.Logger.Error().Err(err).Msg("Failed to shutdown DDL replicator")
}
}
}()

if err := r.BaseReplicator.CheckReplicationSlotStatus(ctx); err != nil {
return fmt.Errorf("failed to check replication slot status: %v", err)
}

startLSN, err := r.getStartLSN(ctx)
if err != nil {
return err
}

r.Logger.Info().Str("startLSN", startLSN.String()).Msg("Starting replication from LSN")
return r.BaseReplicator.StartReplicationFromLSN(ctx, startLSN)
}

// handleShutdownSignal waits for a shutdown signal and cancels the context.
func (r *StreamReplicator) handleShutdownSignal(sigChan <-chan os.Signal, cancel context.CancelFunc) {
sig := <-sigChan
r.Logger.Info().Str("signal", sig.String()).Msg("Received shutdown signal")
cancel()
return nil
}

// getStartLSN determines the starting LSN for replication.
Expand Down
Loading

0 comments on commit 185dd69

Please sign in to comment.