feat: implement loader error handling and refactor chunk processing in migration job

This commit is contained in:
2026-04-08 23:42:31 -05:00
parent 0ee5d9032c
commit dc632361e5
4 changed files with 134 additions and 40 deletions

View File

@@ -81,20 +81,18 @@ func ExtractorErrorFromLastRowMssql(lastRow UnknownRowValues, indexPrimaryKey in
if !ok { if !ok {
currentBatch := *batch currentBatch := *batch
currentBatch.RetryCounter = maxRetryAttempts currentBatch.RetryCounter = maxRetryAttempts
exError := ExtractorError{ return ExtractorError{
Batch: currentBatch, Batch: currentBatch,
HasLastId: true, HasLastId: true,
Msg: fmt.Sprintf("Couldn't cast last id value as int: %s", previousError.Error()), Msg: fmt.Sprintf("Couldn't cast last id value as int: %s", previousError.Error()),
} }
return exError
} }
exError := ExtractorError{ return ExtractorError{
Batch: *batch, Batch: *batch,
HasLastId: true, HasLastId: true,
LastId: lastId, LastId: lastId,
Msg: previousError.Error(), Msg: previousError.Error(),
} }
return exError
} }

View File

@@ -0,0 +1,61 @@
package main
import (
"context"
"fmt"
)
type LoaderError struct {
Chunk
Msg string
}
func (e *LoaderError) Error() string {
return e.Msg
}
func loaderErrorHandler(
ctx context.Context,
chErrorsIn <-chan LoaderError,
chChunksOut chan<- Chunk,
chJobErrorsOut chan<- JobError,
) {
for {
if ctx.Err() != nil {
return
}
select {
case <-ctx.Done():
return
case err, ok := <-chErrorsIn:
if !ok {
return
}
if err.RetryCounter >= maxRetryAttempts {
jobError := JobError{
ShouldCancelJob: false,
Msg: fmt.Sprintf("chunk %v reached max retries (%d)", err.Id, maxRetryAttempts),
Prev: &err,
}
select {
case chJobErrorsOut <- jobError:
case <-ctx.Done():
return
}
continue
}
err.RetryCounter++
select {
case chChunksOut <- err.Chunk:
case <-ctx.Done():
return
}
}
}
}

View File

@@ -12,18 +12,48 @@ import (
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
) )
func loadRowsPostgres(ctx context.Context, job MigrationJob, columns []ColumnType, db *pgxpool.Pool, chChunksIn <-chan Chunk) error { func loadRowsPostgres(
chunkCount := 0 ctx context.Context,
totalRowsLoaded := 0 db *pgxpool.Pool,
job MigrationJob,
for chunk := range chChunksIn { columns []ColumnType,
chunkStartTime := time.Now() chChunksIn <-chan Chunk,
identifier := pgx.Identifier{job.Schema, job.Table} chErrorsOut chan<- LoaderError,
) {
tableId := pgx.Identifier{job.Schema, job.Table}
colNames := Map(columns, func(col ColumnType) string { colNames := Map(columns, func(col ColumnType) string {
return col.name return col.name
}) })
copyStartTime := time.Now() for {
if ctx.Err() != nil {
return
}
select {
case <-ctx.Done():
return
case chunk, ok := <-chChunksIn:
if !ok {
return
}
if abort := loadChunkPostgres(ctx, db, tableId, colNames, chunk, chErrorsOut); abort {
return
}
}
}
}
func loadChunkPostgres(
ctx context.Context,
db *pgxpool.Pool,
identifier pgx.Identifier,
colNames []string,
chunk Chunk,
chErrorsOut chan<- LoaderError,
) (abort bool) {
chunkStartTime := time.Now()
_, err := db.CopyFrom( _, err := db.CopyFrom(
ctx, ctx,
identifier, identifier,
@@ -32,19 +62,20 @@ func loadRowsPostgres(ctx context.Context, job MigrationJob, columns []ColumnTyp
) )
if err != nil { if err != nil {
return err select {
case chErrorsOut <- LoaderError{Chunk: chunk, Msg: err.Error()}:
case <-ctx.Done():
return true
}
return false
} }
chunkCount++
totalRowsLoaded += len(chunk.Data)
copyDuration := time.Since(copyStartTime)
chunkDuration := time.Since(chunkStartTime) chunkDuration := time.Since(chunkStartTime)
rowsPerSec := float64(len(chunk.Data)) / chunkDuration.Seconds() rowsPerSec := float64(len(chunk.Data)) / chunkDuration.Seconds()
log.Infof("Loaded chunk #%d: %d rows in %v (copy: %v, %.0f rows/sec) - Total: %d rows", chunkCount, len(chunk.Data), chunkDuration, copyDuration, rowsPerSec, totalRowsLoaded) log.Infof("Loaded chunk: %d rows in %v (%.0f rows/sec)", len(chunk.Data), chunkDuration, rowsPerSec)
}
return nil return false
} }
func loadRowsMssql(ctx context.Context, job MigrationJob, columns []ColumnType, db *sql.DB, in <-chan []UnknownRowValues) error { func loadRowsMssql(ctx context.Context, job MigrationJob, columns []ColumnType, db *sql.DB, in <-chan []UnknownRowValues) error {

View File

@@ -50,7 +50,7 @@ func processMigrationJob(sourceDb *sql.DB, targetDb *pgxpool.Pool, job Migration
extractorErrorHandler(ctx, chExtractorErrors, chBatches, chJobErrors) extractorErrorHandler(ctx, chExtractorErrors, chBatches, chJobErrors)
}() }()
chChunks := make(chan Chunk, QueueSize) chChunksRaw := make(chan Chunk, QueueSize)
maxExtractors := min(NumExtractors, len(batches)) maxExtractors := min(NumExtractors, len(batches))
var wgMssqlExtractors sync.WaitGroup var wgMssqlExtractors sync.WaitGroup
@@ -58,7 +58,7 @@ func processMigrationJob(sourceDb *sql.DB, targetDb *pgxpool.Pool, job Migration
extractStartTime := time.Now() extractStartTime := time.Now()
for range maxExtractors { for range maxExtractors {
wgMssqlExtractors.Go(func() { wgMssqlExtractors.Go(func() {
extractFromMssql(ctx, sourceDb, job, sourceColTypes, ChunkSize, chBatches, chChunks, chExtractorErrors, chJobErrors) extractFromMssql(ctx, sourceDb, job, sourceColTypes, ChunkSize, chBatches, chChunksRaw, chExtractorErrors, chJobErrors)
}) })
} }
@@ -72,41 +72,45 @@ func processMigrationJob(sourceDb *sql.DB, targetDb *pgxpool.Pool, job Migration
go func() { go func() {
wgMssqlExtractors.Wait() wgMssqlExtractors.Wait()
close(chChunks) close(chChunksRaw)
log.Infof("Extraction completed in %v", time.Since(extractStartTime)) log.Infof("Extraction completed in %v", time.Since(extractStartTime))
}() }()
chChunksTransform := make(chan Chunk, QueueSize) chChunksTransformed := make(chan Chunk, QueueSize)
var wgMssqlTransformers sync.WaitGroup var wgMssqlTransformers sync.WaitGroup
log.Infof("Starting %d MSSQL transformers...", maxExtractors) log.Infof("Starting %d MSSQL transformers...", maxExtractors)
transformStartTime := time.Now() transformStartTime := time.Now()
for range maxExtractors { for range maxExtractors {
wgMssqlTransformers.Go(func() { wgMssqlTransformers.Go(func() {
transformRowsMssql(ctx, sourceColTypes, chChunks, chChunksTransform, chJobErrors) transformRowsMssql(ctx, sourceColTypes, chChunksRaw, chChunksTransformed, chJobErrors)
}) })
} }
go func() { go func() {
wgMssqlTransformers.Wait() wgMssqlTransformers.Wait()
close(chChunksTransform) close(chChunksTransformed)
log.Infof("Transformation completed in %v", time.Since(transformStartTime)) log.Infof("Transformation completed in %v", time.Since(transformStartTime))
}() }()
var wgPostgresLoaders sync.WaitGroup var wgPostgresLoaders sync.WaitGroup
chLoadersErrors := make(chan LoaderError)
go func() {
loaderErrorHandler(ctx, chLoadersErrors, chChunksTransformed, chJobErrors)
}()
log.Infof("Starting %d PostgreSQL loader(s)...", NumLoaders) log.Infof("Starting %d PostgreSQL loader(s)...", NumLoaders)
loaderStartTime := time.Now() loaderStartTime := time.Now()
for range NumLoaders { for range NumLoaders {
wgPostgresLoaders.Go(func() { wgPostgresLoaders.Go(func() {
if err := loadRowsPostgres(ctx, job, targetColTypes, targetDb, chChunksTransform); err != nil { loadRowsPostgres(ctx, targetDb, job, targetColTypes, chChunksTransformed, chLoadersErrors)
log.Error("Unexpected error loading data into postgres: ", err)
}
}) })
} }
wgPostgresLoaders.Wait() wgPostgresLoaders.Wait()
close(chLoadersErrors)
log.Infof("Loading completed in %v", time.Since(loaderStartTime)) log.Infof("Loading completed in %v", time.Since(loaderStartTime))
totalDuration := time.Since(jobStartTime) totalDuration := time.Since(jobStartTime)