feat: track in-flight batches and chunks with sync.WaitGroup across extractors, transformers, loaders, and their error handlers

This commit is contained in:
2026-04-09 00:22:30 -05:00
parent dc632361e5
commit 51480015ba
8 changed files with 82 additions and 68 deletions

View File

@@ -3,6 +3,7 @@ package main
import ( import (
"context" "context"
"fmt" "fmt"
"sync"
"github.com/google/uuid" "github.com/google/uuid"
) )
@@ -25,6 +26,7 @@ func extractorErrorHandler(
chErrorsIn <-chan ExtractorError, chErrorsIn <-chan ExtractorError,
chBatchesOut chan<- Batch, chBatchesOut chan<- Batch,
chJobErrorsOut chan<- JobError, chJobErrorsOut chan<- JobError,
wgActiveBatches *sync.WaitGroup,
) { ) {
for { for {
if ctx.Err() != nil { if ctx.Err() != nil {
@@ -52,6 +54,8 @@ func extractorErrorHandler(
case <-ctx.Done(): case <-ctx.Done():
return return
} }
wgActiveBatches.Done()
continue continue
} }

View File

@@ -6,6 +6,7 @@ import (
"errors" "errors"
"slices" "slices"
"strings" "strings"
"sync"
"time" "time"
"github.com/google/uuid" "github.com/google/uuid"
@@ -33,6 +34,7 @@ func extractFromMssql(
chChunksOut chan<- Chunk, chChunksOut chan<- Chunk,
chErrorsOut chan<- ExtractorError, chErrorsOut chan<- ExtractorError,
chJobErrorsOut chan<- JobError, chJobErrorsOut chan<- JobError,
wgActiveBatches *sync.WaitGroup,
) { ) {
indexPrimaryKey := slices.IndexFunc(columns, func(col ColumnType) bool { indexPrimaryKey := slices.IndexFunc(columns, func(col ColumnType) bool {
return strings.EqualFold(col.name, job.PrimaryKey) return strings.EqualFold(col.name, job.PrimaryKey)
@@ -66,7 +68,7 @@ func extractFromMssql(
return return
} }
if abort := processBatch(ctx, db, job, columns, chunkSize, batch, indexPrimaryKey, chChunksOut, chErrorsOut); abort { if abort := processBatch(ctx, db, job, columns, chunkSize, batch, indexPrimaryKey, chChunksOut, chErrorsOut, wgActiveBatches); abort {
return return
} }
} }
@@ -83,6 +85,7 @@ func processBatch(
indexPrimaryKey int, indexPrimaryKey int,
chChunksOut chan<- Chunk, chChunksOut chan<- Chunk,
chErrorsOut chan<- ExtractorError, chErrorsOut chan<- ExtractorError,
wgActiveBatches *sync.WaitGroup,
) (abort bool) { ) (abort bool) {
query := buildExtractQueryMssql(job, columns, batch.ShouldUseRange, batch.IsLowerLimitInclusive) query := buildExtractQueryMssql(job, columns, batch.ShouldUseRange, batch.IsLowerLimitInclusive)
log.Debug("Query used to extract data from mssql: ", query) log.Debug("Query used to extract data from mssql: ", query)
@@ -199,6 +202,7 @@ func processBatch(
} }
} }
wgActiveBatches.Done()
return false return false
} }

View File

@@ -40,7 +40,7 @@ func jobErrorHandler(ctx context.Context, chErrorsIn <-chan JobError) error {
return &err return &err
} }
log.Error(err) log.Error(err.Msg, " - ", err.Prev)
} }
} }
} }

View File

@@ -3,6 +3,7 @@ package main
import ( import (
"context" "context"
"fmt" "fmt"
"sync"
) )
type LoaderError struct { type LoaderError struct {
@@ -19,6 +20,7 @@ func loaderErrorHandler(
chErrorsIn <-chan LoaderError, chErrorsIn <-chan LoaderError,
chChunksOut chan<- Chunk, chChunksOut chan<- Chunk,
chJobErrorsOut chan<- JobError, chJobErrorsOut chan<- JobError,
wgActiveChunks *sync.WaitGroup,
) { ) {
for { for {
if ctx.Err() != nil { if ctx.Err() != nil {
@@ -46,6 +48,8 @@ func loaderErrorHandler(
case <-ctx.Done(): case <-ctx.Done():
return return
} }
wgActiveChunks.Done()
continue continue
} }

View File

@@ -4,6 +4,7 @@ import (
"context" "context"
"database/sql" "database/sql"
"fmt" "fmt"
"sync"
"time" "time"
"github.com/jackc/pgx/v5" "github.com/jackc/pgx/v5"
@@ -19,6 +20,7 @@ func loadRowsPostgres(
columns []ColumnType, columns []ColumnType,
chChunksIn <-chan Chunk, chChunksIn <-chan Chunk,
chErrorsOut chan<- LoaderError, chErrorsOut chan<- LoaderError,
wgActiveChunks *sync.WaitGroup,
) { ) {
tableId := pgx.Identifier{job.Schema, job.Table} tableId := pgx.Identifier{job.Schema, job.Table}
colNames := Map(columns, func(col ColumnType) string { colNames := Map(columns, func(col ColumnType) string {
@@ -38,7 +40,7 @@ func loadRowsPostgres(
return return
} }
if abort := loadChunkPostgres(ctx, db, tableId, colNames, chunk, chErrorsOut); abort { if abort := loadChunkPostgres(ctx, db, tableId, colNames, chunk, chErrorsOut, wgActiveChunks); abort {
return return
} }
} }
@@ -52,6 +54,7 @@ func loadChunkPostgres(
colNames []string, colNames []string,
chunk Chunk, chunk Chunk,
chErrorsOut chan<- LoaderError, chErrorsOut chan<- LoaderError,
wgActiveChunks *sync.WaitGroup,
) (abort bool) { ) (abort bool) {
chunkStartTime := time.Now() chunkStartTime := time.Now()
_, err := db.CopyFrom( _, err := db.CopyFrom(
@@ -75,6 +78,7 @@ func loadChunkPostgres(
log.Infof("Loaded chunk: %d rows in %v (%.0f rows/sec)", len(chunk.Data), chunkDuration, rowsPerSec) log.Infof("Loaded chunk: %d rows in %v (%.0f rows/sec)", len(chunk.Data), chunkDuration, rowsPerSec)
wgActiveChunks.Done()
return false return false
} }

View File

@@ -13,5 +13,5 @@ func configureLog() {
DisableSorting: false, DisableSorting: false,
PadLevelText: true, PadLevelText: true,
}) })
log.SetLevel(log.DebugLevel) log.SetLevel(log.InfoLevel)
} }

View File

@@ -32,89 +32,82 @@ func processMigrationJob(sourceDb *sql.DB, targetDb *pgxpool.Pool, job Migration
log.Error("Unexpected error calculating batch ranges: ", err) log.Error("Unexpected error calculating batch ranges: ", err)
} }
chJobErrors := make(chan JobError) chJobErrors := make(chan JobError, 100)
defer close(chJobErrors) chBatches := make(chan Batch, QueueSize)
chExtractorErrors := make(chan ExtractorError, QueueSize)
go func() {
if err := jobErrorHandler(ctx, chJobErrors); err != nil {
if ctx.Err() == nil {
cancel()
}
}
}()
chBatches := make(chan Batch, len(batches))
chExtractorErrors := make(chan ExtractorError, len(batches))
go func() {
extractorErrorHandler(ctx, chExtractorErrors, chBatches, chJobErrors)
}()
chChunksRaw := make(chan Chunk, QueueSize) chChunksRaw := make(chan Chunk, QueueSize)
maxExtractors := min(NumExtractors, len(batches))
var wgMssqlExtractors sync.WaitGroup
log.Infof("Starting %d MSSQL extractors...", maxExtractors)
extractStartTime := time.Now()
for range maxExtractors {
wgMssqlExtractors.Go(func() {
extractFromMssql(ctx, sourceDb, job, sourceColTypes, ChunkSize, chBatches, chChunksRaw, chExtractorErrors, chJobErrors)
})
}
go func() {
for _, br := range batches {
chBatches <- br
}
close(chBatches)
close(chExtractorErrors)
}()
go func() {
wgMssqlExtractors.Wait()
close(chChunksRaw)
log.Infof("Extraction completed in %v", time.Since(extractStartTime))
}()
chChunksTransformed := make(chan Chunk, QueueSize) chChunksTransformed := make(chan Chunk, QueueSize)
var wgMssqlTransformers sync.WaitGroup chLoadersErrors := make(chan LoaderError, QueueSize)
var wgActiveBatches sync.WaitGroup
var wgActiveChunks sync.WaitGroup
var wgExtractors sync.WaitGroup
var wgTransformers sync.WaitGroup
var wgLoaders sync.WaitGroup
go jobErrorHandler(ctx, chJobErrors)
go extractorErrorHandler(ctx, chExtractorErrors, chBatches, chJobErrors, &wgActiveBatches)
go loaderErrorHandler(ctx, chLoadersErrors, chChunksTransformed, chJobErrors, &wgActiveChunks)
maxExtractors := min(NumExtractors, len(batches))
log.Infof("Starting %d extractors...", maxExtractors)
extractStartTime := time.Now()
log.Infof("Starting %d MSSQL transformers...", maxExtractors)
transformStartTime := time.Now()
for range maxExtractors { for range maxExtractors {
wgMssqlTransformers.Go(func() { wgExtractors.Go(func() {
transformRowsMssql(ctx, sourceColTypes, chChunksRaw, chChunksTransformed, chJobErrors) extractFromMssql(ctx, sourceDb, job, sourceColTypes, ChunkSize, chBatches, chChunksRaw, chExtractorErrors, chJobErrors, &wgActiveBatches)
}) })
} }
wgActiveBatches.Add(len(batches))
go func() { go func() {
wgMssqlTransformers.Wait() for _, batch := range batches {
close(chChunksTransformed) chBatches <- batch
log.Infof("Transformation completed in %v", time.Since(transformStartTime)) }
}() }()
var wgPostgresLoaders sync.WaitGroup log.Infof("Starting %d transformers...", maxExtractors)
chLoadersErrors := make(chan LoaderError) transformStartTime := time.Now()
go func() { for range maxExtractors {
loaderErrorHandler(ctx, chLoadersErrors, chChunksTransformed, chJobErrors) wgTransformers.Go(func() {
}() transformRowsMssql(ctx, sourceColTypes, chChunksRaw, chChunksTransformed, chJobErrors, &wgActiveChunks)
})
}
log.Infof("Starting %d PostgreSQL loader(s)...", NumLoaders) log.Infof("Starting %d PostgreSQL loader(s)...", NumLoaders)
loaderStartTime := time.Now() loadStartTime := time.Now()
for range NumLoaders { for range NumLoaders {
wgPostgresLoaders.Go(func() { wgLoaders.Go(func() {
loadRowsPostgres(ctx, targetDb, job, targetColTypes, chChunksTransformed, chLoadersErrors) loadRowsPostgres(ctx, targetDb, job, targetColTypes, chChunksTransformed, chLoadersErrors, &wgActiveChunks)
}) })
} }
wgPostgresLoaders.Wait() go func() {
close(chLoadersErrors) wgActiveBatches.Wait()
log.Infof("Loading completed in %v", time.Since(loaderStartTime)) close(chBatches)
close(chExtractorErrors)
totalDuration := time.Since(jobStartTime) wgExtractors.Wait()
log.Infof("Migration job completed successfully! Total time: %v", totalDuration) log.Infof("Extraction completed in %v", time.Since(extractStartTime))
close(chChunksRaw)
wgTransformers.Wait()
log.Infof("Transformation completed in %v", time.Since(transformStartTime))
wgActiveChunks.Wait()
close(chChunksTransformed)
close(chLoadersErrors)
wgLoaders.Wait()
log.Infof("Loading completed in %v", time.Since(loadStartTime))
cancel()
}()
<-ctx.Done()
log.Infof("Migration job completed. Total time: %v", time.Since(jobStartTime))
} }
func logColumnTypes(columnTypes []ColumnType, label string) { func logColumnTypes(columnTypes []ColumnType, label string) {

View File

@@ -3,6 +3,7 @@ package main
import ( import (
"context" "context"
"errors" "errors"
"sync"
"time" "time"
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
@@ -21,6 +22,7 @@ func transformRowsMssql(
chChunksIn <-chan Chunk, chChunksIn <-chan Chunk,
chChunksOut chan<- Chunk, chChunksOut chan<- Chunk,
chJobErrorsOut chan<- JobError, chJobErrorsOut chan<- JobError,
wgActiveChunks *sync.WaitGroup,
) { ) {
transformationPlan := computeTransformationPlan(columns) transformationPlan := computeTransformationPlan(columns)
@@ -41,6 +43,7 @@ func transformRowsMssql(
if len(transformationPlan) == 0 { if len(transformationPlan) == 0 {
select { select {
case chChunksOut <- chunk: case chChunksOut <- chunk:
wgActiveChunks.Add(1)
continue continue
case <-ctx.Done(): case <-ctx.Done():
return return
@@ -69,6 +72,8 @@ func transformRowsMssql(
case <-ctx.Done(): case <-ctx.Done():
return return
} }
wgActiveChunks.Add(1)
} }
} }
} }