feat: implement Postgres loader and refactor migration job processing

This commit is contained in:
2026-04-10 20:40:01 -05:00
parent 053e6bd673
commit c1bae79f98
5 changed files with 172 additions and 202 deletions

View File

@@ -1,197 +0,0 @@
package main
import (
"context"
"database/sql"
"errors"
"fmt"
"sync"
"sync/atomic"
"time"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/custom_errors"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
"github.com/jackc/pgx/v5"
"github.com/jackc/pgx/v5/pgconn"
"github.com/jackc/pgx/v5/pgxpool"
mssql "github.com/microsoft/go-mssqldb"
log "github.com/sirupsen/logrus"
)
// loadRowsPostgres drains chChunksIn and bulk-loads every chunk into the
// target Postgres table. It returns when the channel closes, the context
// is cancelled, or loading a chunk signals abort (fatal job error).
func loadRowsPostgres(
	ctx context.Context,
	db *pgxpool.Pool,
	tableInfo config.TargetTableInfo,
	columns []models.ColumnType,
	chChunksIn <-chan models.Chunk,
	chErrorsOut chan<- custom_errors.LoaderError,
	chJobErrorsOut chan<- custom_errors.JobError,
	wgActiveChunks *sync.WaitGroup,
	rowsLoaded *int64,
) {
	// Loop-invariant pieces: the quoted table identifier and the column list.
	ident := pgx.Identifier{tableInfo.Schema, tableInfo.Table}
	names := make([]string, 0, len(columns))
	for _, col := range columns {
		names = append(names, col.Name())
	}
	for ctx.Err() == nil {
		select {
		case <-ctx.Done():
			return
		case chunk, more := <-chChunksIn:
			if !more {
				// Upstream finished producing chunks.
				return
			}
			if loadChunkPostgres(ctx, db, ident, names, chunk, chErrorsOut, chJobErrorsOut, wgActiveChunks, rowsLoaded) {
				// Chunk loader requested abort.
				return
			}
		}
	}
}
// loadChunkPostgres bulk-copies one chunk into the target table via pgx's
// COPY protocol. The returned abort flag is true when the caller should
// stop consuming chunks (fatal duplicate-key error or context cancelled
// mid-send).
//
// WaitGroup accounting: wgActiveChunks.Done() is called when the chunk is
// fully resolved here — loaded successfully, or failed fatally. On a
// retryable error the chunk is sent on chErrorsOut WITHOUT Done —
// NOTE(review): presumably the error consumer re-queues the chunk and
// settles the WaitGroup slot; confirm against the caller.
func loadChunkPostgres(
	ctx context.Context,
	db *pgxpool.Pool,
	identifier pgx.Identifier,
	colNames []string,
	chunk models.Chunk,
	chErrorsOut chan<- custom_errors.LoaderError,
	chJobErrorsOut chan<- custom_errors.JobError,
	wgActiveChunks *sync.WaitGroup,
	rowsLoaded *int64,
) (abort bool) {
	chunkStartTime := time.Now()
	_, err := db.CopyFrom(
		ctx,
		identifier,
		colNames,
		pgx.CopyFromRows(chunk.Data),
	)
	if err != nil {
		var pgErr *pgconn.PgError
		if errors.As(err, &pgErr) {
			// 23505 = unique_violation: a retry cannot succeed, so escalate
			// to a job-cancelling error and abort this loader.
			if pgErr.Code == "23505" {
				select {
				case chJobErrorsOut <- custom_errors.JobError{
					ShouldCancelJob: true,
					Msg:             fmt.Sprintf("Fatal error in table %s", identifier.Sanitize()),
					Prev:            err,
				}:
				case <-ctx.Done():
				}
				// Chunk is terminally resolved; release its WaitGroup slot.
				wgActiveChunks.Done()
				return true
			}
		}
		// Any other failure is reported as retryable; keep consuming chunks.
		select {
		case chErrorsOut <- custom_errors.LoaderError{Chunk: chunk, Msg: err.Error()}:
		case <-ctx.Done():
			return true
		}
		return false
	}
	chunkDuration := time.Since(chunkStartTime)
	rowsPerSec := float64(len(chunk.Data)) / chunkDuration.Seconds()
	log.Infof("Loaded chunk: %d rows in %v (%.0f rows/sec)", len(chunk.Data), chunkDuration, rowsPerSec)
	// rowsLoaded is shared across loader goroutines; update atomically.
	atomic.AddInt64(rowsLoaded, int64(len(chunk.Data)))
	wgActiveChunks.Done()
	return false
}
// loadRowsMssql bulk-loads row chunks from in into the target MSSQL table,
// one transaction per chunk, using the driver's bulk-copy (CopyIn) path.
// It returns the first error encountered, or nil once in is closed and
// drained.
func loadRowsMssql(ctx context.Context, tableInfo config.TargetTableInfo, columns []ColumnType, db *sql.DB, in <-chan []models.UnknownRowValues) error {
	// Loop-invariant values hoisted out of the per-chunk loop (previously
	// recomputed for every chunk).
	fullTableName := fmt.Sprintf("[%s].[%s]", tableInfo.Schema, tableInfo.Table)
	colNames := Map(columns, func(col ColumnType) string {
		return col.name
	})
	chunkCount := 0
	totalRowsLoaded := 0
	for rows := range in {
		chunkStartTime := time.Now()
		rowsAffected, copyDuration, err := loadChunkMssql(ctx, db, fullTableName, colNames, rows)
		if err != nil {
			return err
		}
		chunkCount++
		totalRowsLoaded += int(rowsAffected)
		chunkDuration := time.Since(chunkStartTime)
		rowsPerSec := float64(len(rows)) / chunkDuration.Seconds()
		log.Infof("Loaded chunk #%d (MSSQL): %d rows in %v (copy: %v, %.0f rows/sec) - Total: %d rows", chunkCount, len(rows), chunkDuration, copyDuration, rowsPerSec, totalRowsLoaded)
	}
	return nil
}

// loadChunkMssql writes one chunk of rows inside a single transaction using
// the mssql bulk-copy statement. It returns the driver-reported affected
// row count and the time spent on the copy itself.
func loadChunkMssql(ctx context.Context, db *sql.DB, fullTableName string, colNames []string, rows []models.UnknownRowValues) (int64, time.Duration, error) {
	tx, err := db.BeginTx(ctx, nil)
	if err != nil {
		return 0, 0, fmt.Errorf("error starting transaction: %w", err)
	}
	stmt, err := tx.PrepareContext(ctx, mssql.CopyIn(fullTableName, mssql.BulkOptions{}, colNames...))
	if err != nil {
		tx.Rollback() // best-effort; the prepare error is the one reported
		return 0, 0, fmt.Errorf("error preparing bulk copy statement: %w", err)
	}
	copyStartTime := time.Now()
	for _, row := range rows {
		if _, err = stmt.ExecContext(ctx, row...); err != nil {
			stmt.Close()
			tx.Rollback()
			return 0, 0, fmt.Errorf("error executing row insert: %w", err)
		}
	}
	// A final Exec with no arguments flushes the buffered bulk data.
	result, err := stmt.ExecContext(ctx)
	if err != nil {
		stmt.Close()
		tx.Rollback()
		return 0, 0, fmt.Errorf("error flushing bulk data: %w", err)
	}
	if err = stmt.Close(); err != nil {
		tx.Rollback()
		return 0, 0, fmt.Errorf("error closing statement: %w", err)
	}
	if err := tx.Commit(); err != nil {
		return 0, 0, fmt.Errorf("error committing transaction: %w", err)
	}
	copyDuration := time.Since(copyStartTime)
	// RowsAffected error deliberately ignored: the count is informational.
	rowsAffected, _ := result.RowsAffected()
	return rowsAffected, copyDuration, nil
}
// Map applies mapper to every element of input and returns the results
// in the same order.
func Map[T any, V any](input []T, mapper func(T) V) []V {
	out := make([]V, 0, len(input))
	for _, item := range input {
		out = append(out, mapper(item))
	}
	return out
}
// fakeLoader consumes chunks from in without writing them to any target,
// logging a sample row for every 100th row in each chunk.
func fakeLoader(tableInfo config.TargetTableInfo, columns []ColumnType, in <-chan [][]any) {
	for rows := range in {
		log.Debugf("Chunk received, loading data into...")
		for idx, values := range rows {
			if idx%100 != 0 {
				continue
			}
			logSampleRow(tableInfo.Schema, tableInfo.Table, columns, values, fmt.Sprintf("row %d", idx))
		}
	}
}

View File

@@ -10,6 +10,7 @@ import (
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config" "git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/custom_errors" "git.ksdemosapps.com/kylesoda/go-migrate/internal/app/custom_errors"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/etl/extractor" "git.ksdemosapps.com/kylesoda/go-migrate/internal/app/etl/extractor"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/etl/loader"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/etl/transformer" "git.ksdemosapps.com/kylesoda/go-migrate/internal/app/etl/transformer"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models" "git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
"github.com/jackc/pgx/v5/pgxpool" "github.com/jackc/pgx/v5/pgxpool"
@@ -63,6 +64,7 @@ func processMigrationJob(
mssqlExtractor := extractor.NewMssqlExtractor(sourceDb) mssqlExtractor := extractor.NewMssqlExtractor(sourceDb)
mssqlToPostgresTransformer := transformer.NewMssqlTransformer() mssqlToPostgresTransformer := transformer.NewMssqlTransformer()
postgresLoader := loader.NewPostgresLoader(targetDb)
go func() { go func() {
if err := custom_errors.JobErrorHandler(jobCtx, chJobErrors); err != nil { if err := custom_errors.JobErrorHandler(jobCtx, chJobErrors); err != nil {
@@ -120,7 +122,16 @@ func processMigrationJob(
for range job.MaxLoaders { for range job.MaxLoaders {
wgLoaders.Go(func() { wgLoaders.Go(func() {
loadRowsPostgres(jobCtx, targetDb, job.TargetTable, targetColTypes, chChunksTransformed, chLoadersErrors, chJobErrors, &wgActiveChunks, &rowsLoaded) postgresLoader.Exec(
jobCtx,
job.TargetTable,
targetColTypes,
chChunksTransformed,
chLoadersErrors,
chJobErrors,
&wgActiveChunks,
&rowsLoaded,
)
}) })
} }

View File

@@ -254,10 +254,13 @@ func (mssqlEx *MssqlExtractor) Exec(
} }
} }
var jobError *custom_errors.JobError
if errors.As(err, &jobError) {
select { select {
case <-ctx.Done(): case <-ctx.Done():
return return
case chJobErrorsOut <- custom_errors.JobError{ShouldCancelJob: false, Prev: err}: case chJobErrorsOut <- *jobError:
}
} }
return return

View File

@@ -0,0 +1,123 @@
package loader
import (
"context"
"errors"
"fmt"
"sync"
"sync/atomic"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/custom_errors"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
"github.com/jackc/pgx/v5"
"github.com/jackc/pgx/v5/pgconn"
"github.com/jackc/pgx/v5/pgxpool"
)
// PostgresLoader loads transformed row chunks into a Postgres target
// through a pgx connection pool.
type PostgresLoader struct {
	db *pgxpool.Pool
}

// NewPostgresLoader returns a Loader backed by the given pool.
func NewPostgresLoader(pool *pgxpool.Pool) Loader {
	ld := &PostgresLoader{db: pool}
	return ld
}
// mapSlice returns a new slice holding mapper(v) for every v in input,
// preserving order.
func mapSlice[T any, V any](input []T, mapper func(T) V) []V {
	out := make([]V, 0, len(input))
	for _, v := range input {
		out = append(out, mapper(v))
	}
	return out
}
// ProcessChunk bulk-copies one chunk into the target table and returns
// the number of rows written. A unique-constraint violation (Postgres
// code 23505) is returned as a job-fatal *custom_errors.JobError; any
// other failure becomes a retryable *custom_errors.LoaderError.
func (postgresLd *PostgresLoader) ProcessChunk(
	ctx context.Context,
	tableInfo config.TargetTableInfo,
	colNames []string,
	chunk models.Chunk,
) (int, error) {
	ident := pgx.Identifier{tableInfo.Schema, tableInfo.Table}
	if _, err := postgresLd.db.CopyFrom(ctx, ident, colNames, pgx.CopyFromRows(chunk.Data)); err != nil {
		var pgErr *pgconn.PgError
		if errors.As(err, &pgErr) && pgErr.Code == "23505" {
			// Duplicate key: a retry cannot succeed, so cancel the job.
			return 0, &custom_errors.JobError{
				ShouldCancelJob: true,
				Msg:             fmt.Sprintf("Fatal error in table %s", ident.Sanitize()),
				Prev:            err,
			}
		}
		return 0, &custom_errors.LoaderError{Chunk: chunk, Msg: err.Error()}
	}
	return len(chunk.Data), nil
}
// Exec consumes chunks from chChunksIn and loads each into the target
// table until the channel closes, the context is cancelled, or a fatal
// job error occurs.
//
// WaitGroup accounting: wgActiveChunks.Done() is called once per chunk
// that is fully resolved here — loaded, or failed fatally. A retryable
// LoaderError is sent on chErrorsOut WITHOUT Done — NOTE(review): assumes
// the error consumer re-queues the chunk and settles the WaitGroup slot,
// matching the pre-refactor loader; confirm against the caller.
func (postgresLd *PostgresLoader) Exec(
	ctx context.Context,
	tableInfo config.TargetTableInfo,
	columns []models.ColumnType,
	chChunksIn <-chan models.Chunk,
	chErrorsOut chan<- custom_errors.LoaderError,
	chJobErrorsOut chan<- custom_errors.JobError,
	wgActiveChunks *sync.WaitGroup,
	rowsLoaded *int64,
) {
	colNames := mapSlice(columns, func(col models.ColumnType) string {
		return col.Name()
	})
	for {
		if ctx.Err() != nil {
			return
		}
		select {
		case <-ctx.Done():
			return
		case chunk, ok := <-chChunksIn:
			if !ok {
				return
			}
			processedRows, err := postgresLd.ProcessChunk(ctx, tableInfo, colNames, chunk)
			if err == nil {
				wgActiveChunks.Done()
				// rowsLoaded is shared across loaders; update atomically.
				atomic.AddInt64(rowsLoaded, int64(processedRows))
				continue
			}
			var ldError *custom_errors.LoaderError
			if errors.As(err, &ldError) {
				// Retryable failure: report it and KEEP consuming chunks.
				// (Bug fix: previously this returned, killing the loader
				// goroutine on the first non-fatal error, unlike the
				// pre-refactor loadRowsPostgres which continued the loop.)
				select {
				case <-ctx.Done():
					return
				case chErrorsOut <- *ldError:
				}
				continue
			}
			var jobError *custom_errors.JobError
			if errors.As(err, &jobError) {
				// Fatal: this chunk is terminally resolved, so release its
				// WaitGroup slot before signalling the job handler.
				// (Bug fix: Done() was missing here, risking a stuck
				// WaitGroup wait on fatal errors.)
				wgActiveChunks.Done()
				select {
				case <-ctx.Done():
					return
				case chJobErrorsOut <- *jobError:
				}
			}
			return
		}
	}
}

View File

@@ -0,0 +1,30 @@
package loader
import (
"context"
"sync"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/custom_errors"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
)
// Loader moves transformed row chunks into a migration target.
type Loader interface {
	// ProcessChunk writes a single chunk into the target table and returns
	// the number of rows written, or an error describing the failure
	// (whether it is retryable or job-fatal is up to the implementation).
	ProcessChunk(
		ctx context.Context,
		tableInfo config.TargetTableInfo,
		colNames []string,
		chunk models.Chunk,
	) (int, error)
	// Exec consumes chunks from chChunksIn until the channel closes or ctx
	// is cancelled, reporting per-chunk failures on chErrorsOut, fatal
	// conditions on chJobErrorsOut, and tracking progress through
	// wgActiveChunks and the shared rowsLoaded counter.
	Exec(
		ctx context.Context,
		tableInfo config.TargetTableInfo,
		columns []models.ColumnType,
		chChunksIn <-chan models.Chunk,
		chErrorsOut chan<- custom_errors.LoaderError,
		chJobErrorsOut chan<- custom_errors.JobError,
		wgActiveChunks *sync.WaitGroup,
		rowsLoaded *int64,
	)
}