feat: implement batch processing for MSSQL with improved structure and logging

This commit is contained in:
2026-04-08 17:35:12 -05:00
parent 51d83661a4
commit bc6f9a6a70
8 changed files with 343 additions and 168 deletions

View File

@@ -3,6 +3,8 @@ package main
import (
"context"
"database/sql"
"slices"
"strings"
"time"
"github.com/jackc/pgx/v5/pgxpool"
@@ -12,67 +14,126 @@ import (
type UnknownRowValues = []any
func extractFromMssql(ctx context.Context, db *sql.DB, job MigrationJob, columns []ColumnType, chunkSize int, batchRange BatchRange, out chan<- []UnknownRowValues) error {
query := buildExtractQueryMssql(job, columns, batchRange.validRange)
log.Debug("Query used to extract data from mssql: ", query)
func extractFromMssql(
ctx context.Context,
db *sql.DB,
job MigrationJob,
columns []ColumnType,
chunkSize int,
chBatchesIn <-chan Batch,
chChunksOut chan<- []UnknownRowValues,
chErrorsOut chan<- ExtractorError,
) {
indexPrimaryKey := slices.IndexFunc(columns, func(col ColumnType) bool {
return strings.EqualFold(col.name, job.PrimaryKey)
})
var queryArgs []any
if batchRange.validRange {
queryArgs = append(queryArgs,
sql.Named("minRange", batchRange.LowerLimit),
sql.Named("maxRange", batchRange.UpperLimit),
)
}
queryStartTime := time.Now()
rows, err := db.QueryContext(ctx, query, queryArgs...)
if err != nil {
return err
}
defer rows.Close()
log.Debugf("Query executed in %v", time.Since(queryStartTime))
rowsChunk := make([]UnknownRowValues, 0, chunkSize)
totalRowsExtracted := 0
chunkCount := 0
chunkStartTime := time.Now()
for rows.Next() {
values := make([]any, len(columns))
scanArgs := make([]any, len(columns))
for i := range values {
scanArgs[i] = &values[i]
}
if err := rows.Scan(scanArgs...); err != nil {
return err
}
rowsChunk = append(rowsChunk, values)
totalRowsExtracted++
if len(rowsChunk) >= chunkSize {
chunkCount++
chunkDuration := time.Since(chunkStartTime)
rowsPerSec := float64(chunkSize) / chunkDuration.Seconds()
log.Infof("Extracted chunk #%d: %d rows in %v (%.0f rows/sec) - Total: %d rows", chunkCount, len(rowsChunk), chunkDuration, rowsPerSec, totalRowsExtracted)
out <- rowsChunk
rowsChunk = make([]UnknownRowValues, 0, chunkSize)
chunkStartTime = time.Now()
if indexPrimaryKey == -1 {
exError := ExtractorError{
Batch: Batch{
RetryCounter: maxRetryAttempts,
},
HasLastId: false,
Msg: "Primary key not found in columns provided",
}
chErrorsOut <- exError
return
}
if len(rowsChunk) > 0 {
chunkCount++
chunkDuration := time.Since(chunkStartTime)
rowsPerSec := float64(len(rowsChunk)) / chunkDuration.Seconds()
log.Infof("Extracted final chunk #%d: %d rows in %v (%.0f rows/sec) - Total: %d rows",
chunkCount, len(rowsChunk), chunkDuration, rowsPerSec, totalRowsExtracted)
out <- rowsChunk
}
for batch := range chBatchesIn {
func() {
query := buildExtractQueryMssql(job, columns, batch.ShouldUseRange, batch.IsLowerLimitInclusive)
log.Debug("Query used to extract data from mssql: ", query)
return rows.Err()
var queryArgs []any
if batch.ShouldUseRange {
queryArgs = append(queryArgs,
sql.Named("min", batch.LowerLimit),
sql.Named("max", batch.UpperLimit),
)
}
queryStartTime := time.Now()
rows, err := db.QueryContext(ctx, query, queryArgs...)
if err != nil {
exError := ExtractorError{
Batch: batch,
HasLastId: false,
Msg: err.Error(),
}
chErrorsOut <- exError
return
}
defer rows.Close()
log.Debugf("Query executed in %v", time.Since(queryStartTime))
rowsChunk := make([]UnknownRowValues, 0, chunkSize)
totalRowsExtracted := 0
chunkStartTime := time.Now()
for rows.Next() {
values := make([]any, len(columns))
scanArgs := make([]any, len(columns))
for i := range values {
scanArgs[i] = &values[i]
}
if err := rows.Scan(scanArgs...); err != nil {
if len(rowsChunk) == 0 {
exError := ExtractorError{
Batch: batch,
HasLastId: false,
Msg: err.Error(),
}
chErrorsOut <- exError
return
}
lastRow := rowsChunk[len(rowsChunk)-1]
chErrorsOut <- ExtractorErrorFromLastRowMssql(lastRow, indexPrimaryKey, &batch, err)
return
}
rowsChunk = append(rowsChunk, values)
totalRowsExtracted++
if len(rowsChunk) >= chunkSize {
chunkDuration := time.Since(chunkStartTime)
rowsPerSec := float64(chunkSize) / chunkDuration.Seconds()
log.Infof("Extracted chunk: %d rows in %v (%.0f rows/sec) - Total: %d rows",
len(rowsChunk), chunkDuration, rowsPerSec, totalRowsExtracted)
chChunksOut <- rowsChunk
rowsChunk = make([]UnknownRowValues, 0, chunkSize)
chunkStartTime = time.Now()
}
}
if len(rowsChunk) > 0 {
chunkDuration := time.Since(chunkStartTime)
rowsPerSec := float64(len(rowsChunk)) / chunkDuration.Seconds()
log.Infof("Extracted final chunk: %d rows in %v (%.0f rows/sec) - Total: %d rows",
len(rowsChunk), chunkDuration, rowsPerSec, totalRowsExtracted)
chChunksOut <- rowsChunk
}
if err := rows.Err(); err != nil {
if len(rowsChunk) == 0 {
exError := ExtractorError{
Batch: batch,
HasLastId: false,
Msg: err.Error(),
}
chErrorsOut <- exError
return
}
lastRow := rowsChunk[len(rowsChunk)-1]
chErrorsOut <- ExtractorErrorFromLastRowMssql(lastRow, indexPrimaryKey, &batch, err)
return
}
}()
}
}
func extractFromPostgres(ctx context.Context, job MigrationJob, columns []ColumnType, chunkSize int, db *pgxpool.Pool, out chan<- []UnknownRowValues) error {