feat: implement batch processing for MSSQL extraction and transformation with range handling
@@ -5,30 +5,36 @@ import (
 	"strings"
 )
 
-func buildExtractQueryMssql(job MigrationJob, columns []ColumnType) string {
-	var sbColumns strings.Builder
+func buildExtractQueryMssql(job MigrationJob, columns []ColumnType, includeRange bool) string {
+	var sbQuery strings.Builder
 
+	sbQuery.WriteString("SELECT ")
+
 	if len(columns) == 0 {
-		sbColumns.WriteString("*")
+		sbQuery.WriteString("*")
 	} else {
 		for i, col := range columns {
-			sbColumns.WriteString("[")
-			sbColumns.WriteString(col.name)
-			sbColumns.WriteString("]")
+			fmt.Fprintf(&sbQuery, "[%s]", col.name)
 
 			if col.unifiedType == "GEOMETRY" {
-				sbColumns.WriteString(".STAsBinary() AS [")
-				sbColumns.WriteString(col.name)
-				sbColumns.WriteString("]")
+				fmt.Fprintf(&sbQuery, ".STAsBinary() AS [%s]", col.name)
 			}
 
 			if i < len(columns)-1 {
-				sbColumns.WriteString(", ")
+				sbQuery.WriteString(", ")
 			}
 		}
 	}
 
-	return fmt.Sprintf(`SELECT %s FROM [%s].[%s] ORDER BY [%s] ASC`, sbColumns.String(), job.Schema, job.Table, job.PrimaryKey)
+	fmt.Fprintf(&sbQuery, " FROM [%s].[%s]", job.Schema, job.Table)
+
+	if includeRange {
+		fmt.Fprintf(&sbQuery, " WHERE [%s] BETWEEN @minRange AND @maxRange", job.PrimaryKey)
+	}
+
+	fmt.Fprintf(&sbQuery, " ORDER BY [%s] ASC", job.PrimaryKey)
+
+	return sbQuery.String()
 }
 
 func buildExtractQueryPostgres(job MigrationJob, columns []ColumnType) string {
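For reference, the two modes of the reworked builder produce queries of roughly the following shape; the schema, table, key, and column names below are hypothetical, not taken from the repo:

```go
// Illustrative only: a hypothetical job on [dbo].[orders] keyed by [id],
// with a single non-geometry column [id].
//
// buildExtractQueryMssql(job, cols, false)
//   -> SELECT [id] FROM [dbo].[orders] ORDER BY [id] ASC
//
// buildExtractQueryMssql(job, cols, true)
//   -> SELECT [id] FROM [dbo].[orders]
//      WHERE [id] BETWEEN @minRange AND @maxRange
//      ORDER BY [id] ASC
```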
91  cmd/go_migrate/chunk-planner.go  Normal file
@@ -0,0 +1,91 @@
+package main
+
+import (
+	"context"
+	"database/sql"
+	"fmt"
+)
+
+type BatchRange struct {
+	LowerLimit int
+	UpperLimit int
+	validRange bool
+}
+
+func estimateTotalRowsMssql(ctx context.Context, db *sql.DB, job MigrationJob) (int, error) {
+	query := `
+		SELECT
+			SUM(p.rows) AS count
+		FROM sys.tables t
+		JOIN sys.schemas s ON t.schema_id = s.schema_id
+		JOIN sys.partitions p ON t.object_id = p.object_id
+		WHERE s.name = @schema AND t.name = @table AND p.index_id IN (0, 1)
+		GROUP BY t.name`
+
+	var rowsCount int
+	err := db.QueryRowContext(ctx, query, sql.Named("schema", job.Schema), sql.Named("table", job.Table)).Scan(&rowsCount)
+	if err != nil {
+		return 0, err
+	}
+
+	return rowsCount, nil
+}
+
+func calculateChunkRangesMssql(ctx context.Context, db *sql.DB, job MigrationJob, batchCount int) ([]BatchRange, error) {
+	query := fmt.Sprintf(`
+		SELECT
+			MIN([%s]) AS lower_limit,
+			MAX([%s]) AS upper_limit
+		FROM
+			(SELECT [%s], NTILE(@batchCount) OVER (ORDER BY [%s]) AS chunk_id FROM [%s].[%s]) AS T
+		GROUP BY chunk_id
+		ORDER BY chunk_id`, job.PrimaryKey, job.PrimaryKey, job.PrimaryKey, job.PrimaryKey, job.Schema, job.Table)
+
+	rows, err := db.QueryContext(ctx, query, sql.Named("batchCount", batchCount))
+	if err != nil {
+		return nil, err
+	}
+	defer rows.Close()
+
+	batchRanges := make([]BatchRange, 0, batchCount)
+
+	for rows.Next() {
+		var br BatchRange
+		br.validRange = true
+
+		if err := rows.Scan(&br.LowerLimit, &br.UpperLimit); err != nil {
+			return nil, err
+		}
+
+		batchRanges = append(batchRanges, br)
+	}
+
+	if err := rows.Err(); err != nil {
+		return nil, err
+	}
+
+	return batchRanges, nil
+}
+
+const estimatedRowsPerBatch = 100_000
+
+func calculateBatchMetrics(ctx context.Context, db *sql.DB, job MigrationJob) ([]BatchRange, error) {
+	rowsCount, err := estimateTotalRowsMssql(ctx, db, job)
+	if err != nil {
+		return nil, err
+	}
+
+	batchCount := 1
+	if rowsCount > estimatedRowsPerBatch {
+		batchCount = rowsCount / estimatedRowsPerBatch
+	} else {
+		return []BatchRange{{validRange: false}}, nil
+	}
+
+	chunksRange, err := calculateChunkRangesMssql(ctx, db, job, batchCount)
+	if err != nil {
+		return nil, err
+	}
+
+	return chunksRange, nil
+}
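As a worked example of the sizing rule above (table sizes are hypothetical): 1,234,567 estimated rows with estimatedRowsPerBatch = 100_000 gives batchCount = 12 by integer division, and NTILE(12) then produces 12 contiguous primary-key ranges whose MIN/MAX values become the BETWEEN bounds; a table at or below 100,000 rows short-circuits to a single BatchRange with validRange = false, i.e. an unranged full-table extract. A standalone sketch of just the count rule:

```go
package main

import "fmt"

// Minimal sketch of the batch-count rule used by calculateBatchMetrics.
// The constant matches the diff; the table sizes are made up for illustration.
func main() {
    const estimatedRowsPerBatch = 100_000
    for _, rowsCount := range []int{80_000, 1_234_567} {
        batchCount := 1
        if rowsCount > estimatedRowsPerBatch {
            batchCount = rowsCount / estimatedRowsPerBatch // integer division
        }
        fmt.Printf("rows=%d -> batches=%d\n", rowsCount, batchCount)
    }
}
```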
@@ -12,12 +12,20 @@ import (
 
 type UnknownRowValues = []any
 
-func extractFromMssql(ctx context.Context, job MigrationJob, columns []ColumnType, chunkSize int, db *sql.DB, out chan<- []UnknownRowValues) error {
-	query := buildExtractQueryMssql(job, columns)
+func extractFromMssql(ctx context.Context, db *sql.DB, job MigrationJob, columns []ColumnType, chunkSize int, batchRange BatchRange, out chan<- []UnknownRowValues) error {
+	query := buildExtractQueryMssql(job, columns, batchRange.validRange)
 	log.Debug("Query used to extract data from mssql: ", query)
 
+	var queryArgs []any
+	if batchRange.validRange {
+		queryArgs = append(queryArgs,
+			sql.Named("minRange", batchRange.LowerLimit),
+			sql.Named("maxRange", batchRange.UpperLimit),
+		)
+	}
+
 	queryStartTime := time.Now()
-	rows, err := db.QueryContext(ctx, query)
+	rows, err := db.QueryContext(ctx, query, queryArgs...)
 	if err != nil {
 		return err
 	}
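The named arguments line up with the @minRange/@maxRange placeholders that the query builder emits when includeRange is set. A hypothetical call for one planned range, assuming the surrounding identifiers from processMigrationJob are in scope:

```go
// Sketch only: ctx, sourceDb, job, sourceColTypes and chRowsExtract are assumed
// to exist as in processMigrationJob; the range bounds are made up.
br := BatchRange{LowerLimit: 1, UpperLimit: 100_000, validRange: true}
if err := extractFromMssql(ctx, sourceDb, job, sourceColTypes, ChunkSize, br, chRowsExtract); err != nil {
    log.Error("Unexpected error extracting data from mssql: ", err)
}
// A BatchRange with validRange == false skips the WHERE clause and passes no
// named arguments, so the table is extracted in a single unranged query.
```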
@@ -21,7 +21,7 @@ var migrationJobs []MigrationJob = []MigrationJob{
 }
 
 const (
-	NumExtractors int = 1
+	NumExtractors int = 4
 	NumLoaders    int = 4
 	ChunkSize     int = 50000
 	QueueSize     int = 10
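Raising NumExtractors to 4 only sets an upper bound on the pool; the orchestration in the next hunk caps it at the number of planned ranges:

```go
// From the hunk below: the worker count is the smaller of the constant and the plan size.
maxExtractors := min(NumExtractors, len(batchRanges)) // e.g. min(4, 12) == 4, min(4, 1) == 1
```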
@@ -24,24 +24,56 @@ func processMigrationJob(sourceDb *sql.DB, targetDb *pgxpool.Pool, job Migration
 	logColumnTypes(sourceColTypes, "Source col types")
 	logColumnTypes(targetColTypes, "Target col types")
 
-	chRowsExtract := make(chan []UnknownRowValues, QueueSize)
-	chRowsTransform := make(chan []UnknownRowValues)
 	mssqlCtx := context.Background()
+	batchRanges, err := calculateBatchMetrics(mssqlCtx, sourceDb, job)
+	if err != nil {
+		log.Error("Unexpected error calculating batch ranges: ", err)
+	}
 
-	go func() {
-		log.Info("Starting extraction from MSSQL...")
+	chBatchRanges := make(chan BatchRange, len(batchRanges))
+
+	maxExtractors := min(NumExtractors, len(batchRanges))
+	chRowsExtract := make(chan []UnknownRowValues, QueueSize)
+	var wgMssqlExtractors sync.WaitGroup
+
+	log.Infof("Starting %d MSSQL extractors...", maxExtractors)
 	extractStartTime := time.Now()
-		if err := extractFromMssql(mssqlCtx, job, sourceColTypes, ChunkSize, sourceDb, chRowsExtract); err != nil {
-			log.Error("Unexpected error extracting data from mssql: ", err)
-		}
+	for range maxExtractors {
+		wgMssqlExtractors.Go(func() {
+			for br := range chBatchRanges {
+				if err := extractFromMssql(mssqlCtx, sourceDb, job, sourceColTypes, ChunkSize, br, chRowsExtract); err != nil {
+					log.Error("Unexpected error extracting data from mssql: ", err)
+				}
+			}
+		})
+	}
+
+	go func() {
+		for _, br := range batchRanges {
+			chBatchRanges <- br
+		}
+		close(chBatchRanges)
+	}()
+
+	go func() {
+		wgMssqlExtractors.Wait()
 		close(chRowsExtract)
 		log.Infof("Extraction completed in %v", time.Since(extractStartTime))
 	}()
 
-	go func() {
-		log.Info("Starting transformation of rows...")
+	chRowsTransform := make(chan []UnknownRowValues, QueueSize)
+	var wgMssqlTransformers sync.WaitGroup
+
+	log.Infof("Starting %d MSSQL transformers...", maxExtractors)
 	transformStartTime := time.Now()
-		transformRowsMssql(sourceColTypes, chRowsExtract, chRowsTransform)
+	for range maxExtractors {
+		wgMssqlTransformers.Go(func() {
+			transformRowsMssql(sourceColTypes, chRowsExtract, chRowsTransform)
+		})
+	}
+
+	go func() {
+		wgMssqlTransformers.Wait()
 		close(chRowsTransform)
 		log.Infof("Transformation completed in %v", time.Since(transformStartTime))
 	}()
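The rework above is a fan-out/fan-in: one goroutine feeds the planned ranges into a buffered channel, a fixed pool of extractors drains it into the rows channel, and a separate goroutine waits on the pool before closing that channel so the transformers downstream can finish. A minimal, self-contained sketch of the same shape (all names here are illustrative; the diff itself uses the newer sync.WaitGroup.Go helper, while this sketch sticks to Add/Done):

```go
package main

import (
    "fmt"
    "sync"
)

func main() {
    ranges := []int{1, 2, 3, 4, 5}
    work := make(chan int, len(ranges))      // like chBatchRanges
    results := make(chan string, len(ranges)) // like chRowsExtract

    var wg sync.WaitGroup
    for i := 0; i < 3; i++ { // worker pool, like the MSSQL extractors
        wg.Add(1)
        go func() {
            defer wg.Done()
            for r := range work {
                results <- fmt.Sprintf("processed range %d", r)
            }
        }()
    }

    go func() { // producer, like the goroutine feeding chBatchRanges
        for _, r := range ranges {
            work <- r
        }
        close(work)
    }()

    go func() { // closer, like the goroutine that waits and closes chRowsExtract
        wg.Wait()
        close(results)
    }()

    for s := range results { // downstream consumer, like the transformers
        fmt.Println(s)
    }
}
```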