feat: update chunk size for MSSQL processing and enhance error handling in transformation functions
This commit is contained in:
@@ -28,7 +28,7 @@ var migrationJobs []MigrationJob = []MigrationJob{
|
||||
const (
|
||||
NumExtractors int = 4
|
||||
NumLoaders int = 8
|
||||
ChunkSize int = 25000
|
||||
ChunkSize int = 50000
|
||||
QueueSize int = 8
|
||||
ChunksPerBatch int = 16
|
||||
RowsPerBatch int64 = int64(ChunkSize * ChunksPerBatch)
|
||||
|
||||
@@ -2,27 +2,29 @@ package main
|
||||
|
||||
import (
|
||||
"encoding/binary"
|
||||
"errors"
|
||||
"time"
|
||||
)
|
||||
|
||||
func mssqlUuidToBigEndian(mssqlUuid []byte) []byte {
|
||||
func mssqlUuidToBigEndian(mssqlUuid []byte) ([]byte, error) {
|
||||
if len(mssqlUuid) != 16 {
|
||||
return mssqlUuid
|
||||
return nil, errors.New("Invalid uuid")
|
||||
}
|
||||
|
||||
pgUuid := make([]byte, 16)
|
||||
pgUuid[0], pgUuid[1], pgUuid[2], pgUuid[3] = mssqlUuid[3], mssqlUuid[2], mssqlUuid[1], mssqlUuid[0]
|
||||
pgUuid[4], pgUuid[5] = mssqlUuid[5], mssqlUuid[4]
|
||||
pgUuid[6], pgUuid[7] = mssqlUuid[7], mssqlUuid[6]
|
||||
copy(pgUuid[8:], mssqlUuid[8:])
|
||||
|
||||
return pgUuid
|
||||
return pgUuid, nil
|
||||
}
|
||||
|
||||
const sridFlag = 0x20000000
|
||||
|
||||
func wkbToEwkbWithSrid(geometry []byte, srid int) []byte {
|
||||
func wkbToEwkbWithSrid(geometry []byte, srid int) ([]byte, error) {
|
||||
if len(geometry) < 5 {
|
||||
return geometry
|
||||
return nil, errors.New("Invalid wkb")
|
||||
}
|
||||
|
||||
var byteOrder binary.ByteOrder
|
||||
@@ -34,7 +36,7 @@ func wkbToEwkbWithSrid(geometry []byte, srid int) []byte {
|
||||
|
||||
wkbType := byteOrder.Uint32(geometry[1:5])
|
||||
if wkbType&sridFlag != 0 {
|
||||
return geometry
|
||||
return geometry, nil
|
||||
}
|
||||
|
||||
ewkbType := wkbType | sridFlag
|
||||
@@ -49,7 +51,7 @@ func wkbToEwkbWithSrid(geometry []byte, srid int) []byte {
|
||||
|
||||
copy(result[9:], geometry[5:])
|
||||
|
||||
return result
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func ensureUTC(t time.Time) time.Time {
|
||||
@@ -59,3 +61,20 @@ func ensureUTC(t time.Time) time.Time {
|
||||
|
||||
return time.Date(t.Year(), t.Month(), t.Day(), t.Hour(), t.Minute(), t.Second(), t.Nanosecond(), time.UTC)
|
||||
}
|
||||
|
||||
func ToInt64(v any) (int64, bool) {
|
||||
switch t := v.(type) {
|
||||
case int:
|
||||
return int64(t), true
|
||||
case int8:
|
||||
return int64(t), true
|
||||
case int16:
|
||||
return int64(t), true
|
||||
case int32:
|
||||
return int64(t), true
|
||||
case int64:
|
||||
return int64(t), true
|
||||
default:
|
||||
return 0, false
|
||||
}
|
||||
}
|
||||
|
||||
@@ -74,20 +74,20 @@ func processMigrationJob(sourceDb *sql.DB, targetDb *pgxpool.Pool, job Migration
|
||||
log.Infof("Extraction completed in %v", time.Since(extractStartTime))
|
||||
}()
|
||||
|
||||
chRowsTransform := make(chan []UnknownRowValues, QueueSize)
|
||||
chChunksTransform := make(chan []UnknownRowValues, QueueSize)
|
||||
var wgMssqlTransformers sync.WaitGroup
|
||||
|
||||
log.Infof("Starting %d MSSQL transformers...", maxExtractors)
|
||||
transformStartTime := time.Now()
|
||||
for range maxExtractors {
|
||||
wgMssqlTransformers.Go(func() {
|
||||
transformRowsMssql(sourceColTypes, chChunks, chRowsTransform)
|
||||
transformRowsMssql(sourceColTypes, chChunks, chChunksTransform, chJobErrors)
|
||||
})
|
||||
}
|
||||
|
||||
go func() {
|
||||
wgMssqlTransformers.Wait()
|
||||
close(chRowsTransform)
|
||||
close(chChunksTransform)
|
||||
log.Infof("Transformation completed in %v", time.Since(transformStartTime))
|
||||
}()
|
||||
|
||||
@@ -98,7 +98,7 @@ func processMigrationJob(sourceDb *sql.DB, targetDb *pgxpool.Pool, job Migration
|
||||
|
||||
for range NumLoaders {
|
||||
wgPostgresLoaders.Go(func() {
|
||||
if err := loadRowsPostgres(ctx, job, targetColTypes, targetDb, chRowsTransform); err != nil {
|
||||
if err := loadRowsPostgres(ctx, job, targetColTypes, targetDb, chChunksTransform); err != nil {
|
||||
log.Error("Unexpected error loading data into postgres: ", err)
|
||||
}
|
||||
})
|
||||
|
||||
@@ -6,26 +6,53 @@ import (
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
func transformRowsMssql(columns []ColumnType, in <-chan []UnknownRowValues, out chan<- []UnknownRowValues) {
|
||||
func transformRowsMssql(
|
||||
columns []ColumnType,
|
||||
chChunksIn <-chan []UnknownRowValues,
|
||||
chChunksOut chan<- []UnknownRowValues,
|
||||
chJobErrorsOut chan<- JobError,
|
||||
) {
|
||||
chunkCount := 0
|
||||
totalRowsTransformed := 0
|
||||
|
||||
for rows := range in {
|
||||
for rows := range chChunksIn {
|
||||
chunkStartTime := time.Now()
|
||||
log.Debugf("Chunk #%d received, transforming %d rows...", chunkCount+1, len(rows))
|
||||
log.Debugf("Chunk received, transforming %d rows...", len(rows))
|
||||
|
||||
for _, rowValues := range rows {
|
||||
for i, col := range columns {
|
||||
value := rowValues[i]
|
||||
if col.SystemType() == "uniqueidentifier" {
|
||||
|
||||
switch col.SystemType() {
|
||||
case "uniqueidentifier":
|
||||
if b, ok := value.([]byte); ok {
|
||||
rowValues[i] = mssqlUuidToBigEndian(b)
|
||||
pgUuid, err := mssqlUuidToBigEndian(b)
|
||||
if err != nil {
|
||||
jobError := JobError{
|
||||
ShouldCancelJob: true,
|
||||
Prev: err,
|
||||
}
|
||||
} else if col.SystemType() == "geometry" || col.SystemType() == "geography" {
|
||||
chJobErrorsOut <- jobError
|
||||
return
|
||||
}
|
||||
rowValues[i] = pgUuid
|
||||
}
|
||||
|
||||
case "geometry", "geography":
|
||||
if b, ok := value.([]byte); ok {
|
||||
rowValues[i] = wkbToEwkbWithSrid(b, 4326)
|
||||
ewkb, err := wkbToEwkbWithSrid(b, 4326)
|
||||
if err != nil {
|
||||
jobError := JobError{
|
||||
ShouldCancelJob: true,
|
||||
Prev: err,
|
||||
}
|
||||
} else if col.SystemType() == "datetime" || col.SystemType() == "datetime2" {
|
||||
chJobErrorsOut <- jobError
|
||||
return
|
||||
}
|
||||
rowValues[i] = ewkb
|
||||
}
|
||||
|
||||
case "datetime", "datetime2":
|
||||
if t, ok := value.(time.Time); ok {
|
||||
rowValues[i] = ensureUTC(t)
|
||||
}
|
||||
@@ -37,26 +64,9 @@ func transformRowsMssql(columns []ColumnType, in <-chan []UnknownRowValues, out
|
||||
totalRowsTransformed += len(rows)
|
||||
chunkDuration := time.Since(chunkStartTime)
|
||||
rowsPerSec := float64(len(rows)) / chunkDuration.Seconds()
|
||||
log.Infof("Transformed chunk #%d: %d rows in %v (%.0f rows/sec) - Total: %d rows",
|
||||
chunkCount, len(rows), chunkDuration, rowsPerSec, totalRowsTransformed)
|
||||
log.Infof("Transformed chunk: %d rows in %v (%.0f rows/sec) - Total: %d rows",
|
||||
len(rows), chunkDuration, rowsPerSec, totalRowsTransformed)
|
||||
|
||||
out <- rows
|
||||
}
|
||||
}
|
||||
|
||||
func ToInt64(v any) (int64, bool) {
|
||||
switch t := v.(type) {
|
||||
case int:
|
||||
return int64(t), true
|
||||
case int8:
|
||||
return int64(t), true
|
||||
case int16:
|
||||
return int64(t), true
|
||||
case int32:
|
||||
return int64(t), true
|
||||
case int64:
|
||||
return int64(t), true
|
||||
default:
|
||||
return 0, false
|
||||
chChunksOut <- rows
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user