1 Commits

48 changed files with 1075 additions and 3940 deletions

View File

@@ -1,24 +0,0 @@
# Skill Registry — go-migrate
Generated: 2026-04-21
## Compact Rules
### Go conventions
- Use existing error wrapping pattern: `fmt.Errorf("context: %w", err)`
- Channel-based pipeline — keep goroutine lifecycle clean (close channels in correct order)
- No comments unless non-obvious WHY; no docstrings
- Prefer named returns only when it aids clarity in short functions
- Use `strings.EqualFold` for case-insensitive column name comparison
### Project conventions
- Config structs live in `internal/app/config/`
- ETL interfaces live in `internal/app/etl/types.go`
- Transformer implementations in `internal/app/etl/transformers/`
- Azure operations via `internal/app/azure/main.go`
- Per-job transformer creation (not shared) when job has storage config
## User Skills
| Trigger | Skill |
|---------|-------|
| sdd-* | SDD workflow skills |

View File

@@ -1,12 +1,2 @@
SOURCE_DB_URL=sqlserver://sa:password@localhost:1433?database=master&packet+size=32767&loc=UTC
TARGET_DB_URL=postgresql://postgres:password@localhost:5432/db
LOG_LEVEL=INFO
AZ_STORAGE_ENABLED=false
AZ_ACCOUNT_NAME=
AZ_CONTAINER=
AZ_ACCOUNT_KEY=
AZ_USE_HTTPS=true
AZ_SERVICE_URL=
AZ_PREFIX=
PG_FROM_DB_URL=postgresql://postgres:password@localhost:5432/db
PG_TO_DB_URL=postgresql://postgres:password@localhost:5432/db

2
.gitignore vendored
View File

@@ -27,5 +27,5 @@ go.work.sum
# Editor/IDE
# .idea/
.vscode/
# .vscode/
.temp

View File

@@ -3,7 +3,6 @@ package main
import (
"time"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
log "github.com/sirupsen/logrus"
)
@@ -14,13 +13,5 @@ func configureLog() {
DisableSorting: false,
PadLevelText: true,
})
logLevelEnv := config.App.LogLevel
logLevel, err := log.ParseLevel(logLevelEnv)
if err != nil {
log.Warnf("Nivel de log inválido '%s', usando INFO por defecto", logLevelEnv)
logLevel = log.InfoLevel
}
log.SetLevel(logLevel)
log.SetLevel(log.DebugLevel)
}

View File

@@ -2,16 +2,15 @@ package main
import (
"context"
"flag"
"sync"
"time"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/azure"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
dbwrapper "git.ksdemosapps.com/kylesoda/go-migrate/internal/app/db-wrapper"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/db-wrapper"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/etl/extractors"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/etl/loaders"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/etl/table_analyzers"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/etl/transformers"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
log "github.com/sirupsen/logrus"
"golang.org/x/sync/errgroup"
@@ -20,23 +19,12 @@ import (
func main() {
configureLog()
configPath := flag.String("config", "", "path to migration config file")
flag.Parse()
if flag.NArg() > 1 {
log.Fatalf("only one config file path is allowed")
}
if *configPath == "" && flag.NArg() == 1 {
*configPath = flag.Arg(0)
}
migrationConfig, err := config.ReadMigrationConfig(*configPath)
migrationConfig, err := config.ReadMigrationConfig()
if err != nil {
log.Fatalf("error leyendo configuracion: %v", err)
}
// log.Debugf("Config: %+v", migrationConfig)
log.Debugf("Config: %+v", migrationConfig)
startTime := time.Now()
@@ -98,7 +86,7 @@ func main() {
log.Infof("Migración terminada. Tablas: %d, Errores: %d, Filas totales: %d", len(results), totalErrors, totalProcessed)
totalDuration := time.Since(startTime)
// log.Infof("=== Migration completed successfully! ===")
log.Infof("=== Migration completed successfully! ===")
log.Infof("Total migration time: %v", totalDuration)
}
@@ -130,18 +118,10 @@ func processMigrationJobs(
sourceTableAnalyzer := table_analyzers.NewMssqlTableAnalyzer(sourceDb)
targetTableAnalyzer := table_analyzers.NewPostgresTableAnalyzer(targetDb)
extractor := extractors.NewExtractor(sourceDb)
extractor := extractors.NewMssqlExtractor(sourceDb)
transformer := transformers.NewMssqlTransformer()
loader := loaders.NewGenericLoader(targetDb)
var azureClient *azure.Client
if config.App.AzureStorage.Enabled {
var err error
azureClient, err = azure.NewClient(config.App.AzureStorage)
if err != nil {
log.Fatalf("Failed to create Azure storage client: %v", err)
}
}
for i := range maxParallelWorkers {
wgJobs.Go(func() {
for job := range chJobs {
@@ -152,10 +132,9 @@ func processMigrationJobs(
sourceTableAnalyzer,
targetTableAnalyzer,
extractor,
azureClient,
transformer,
loader,
job,
targetDb.GetDialect(),
)
chJobResults <- res

View File

@@ -7,48 +7,26 @@ import (
"sync/atomic"
"time"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/azure"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/custom_errors"
dbwrapper "git.ksdemosapps.com/kylesoda/go-migrate/internal/app/db-wrapper"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/etl"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/etl/extractors"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/etl/loaders"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/etl/table_analyzers"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/etl/transformers"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
log "github.com/sirupsen/logrus"
"golang.org/x/sync/errgroup"
)
const jobErrorsChannelSize int = 100
// buildTruncateQuery returns the SQL statement that empties schema.table on
// the target database.
//
// truncateMethod selects the statement: exactly "DELETE" yields DELETE FROM,
// any other value yields TRUNCATE TABLE. targetDbType selects identifier
// quoting: "postgres" uses double quotes, any other value uses square
// brackets. schema and table are interpolated as-is, without escaping.
func buildTruncateQuery(targetDbType, schema, table, truncateMethod string) string {
	verb := "TRUNCATE TABLE"
	if truncateMethod == "DELETE" {
		verb = "DELETE FROM"
	}

	switch targetDbType {
	case "postgres":
		return fmt.Sprintf(`%s "%s"."%s"`, verb, schema, table)
	default:
		return fmt.Sprintf(`%s [%s].[%s]`, verb, schema, table)
	}
}
func processMigrationJob(
ctx context.Context,
targetDbWrapper dbwrapper.DbWrapper,
sourceTableAnalyzer etl.TableAnalyzer,
targetTableAnalyzer etl.TableAnalyzer,
extractor extractors.GenericExtractor,
azureClient *azure.Client,
loader loaders.GenericLoader,
extractor etl.Extractor,
transformer etl.Transformer,
loader etl.Loader,
job config.Job,
targetDbType string,
) models.JobResult {
transformer := transformers.NewMssqlTransformer(job.ToStorage, job.SourceTable, azureClient)
localCtx, cancel := context.WithCancel(ctx)
defer cancel()
@@ -57,6 +35,8 @@ func processMigrationJob(
StartTime: time.Now(),
}
var rowsRead, rowsLoaded, rowsFailed int64
var wgQueryColumnTypes errgroup.Group
var sourceColTypes, targetColTypes []models.ColumnType
@@ -86,13 +66,7 @@ func processMigrationJob(
return result
}
preSqlQueries := job.TargetTable.PreSQL
if job.TruncateTarget {
truncateQuery := buildTruncateQuery(targetDbType, job.TargetTable.Schema, job.TargetTable.Table, job.TruncateMethod)
preSqlQueries = append([]string{truncateQuery}, job.TargetTable.PreSQL...)
}
for _, query := range preSqlQueries {
for _, query := range job.PreSQL {
if _, err := targetDbWrapper.Exec(localCtx, query); err != nil {
result.Error = err
return result
@@ -104,22 +78,24 @@ func processMigrationJob(
sourceTableAnalyzer,
job.SourceTable.TableInfo,
job.SourceTable.PrimaryKey,
job.PartitionCalculationStrategy,
job.RowsPerPartition,
job.Range,
)
if err != nil {
log.Error("Unexpected error calculating batch ranges: ", err)
}
chJobErrors := make(chan custom_errors.JobError, jobErrorsChannelSize)
chPartitions := make(chan models.Partition)
chBatchesRaw := make(chan models.Batch, job.ExtractorQueueSize)
chBatchesTransformed := make(chan models.Batch, job.TransformerQueueSize)
chJobErrors := make(chan custom_errors.JobError, job.QueueSize)
chExtractorErrors := make(chan custom_errors.ExtractorError, job.QueueSize)
chLoadersErrors := make(chan custom_errors.LoaderError, job.QueueSize)
chPartitions := make(chan models.Partition, job.QueueSize)
chBatchesRaw := make(chan models.Batch, job.QueueSize)
chBatchesTransformed := make(chan models.Batch, job.QueueSize)
var wgActivePartitions, wgActiveBatches, wgExtractors, wgTransformers, wgLoaders sync.WaitGroup
var rowsRead, rowsLoaded, rowsFailed int64
var failedPartitionsCount, failedBatchesLoadCount int32
var wgActivePartitions sync.WaitGroup
var wgActiveBatches sync.WaitGroup
var wgExtractors sync.WaitGroup
var wgTransformers sync.WaitGroup
var wgLoaders sync.WaitGroup
go func() {
if err := custom_errors.JobErrorHandler(localCtx, chJobErrors); err != nil {
@@ -129,24 +105,41 @@ func processMigrationJob(
}
}()
go custom_errors.ExtractorErrorHandler(
localCtx,
job.Retry,
job.MaxPartitionErrrors,
chExtractorErrors,
chPartitions,
chJobErrors,
&wgActivePartitions,
)
go custom_errors.LoaderErrorHandler(
localCtx,
job.Retry,
job.MaxChunkErrors,
chLoadersErrors,
chBatchesTransformed,
chJobErrors,
&wgActiveBatches,
)
maxExtractors := min(job.MaxExtractors, len(partitions))
log.Infof("Starting %d extractor(s)... (%v)", maxExtractors, job.Name)
log.Infof("Starting %d extractor(s)...", maxExtractors)
for range maxExtractors {
wgExtractors.Go(func() {
extractor.Consume(
extractor.Exec(
localCtx,
job.SourceTable,
sourceColTypes,
job.ExtractorBatchSize,
job.Retry,
job.BatchSize,
chPartitions,
chBatchesRaw,
chExtractorErrors,
chJobErrors,
&wgActivePartitions,
&rowsRead,
&failedPartitionsCount,
job.SourceTable.FromJsonColumns,
)
})
}
@@ -158,15 +151,13 @@ func processMigrationJob(
}
}()
log.Infof("Starting %d transformer(s)... (%v)", maxExtractors, job.Name)
log.Infof("Starting %d transformer(s)...", maxExtractors)
for range maxExtractors {
wgTransformers.Go(func() {
transformer.Consume(
transformer.Exec(
localCtx,
sourceColTypes,
job.Retry,
job.TransformerBatchSize,
chBatchesRaw,
chBatchesTransformed,
chJobErrors,
@@ -175,62 +166,64 @@ func processMigrationJob(
})
}
log.Infof("Starting %d loader(s)... (%v)", job.MaxLoaders, job.Name)
log.Infof("Starting %d loader(s)...", job.MaxLoaders)
for range job.MaxLoaders {
wgLoaders.Go(func() {
loader.Consume(
loader.Exec(
localCtx,
job.TargetTable,
targetColTypes,
job.Retry,
job.LoaderBatchSize,
chBatchesTransformed,
chLoadersErrors,
chJobErrors,
&wgActiveBatches,
&rowsLoaded,
&failedBatchesLoadCount,
)
})
}
go func() {
// log.Debugf("Waiting for goroutines (%v)", job.Name)
log.Debugf("Waiting for goroutines (%v)", job.Name)
wgActivePartitions.Wait()
// log.Debugf("wgActivePartitions is empty (%v)", job.Name)
log.Debugf("wgActivePartitions is empty (%v)", job.Name)
close(chPartitions)
// log.Debugf("chPartitions is closed (%v)", job.Name)
log.Debugf("chPartitions is closed (%v)", job.Name)
close(chExtractorErrors)
log.Debugf("chExtractorErrors is closed (%v)", job.Name)
wgExtractors.Wait()
// log.Debugf("wgExtractors is empty (%v)", job.Name)
log.Debugf("wgExtractors is empty (%v)", job.Name)
close(chBatchesRaw)
// log.Debugf("chBatchesRaw is closed (%v)", job.Name)
log.Debugf("chBatchesRaw is closed (%v)", job.Name)
wgTransformers.Wait()
// log.Debugf("wgTransformers is empty (%v)", job.Name)
close(chBatchesTransformed)
// log.Debugf("chBatchesTransformed is closed (%v)", job.Name)
log.Debugf("wgTransformers is empty (%v)", job.Name)
wgActiveBatches.Wait()
// log.Debugf("wgActiveBatches is empty (%v)", job.Name)
log.Debugf("wgActiveBatches is empty (%v)", job.Name)
close(chBatchesTransformed)
log.Debugf("chBatchesTransformed is empty (%v)", job.Name)
close(chLoadersErrors)
log.Debugf("chLoadersErrors is empty (%v)", job.Name)
wgLoaders.Wait()
// log.Debugf("wgLoaders is empty (%v)", job.Name)
log.Debugf("wgLoaders is empty (%v)", job.Name)
cancel()
}()
for _, query := range job.TargetTable.PostSQL {
for _, query := range job.PostSQL {
if _, err := targetDbWrapper.Exec(localCtx, query); err != nil {
result.Error = err
return result
}
}
// log.Debugf("waiting for local context to be done (%v)", job.Name)
log.Debugf("waiting for local context to be done (%v)", job.Name)
<-localCtx.Done()
// log.Debugf("local context done (%v)", job.Name)
log.Debugf("local context done (%v)", job.Name)
if ctx.Err() != nil {
result.Error = ctx.Err()
@@ -245,9 +238,5 @@ func processMigrationJob(
result.Error = fmt.Errorf("Row count mismatch: extracted %d rows but loaded %d rows (failed: %d)", result.RowsRead, result.RowsLoaded, result.RowsFailed)
}
if result.RowsRead == 0 {
log.Warnf("No rows extracted from (%v)", job.Name)
}
return result
}

View File

@@ -1,27 +1,22 @@
max_parallel_workers: 4
source_db_type: sqlserver
target_db_type: postgres
target_db_type: sqlserver
defaults:
batches_per_partition: 4
max_extractors: 2
extractor_batch_size: 5000
extractor_queue_size: 8
max_transformers: 2
transformer_batch_size: 12500
transformer_queue_size: 8
max_loaders: 4
loader_batch_size: 25000
partition_calculation_strategy: EXACT # EXACT | ESTIMATION
queue_size: 8
batch_size: 25000
batches_per_partition: 8
truncate_target: true
truncate_method: TRUNCATE # TRUNCATE | DELETE
max_partition_errrors: 5
max_chunk_errors: 5
retry:
attempts: 3
base_delay_ms: 500
max_delay_ms: 10000
max_jitter_ms: 500
max_failed_partitions: 5
max_failed_batches_load: 5
jobs:
- name: cartografia_manzana
@@ -35,6 +30,7 @@ jobs:
table: MANZANA
pre_sql:
- 'SELECT 1'
# - 'TRUNCATE TABLE "Cartografia"."MANZANA"'
range:
min: 1000000
max: 2000000
@@ -47,42 +43,11 @@ jobs:
schema: Red
table: PUERTO
primary_key: ID_PUERTO
from_json:
- column: $node_id*
field: id
target:
schema: Red
table: PUERTO
pre_sql:
- 'SELECT 1'
# - 'TRUNCATE TABLE "Red"."PUERTO"'
post_sql:
- "SELECT 1"
- name: infraestructura_site_holder__attach
source:
schema: Infraestructura
table: SITE_HOLDER__ATTACH
primary_key: GDB_ARCHIVE_OID
target:
schema: Infraestructura
table: SITE_HOLDER__ATTACH
to_storage:
columns:
- source: DATA
target: FILE_URL
mode: REFERENCE_ONLY
prefix: Infraestructura/SITE_HOLDER__ATTACH
batches_per_partition: 20
max_extractors: 32
extractor_batch_size: 1
extractor_queue_size: 100
max_transformers: 48
transformer_batch_size: 500
transformer_queue_size: 8
max_loaders: 4
loader_batch_size: 500
retry:
attempts: 5
base_delay_ms: 1000
max_delay_ms: 15000
max_jitter_ms: 500

12
go.mod
View File

@@ -1,13 +1,12 @@
module git.ksdemosapps.com/kylesoda/go-migrate
go 1.26
go 1.25.7
require (
github.com/Azure/azure-sdk-for-go/sdk/storage/azblob v1.6.4
github.com/gaspardle/go-mssqlclrgeo v0.0.0-20160129143314-97ceabf987a4
github.com/google/uuid v1.6.0
github.com/ilyakaznacheev/cleanenv v1.5.0
github.com/jackc/pgx/v5 v5.9.1
github.com/joho/godotenv v1.5.1
github.com/microsoft/go-mssqldb v1.9.8
github.com/sirupsen/logrus v1.9.4
github.com/twpayne/go-geom v1.6.1
@@ -16,20 +15,15 @@ require (
)
require (
github.com/Azure/azure-sdk-for-go/sdk/azcore v1.21.0 // indirect
github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2 // indirect
github.com/BurntSushi/toml v1.6.0 // indirect
github.com/golang-sql/civil v0.0.0-20220223132316-b832511892a9 // indirect
github.com/golang-sql/sqlexp v0.1.0 // indirect
github.com/jackc/pgpassfile v1.0.0 // indirect
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect
github.com/jackc/puddle/v2 v2.2.2 // indirect
github.com/joho/godotenv v1.5.1 // indirect
github.com/kr/text v0.2.0 // indirect
github.com/rogpeppe/go-internal v1.14.1 // indirect
github.com/shopspring/decimal v1.4.0 // indirect
golang.org/x/crypto v0.48.0 // indirect
golang.org/x/net v0.51.0 // indirect
golang.org/x/sys v0.41.0 // indirect
golang.org/x/text v0.34.0 // indirect
olympos.io/encoding/edn v0.0.0-20201019073823-d3554ca0b0a3 // indirect
)

16
go.sum
View File

@@ -4,25 +4,19 @@ github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1 h1:Hk5QBxZQC1jb2Fwj6mpz
github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1/go.mod h1:IYus9qsFobWIc2YVwe/WPjcnyCkPKtnHAqUYeebc8z0=
github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2 h1:9iefClla7iYpfYWdzPCRDozdmndjTm8DXdpCzPajMgA=
github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2/go.mod h1:XtLgD3ZD34DAaVIIAyG3objl5DynM3CQ/vMcbBNJZGI=
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/storage/armstorage v1.8.1 h1:/Zt+cDPnpC3OVDm/JKLOs7M2DKmLRIIp3XIx9pHHiig=
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/storage/armstorage v1.8.1/go.mod h1:Ng3urmn6dYe8gnbCMoHHVl5APYz2txho3koEkV2o2HA=
github.com/Azure/azure-sdk-for-go/sdk/security/keyvault/azkeys v1.4.0 h1:E4MgwLBGeVB5f2MdcIVD3ELVAWpr+WD6MUe1i+tM/PA=
github.com/Azure/azure-sdk-for-go/sdk/security/keyvault/azkeys v1.4.0/go.mod h1:Y2b/1clN4zsAoUd/pgNAQHjLDnTis/6ROkUfyob6psM=
github.com/Azure/azure-sdk-for-go/sdk/security/keyvault/internal v1.2.0 h1:nCYfgcSyHZXJI8J0IWE5MsCGlb2xp9fJiXyxWgmOFg4=
github.com/Azure/azure-sdk-for-go/sdk/security/keyvault/internal v1.2.0/go.mod h1:ucUjca2JtSZboY8IoUqyQyuuXvwbMBVwFOm0vdQPNhA=
github.com/Azure/azure-sdk-for-go/sdk/storage/azblob v1.6.4 h1:jWQK1GI+LeGGUKBADtcH2rRqPxYB1Ljwms5gFA2LqrM=
github.com/Azure/azure-sdk-for-go/sdk/storage/azblob v1.6.4/go.mod h1:8mwH4klAm9DUgR2EEHyEEAQlRDvLPyg5fQry3y+cDew=
github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0 h1:XRzhVemXdgvJqCH0sFfrBUTnUJSBrBf7++ypk+twtRs=
github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0/go.mod h1:HKpQxkWaGLJ+D/5H8QRpyQXA1eKjxkFlOMwck5+33Jk=
github.com/BurntSushi/toml v1.2.1/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ=
github.com/BurntSushi/toml v1.6.0 h1:dRaEfpa2VI55EwlIW72hMRHdWouJeRF7TPYhI+AUQjk=
github.com/BurntSushi/toml v1.6.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho=
github.com/DATA-DOG/go-sqlmock v1.5.2 h1:OcvFkGmslmlZibjAjaHm3L//6LiuBgolP7OputlJIzU=
github.com/DATA-DOG/go-sqlmock v1.5.2/go.mod h1:88MAG/4G7SMwSE3CeA0ZKzrT5CiOU3OJ+JlNzwDqpNU=
github.com/alecthomas/assert/v2 v2.10.0 h1:jjRCHsj6hBJhkmhznrCzoNpbA3zqy0fYiUcYZP/GkPY=
github.com/alecthomas/assert/v2 v2.10.0/go.mod h1:Bze95FyfUr7x34QZrjL+XP+0qgp/zg8yS+TtBj1WA3k=
github.com/alecthomas/repr v0.4.0 h1:GhI2A8MACjfegCPVq9f1FLvIBS+DrQ2KQBFZP1iFzXc=
github.com/alecthomas/repr v0.4.0/go.mod h1:Fr0507jx4eOXV7AlPV6AVZLYrLIuIeSOWtW57eE/O/4=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
@@ -38,8 +32,6 @@ github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM=
github.com/hexops/gotextdiff v1.0.3/go.mod h1:pSWU5MAI3yDq+fZBTazCSJysOMbxWL1BSow5/V2vxeg=
github.com/ilyakaznacheev/cleanenv v1.5.0 h1:0VNZXggJE2OYdXE87bfSSwGxeiGt9moSR2lOrsHHvr4=
github.com/ilyakaznacheev/cleanenv v1.5.0/go.mod h1:a5aDzaJrLCQZsazHol1w8InnDcOX0OColm64SlIi6gk=
github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM=
github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg=
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 h1:iCEnooe7UlwOQYpKFhBabPMi4aNAfoODPEFNiAnClxo=
@@ -50,8 +42,8 @@ github.com/jackc/puddle/v2 v2.2.2 h1:PR8nw+E/1w0GLuRFSmiioY6UooMp6KJv0/61nB7icHo
github.com/jackc/puddle/v2 v2.2.2/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4=
github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0=
github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0=
github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
@@ -91,5 +83,3 @@ gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EV
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
olympos.io/encoding/edn v0.0.0-20201019073823-d3554ca0b0a3 h1:slmdOY3vp8a7KQbHkL+FLbvbkgMqmXojpFUO/jENuqQ=
olympos.io/encoding/edn v0.0.0-20201019073823-d3554ca0b0a3/go.mod h1:oVgVk4OWVDi43qWBEyGhXgYxt7+ED4iYNpTngSLX2Iw=

View File

@@ -1,89 +0,0 @@
package azure
import (
"context"
"errors"
"fmt"
"net/http"
"net/url"
"path"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob"
"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/blob"
)
// Sentinel errors declared by the azure package.
var (
ErrInvalidConnectionString = errors.New("invalid connection string")
ErrContainerNotFound = errors.New("container not found")
ErrBlobNotFound = errors.New("blob not found")
// ErrInvalidInput is returned by the Client methods in this file when a
// required string argument is empty or the buffer is nil.
ErrInvalidInput = errors.New("invalid input parameters")
)
// Client pairs an azblob.Client with the AzureStorageConfig it was created
// from; the config supplies the container and prefix used by UploadAndGetURL.
type Client struct {
client *azblob.Client
azureStorageConfig config.AzureStorageConfig
}
// NewClient builds a Client from azureStorageConfig.
//
// The connection string uses "https" unless UseHTTPS is false, and its
// BlobEndpoint is ServiceURL joined with AccountName. An error is returned
// when ServiceURL cannot be joined into an endpoint or when the SDK rejects
// the resulting connection string.
func NewClient(azureStorageConfig config.AzureStorageConfig) (*Client, error) {
	protocol := "https"
	if !azureStorageConfig.UseHTTPS {
		protocol = "http"
	}

	// Surface a malformed ServiceURL here instead of passing an empty
	// BlobEndpoint on to the SDK.
	blobEndpoint, err := url.JoinPath(azureStorageConfig.ServiceURL, azureStorageConfig.AccountName)
	if err != nil {
		return nil, fmt.Errorf("building blob endpoint from %q: %w", azureStorageConfig.ServiceURL, err)
	}

	connStr := fmt.Sprintf("DefaultEndpointsProtocol=%s;AccountName=%s;AccountKey=%s;BlobEndpoint=%s;",
		protocol, azureStorageConfig.AccountName, azureStorageConfig.AccountKey, blobEndpoint)

	client, err := azblob.NewClientFromConnectionString(connStr, nil)
	if err != nil {
		return nil, fmt.Errorf("creating azure storage client: %w", err)
	}

	return &Client{
		client:             client,
		azureStorageConfig: azureStorageConfig,
	}, nil
}
// CreateContainer creates the container named containerName through the
// wrapped azblob client. It returns ErrInvalidInput when containerName is
// empty and wraps any error reported by the SDK.
func (c *Client) CreateContainer(ctx context.Context, containerName string) error {
	if len(containerName) == 0 {
		return ErrInvalidInput
	}

	if _, err := c.client.CreateContainer(ctx, containerName, nil); err != nil {
		return fmt.Errorf("creating container %s: %w", containerName, err)
	}
	return nil
}
// UploadBuffer uploads buffer to blobPath inside containerName using the
// SDK's default upload options. It returns ErrInvalidInput when containerName
// or blobPath is empty or buffer is nil (an empty non-nil buffer is accepted),
// and wraps any error reported by the SDK.
func (c *Client) UploadBuffer(ctx context.Context, containerName, blobPath string, buffer []byte) error {
	switch {
	case containerName == "", blobPath == "", buffer == nil:
		return ErrInvalidInput
	}

	if _, err := c.client.UploadBuffer(ctx, containerName, blobPath, buffer, nil); err != nil {
		return fmt.Errorf("uploading blob %s: %w", blobPath, err)
	}
	return nil
}
// UploadAndGetURL uploads buffer to the configured container under the
// configured prefix joined with blobPath, setting the blob content type to
// the value sniffed by http.DetectContentType.
//
// Despite its name, the returned string is the blob path inside the container
// (prefix + blobPath), not an absolute URL. It returns ErrInvalidInput when
// blobPath is empty or buffer is nil, and wraps any error reported by the SDK.
func (c *Client) UploadAndGetURL(ctx context.Context, blobPath string, buffer []byte) (string, error) {
	if buffer == nil || blobPath == "" {
		return "", ErrInvalidInput
	}

	destination := path.Join(c.azureStorageConfig.Prefix, blobPath)
	detectedType := http.DetectContentType(buffer)
	uploadOpts := &azblob.UploadBufferOptions{
		HTTPHeaders: &blob.HTTPHeaders{BlobContentType: &detectedType},
	}

	_, err := c.client.UploadBuffer(ctx, c.azureStorageConfig.Container, destination, buffer, uploadOpts)
	if err != nil {
		return "", fmt.Errorf("uploading blob %s: %w", destination, err)
	}
	return destination, nil
}

View File

@@ -1,41 +1,41 @@
package config
import (
"github.com/ilyakaznacheev/cleanenv"
"os"
"github.com/joho/godotenv"
log "github.com/sirupsen/logrus"
)
type AzureStorageConfig struct {
AccountName string `env:"AZ_ACCOUNT_NAME"`
Container string `env:"AZ_CONTAINER"`
AccountKey string `env:"AZ_ACCOUNT_KEY"`
UseHTTPS bool `env:"AZ_USE_HTTPS" env-default:"true"`
ServiceURL string `env:"AZ_SERVICE_URL"`
Prefix string `env:"AZ_PREFIX"`
Enabled bool `env:"AZ_STORAGE_ENABLED"`
type appConfig struct {
SourceDbUrl string
TargetDbUrl string
}
type appConfig struct {
SourceDbUrl string `env:"SOURCE_DB_URL" env-required:"true"`
TargetDbUrl string `env:"TARGET_DB_URL" env-required:"true"`
LogLevel string `env:"LOG_LEVEL" env-default:"INFO"`
AzureStorage AzureStorageConfig
func loadEnv() {
err := godotenv.Load()
if err != nil {
log.Warn("Warning: could not load .env file")
}
}
func getAppConfig() appConfig {
var cfg appConfig
loadEnv()
err := cleanenv.ReadConfig(".env", &cfg)
if err != nil {
log.Warn("Could not load .env file")
sourceDbUrl := os.Getenv("SOURCE_DB_URL")
if sourceDbUrl == "" {
log.Fatal("SOURCE_DB_URL environment variable not set")
}
err = cleanenv.ReadEnv(&cfg)
if err != nil {
log.Fatalf("Error al cargar variables: %v", err)
targetDbUrl := os.Getenv("TARGET_DB_URL")
if targetDbUrl == "" {
log.Fatal("TARGET_DB_URL environment variable not set")
}
return cfg
return appConfig{
SourceDbUrl: sourceDbUrl,
TargetDbUrl: targetDbUrl,
}
}
var App appConfig = getAppConfig()

View File

@@ -12,42 +12,20 @@ type RetryConfig struct {
BaseDelayMs int `yaml:"base_delay_ms"`
MaxDelayMs int `yaml:"max_delay_ms"`
MaxJitterMs int `yaml:"max_jitter_ms"`
MaxFailedPartitions int `yaml:"max_failed_partitions"`
MaxFailedBatchesLoad int `yaml:"max_failed_batches_load"`
}
type ToStorageColumnConfig struct {
Source string `yaml:"source"`
Target string `yaml:"target"`
Mode string `yaml:"mode"`
Prefix string `yaml:"prefix"`
}
type ToStorageConfig struct {
Columns []ToStorageColumnConfig `yaml:"columns"`
}
type JobConfig struct {
BatchesPerPartition int `yaml:"batches_per_partition"`
MaxExtractors int `yaml:"max_extractors"`
ExtractorBatchSize int `yaml:"extractor_batch_size"`
ExtractorQueueSize int `yaml:"extractor_queue_size"`
MaxTransformers int `yaml:"max_transformers"`
TransformerBatchSize int `yaml:"transformer_batch_size"`
TransformerQueueSize int `yaml:"transformer_queue_size"`
MaxLoaders int `yaml:"max_loaders"`
LoaderBatchSize int `yaml:"loader_batch_size"`
PartitionCalculationStrategy string `yaml:"partition_calculation_strategy"`
QueueSize int `yaml:"queue_size"`
BatchSize int `yaml:"batch_size"`
BatchesPerPartition int `yaml:"batches_per_partition"`
TruncateTarget bool `yaml:"truncate_target"`
TruncateMethod string `yaml:"truncate_method"`
MaxPartitionErrrors int `yaml:"max_partition_errrors"`
MaxChunkErrors int `yaml:"max_chunk_errors"`
Retry RetryConfig `yaml:"retry"`
RowsPerPartition int64
ToStorage ToStorageConfig `yaml:"to_storage"`
}
type FromJsonItem struct {
Column string `yaml:"column"`
Field string `yaml:"field"`
}
type TableInfo struct {
@@ -55,23 +33,13 @@ type TableInfo struct {
Table string `yaml:"table"`
}
type TargetTableInfo struct {
TableInfo `yaml:",inline"`
}
type SourceTableInfo struct {
TableInfo `yaml:",inline"`
PrimaryKey string `yaml:"primary_key"`
FromJsonColumns []FromJsonItem `yaml:"from_json"`
}
type TargetTableInfo struct {
TableInfo `yaml:",inline"`
PreSQL []string `yaml:"pre_sql"`
PostSQL []string `yaml:"post_sql"`
}
type RangeConfig struct {
Min *int64 `yaml:"min"`
Max *int64 `yaml:"max"`
IsMinInclusive bool `yaml:"is_min_inclusive"`
IsMaxInclusive bool `yaml:"is_max_inclusive"`
}
type Job struct {
@@ -79,8 +47,15 @@ type Job struct {
Enabled bool `yaml:"enabled"`
SourceTable SourceTableInfo `yaml:"source"`
TargetTable TargetTableInfo `yaml:"target"`
PreSQL []string `yaml:"pre_sql"`
PostSQL []string `yaml:"post_sql"`
JobConfig `yaml:",inline"`
Range RangeConfig `yaml:"range"`
Range struct {
Min int64 `yaml:"min"`
Max int64 `yaml:"max"`
IsMinInclusive bool `yaml:"is_min_inclusive"`
IsMaxInclusive bool `yaml:"is_max_inclusive"`
}
}
type MigrationConfig struct {
@@ -109,7 +84,7 @@ func (c *MigrationConfig) UnmarshalYAML(value *yaml.Node) error {
c.Defaults = raw.Defaults
c.SourceDbType = raw.SourceDbType
c.TargetDbType = raw.TargetDbType
c.Defaults.RowsPerPartition = int64(raw.Defaults.ExtractorBatchSize * raw.Defaults.BatchesPerPartition)
c.Defaults.RowsPerPartition = int64(raw.Defaults.BatchSize * raw.Defaults.BatchesPerPartition)
for _, node := range raw.Jobs {
job := Job{
@@ -120,7 +95,7 @@ func (c *MigrationConfig) UnmarshalYAML(value *yaml.Node) error {
return err
}
job.RowsPerPartition = int64(job.ExtractorBatchSize * job.BatchesPerPartition)
job.RowsPerPartition = int64(job.BatchSize * job.BatchesPerPartition)
c.Jobs = append(c.Jobs, job)
}

View File

@@ -1,11 +1,12 @@
package custom_errors
import (
"context"
"math/rand"
"time"
)
func ComputeBackoffDelay(retryCounter int, baseDelayMs int, maxDelayMs int, maxJitterMs int) time.Duration {
func computeBackoffDelay(retryCounter int, baseDelayMs int, maxDelayMs int, maxJitterMs int) time.Duration {
if retryCounter < 0 {
retryCounter = 0
}
@@ -39,3 +40,22 @@ func ComputeBackoffDelay(retryCounter int, baseDelayMs int, maxDelayMs int, maxJ
return delay
}
// requeueWithBackoff runs enqueue after delay has elapsed.
//
// A non-positive delay runs enqueue synchronously on the caller's goroutine.
// Otherwise a goroutine waits on a timer and then runs enqueue; if ctx is
// cancelled before the timer fires, enqueue is never called.
func requeueWithBackoff(ctx context.Context, delay time.Duration, enqueue func()) {
	if delay > 0 {
		go func() {
			wait := time.NewTimer(delay)
			defer wait.Stop()

			select {
			case <-wait.C:
				enqueue()
			case <-ctx.Done():
			}
		}()
		return
	}

	enqueue()
}

View File

@@ -0,0 +1,119 @@
package custom_errors
import (
"context"
"fmt"
"sync"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
"github.com/google/uuid"
)
// ExtractorError is the value extractors send to ExtractorErrorHandler when a
// partition fails.
type ExtractorError struct {
// Partition is the partition the handler either retries or gives up on.
Partition models.Partition
// LastId is only consulted when HasLastId is true; the handler then requeues
// a new partition whose range starts just after LastId (exclusive minimum).
LastId int64
HasLastId bool
// Msg is the text returned by Error.
Msg string
}
// Error returns Msg.
func (e *ExtractorError) Error() string {
return e.Msg
}
// ExtractorErrorHandler consumes extractor failures from chErrorsIn until the
// channel is closed or ctx is cancelled.
//
// For each failure:
//   - If the partition's RetryCounter has reached retryConfig.Attempts, the
//     partition is given up on: wgActivePartitions.Done is called, a
//     non-cancelling JobError is reported, and, once maxPartitionErrors (> 0)
//     such partitions have accumulated, a JobError with ShouldCancelJob set is
//     reported as well.
//   - Otherwise a non-cancelling JobError is reported and a copy of the
//     partition with RetryCounter+1 is sent to chPartitionsOut after a backoff
//     delay. When the failure carries a LastId, the copy gets a fresh Id, the
//     failed partition as ParentId, and a range that starts just after LastId.
//
// Every send gives up as soon as ctx is cancelled.
func ExtractorErrorHandler(
	ctx context.Context,
	retryConfig config.RetryConfig,
	maxPartitionErrors int,
	chErrorsIn <-chan ExtractorError,
	chPartitionsOut chan<- models.Partition,
	chJobErrorsOut chan<- JobError,
	wgActivePartitions *sync.WaitGroup,
) {
	// report forwards jobErr unless ctx is cancelled first; false means the
	// handler must stop.
	report := func(jobErr JobError) bool {
		select {
		case chJobErrorsOut <- jobErr:
			return true
		case <-ctx.Done():
			return false
		}
	}

	exhausted := 0

	for ctx.Err() == nil {
		select {
		case <-ctx.Done():
			return

		case failure, open := <-chErrorsIn:
			if !open {
				return
			}

			failed := failure.Partition

			if failed.RetryCounter >= retryConfig.Attempts {
				// The partition will not be retried, so it stops counting as active.
				wgActivePartitions.Done()
				exhausted++

				gaveUp := JobError{
					ShouldCancelJob: false,
					Msg:             fmt.Sprintf("Partition %v reached max retries (%d)", failed.Id, retryConfig.Attempts),
					Prev:            &failure,
				}
				if !report(gaveUp) {
					return
				}

				if maxPartitionErrors > 0 && exhausted >= maxPartitionErrors {
					limitReached := JobError{
						ShouldCancelJob: true,
						Msg:             fmt.Sprintf("Partition error limit reached (%d)", maxPartitionErrors),
						Prev:            &failure,
					}
					if !report(limitReached) {
						return
					}
				}
				continue
			}

			transient := JobError{
				ShouldCancelJob: false,
				Msg:             fmt.Sprintf("Temporal error in partition %v (retries: %d)", failed.Id, failed.RetryCounter),
				Prev:            &failure,
			}
			if !report(transient) {
				return
			}

			retry := failed
			retry.RetryCounter++

			delay := computeBackoffDelay(
				retry.RetryCounter,
				retryConfig.BaseDelayMs,
				retryConfig.MaxDelayMs,
				retryConfig.MaxJitterMs,
			)

			if failure.HasLastId {
				// Resume after the last id that was read instead of redoing
				// the whole range.
				retry.ParentId = failed.Id
				retry.Id = uuid.New()
				retry.Range.Min = failure.LastId
				retry.Range.IsMinInclusive = false
			}

			requeueWithBackoff(ctx, delay, func() {
				select {
				case chPartitionsOut <- retry:
				case <-ctx.Done():
				}
			})
		}
	}
}

View File

@@ -0,0 +1,107 @@
package custom_errors
import (
"context"
"fmt"
"sync"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
)
// LoaderError is the value loaders send to LoaderErrorHandler when a batch
// fails.
type LoaderError struct {
// Batch is the batch the handler either retries or gives up on.
Batch models.Batch
// Msg is the text returned by Error.
Msg string
}
// Error returns Msg.
func (e *LoaderError) Error() string {
return e.Msg
}
// LoaderErrorHandler consumes loader failures from chErrorsIn until the
// channel is closed or ctx is cancelled.
//
// For each failure:
//   - If the batch's RetryCounter has reached retryConfig.Attempts, the batch
//     is given up on: wgActiveBatches.Done is called, a non-cancelling JobError
//     is reported, and, once maxChunkErrors (> 0) such batches have
//     accumulated, a JobError with ShouldCancelJob set is reported as well.
//   - Otherwise a non-cancelling JobError is reported and a copy of the batch
//     with RetryCounter+1 is sent to chBatchesOut after a backoff delay.
//
// Every send gives up as soon as ctx is cancelled.
func LoaderErrorHandler(
	ctx context.Context,
	retryConfig config.RetryConfig,
	maxChunkErrors int,
	chErrorsIn <-chan LoaderError,
	chBatchesOut chan<- models.Batch,
	chJobErrorsOut chan<- JobError,
	wgActiveBatches *sync.WaitGroup,
) {
	// report forwards jobErr unless ctx is cancelled first; false means the
	// handler must stop.
	report := func(jobErr JobError) bool {
		select {
		case chJobErrorsOut <- jobErr:
			return true
		case <-ctx.Done():
			return false
		}
	}

	definitiveErrors := 0

	for ctx.Err() == nil {
		select {
		case <-ctx.Done():
			return

		case err, ok := <-chErrorsIn:
			if !ok {
				return
			}

			if err.Batch.RetryCounter >= retryConfig.Attempts {
				// The batch will not be retried, so it stops counting as active.
				wgActiveBatches.Done()
				definitiveErrors++

				jobError := JobError{
					ShouldCancelJob: false,
					Msg:             fmt.Sprintf("Batch %v reached max retries (%d)", err.Batch.Id, retryConfig.Attempts),
					Prev:            &err,
				}
				if !report(jobError) {
					return
				}

				if maxChunkErrors > 0 && definitiveErrors >= maxChunkErrors {
					fatalError := JobError{
						ShouldCancelJob: true,
						Msg:             fmt.Sprintf("Chunk error limit reached (%d)", maxChunkErrors),
						Prev:            &err,
					}
					if !report(fatalError) {
						return
					}
				}
				continue
			}

			jobError := JobError{
				ShouldCancelJob: false,
				Msg:             fmt.Sprintf("Temporal error in batch %v (retries: %d)", err.Batch.Id, err.Batch.RetryCounter),
				Prev:            &err,
			}
			if !report(jobError) {
				return
			}

			// Retry a private copy: &err has already been handed to the
			// job-error consumer as Prev, so err itself must not be mutated.
			retryBatch := err.Batch
			retryBatch.RetryCounter++

			delay := computeBackoffDelay(
				retryBatch.RetryCounter,
				retryConfig.BaseDelayMs,
				retryConfig.MaxDelayMs,
				retryConfig.MaxJitterMs,
			)

			requeueWithBackoff(ctx, delay, func() {
				select {
				case chBatchesOut <- retryBatch:
				case <-ctx.Done():
				}
			})
		}
	}
}

View File

@@ -1,25 +0,0 @@
package custom_errors
import (
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
)
// ExtractorError describes a failure while extracting a partition.
// Partition is the partition being processed when the failure occurred.
// LastId is only meaningful when HasLastId is true; the extractors fill it
// from the primary-key value of the last row read before the failure.
type ExtractorError struct {
Partition models.Partition
LastId int64
HasLastId bool
Msg string
}
// Error implements the error interface by returning Msg unchanged.
func (e *ExtractorError) Error() string {
return e.Msg
}
// LoaderError pairs a models.Batch with the message describing why it failed.
type LoaderError struct {
Batch models.Batch
Msg string
}
// Error implements the error interface by returning Msg unchanged.
func (e *LoaderError) Error() string {
return e.Msg
}

View File

@@ -1,7 +0,0 @@
package db_dialects
// Dialect identifiers. SqlServer and Postgres are the names under which the
// corresponding DbWrapper factories are registered.
// NOTE(review): the purpose of Null is not visible from this file —
// presumably a placeholder/no-op dialect; confirm against its users.
const (
SqlServer string = "sqlserver"
Postgres string = "postgres"
Null string = "null"
)

View File

@@ -4,17 +4,13 @@ import (
"context"
"database/sql"
"fmt"
"strings"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
dbdialects "git.ksdemosapps.com/kylesoda/go-migrate/internal/app/db-wrapper/db_dialects"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
mssql "github.com/microsoft/go-mssqldb"
)
func init() {
Register(dbdialects.SqlServer, func() DbWrapper {
return &mssqlDbWrapper{dialect: dbdialects.SqlServer}
Register("sqlserver", func() DbWrapper {
return &mssqlDbWrapper{dialect: "sqlserver"}
})
}
@@ -178,140 +174,3 @@ func (mw *mssqlDbWrapper) SaveMassive(ctx context.Context, schema string, table
return rowsAffected, nil
}
// buildExtractQueryMssql renders the T-SQL SELECT statement for q.
//
// Projection: every entry of q.Columns is emitted in order as [name].
// GEOMETRY columns become [name].STAsBinary() AS [name]. A column targeted by
// one or more q.FromJsonColumns rules is replaced by one
// JSON_VALUE([name], '<path>') AS [name] expression per rule. When there are
// neither columns nor JSON rules the projection is "*".
//
// Filter: valid lower/upper limits compare q.PrimaryKey against the named
// parameters @min and @max; the result is always ordered by q.PrimaryKey
// ascending.
//
// An error is returned when a JSON rule's column pattern cannot be resolved
// by findColumnByPattern.
func buildExtractQueryMssql(q ExtractionQuery) (string, error) {
	// Resolve each JSON rule to the concrete column it applies to, keeping
	// the configured order of the rules per column.
	jsonByColumn := make(map[string][]config.FromJsonItem, len(q.FromJsonColumns))
	for _, rule := range q.FromJsonColumns {
		resolved, err := findColumnByPattern(q.Columns, rule.Column)
		if err != nil {
			return "", err
		}
		jsonByColumn[resolved] = append(jsonByColumn[resolved], rule)
	}

	projection := make([]string, 0, len(q.Columns)+len(q.FromJsonColumns))
	switch {
	case len(q.Columns) > 0:
		for _, col := range q.Columns {
			name := col.Name()
			if rules, isJSON := jsonByColumn[name]; isJSON {
				for _, rule := range rules {
					projection = append(projection,
						fmt.Sprintf("JSON_VALUE([%s], '%s') AS [%s]", name, buildJsonPathMssql(rule.Field), name))
				}
				continue
			}
			if col.Type() == "GEOMETRY" {
				projection = append(projection, fmt.Sprintf("[%s].STAsBinary() AS [%s]", name, name))
			} else {
				projection = append(projection, fmt.Sprintf("[%s]", name))
			}
		}
	case len(q.FromJsonColumns) == 0:
		projection = append(projection, "*")
	}

	var conditions []string
	if q.LowerLimit.IsValid {
		op := ">"
		if q.LowerLimit.IsInclusive {
			op = ">="
		}
		conditions = append(conditions, fmt.Sprintf("[%s] %s @min", q.PrimaryKey, op))
	}
	if q.UpperLimit.IsValid {
		op := "<"
		if q.UpperLimit.IsInclusive {
			op = "<="
		}
		conditions = append(conditions, fmt.Sprintf("[%s] %s @max", q.PrimaryKey, op))
	}

	var sb strings.Builder
	sb.WriteString("SELECT ")
	sb.WriteString(strings.Join(projection, ", "))
	fmt.Fprintf(&sb, " FROM [%s].[%s]", q.Schema, q.Table)
	if len(conditions) > 0 {
		sb.WriteString(" WHERE ")
		sb.WriteString(strings.Join(conditions, " AND "))
	}
	fmt.Fprintf(&sb, " ORDER BY [%s] ASC", q.PrimaryKey)

	return sb.String(), nil
}
// findColumnByPattern returns the name of the first column matching pattern.
//
// A pattern ending in "*" matches any column whose name starts with the text
// before the asterisk; any other pattern must equal the column name exactly
// (case-sensitive). An empty pattern, or a pattern that matches nothing,
// yields an error that quotes the pattern.
func findColumnByPattern(columns []models.ColumnType, pattern string) (string, error) {
	if pattern == "" {
		return "", fmt.Errorf("column pattern cannot be empty")
	}

	prefix, isWildcard := strings.CutSuffix(pattern, "*")
	matches := func(name string) bool {
		if isWildcard {
			return strings.HasPrefix(name, prefix)
		}
		return name == pattern
	}

	for _, col := range columns {
		if name := col.Name(); matches(name) {
			return name, nil
		}
	}

	if isWildcard {
		return "", fmt.Errorf("no column found matching pattern '%s'", pattern)
	}
	return "", fmt.Errorf("column '%s' not found in table columns", pattern)
}
// QueryFromObject builds the extraction statement for q and runs it through
// mw.Query. Valid limits are passed as the named parameters "min" and "max",
// matching the @min/@max placeholders emitted by buildExtractQueryMssql.
// A query-building error is returned as-is with a nil result.
func (mw *mssqlDbWrapper) QueryFromObject(ctx context.Context, q ExtractionQuery) (RowsResult, error) {
	statement, err := buildExtractQueryMssql(q)
	if err != nil {
		return nil, err
	}

	bounds := []struct {
		param string
		limit ExtractorQueryLimit
	}{
		{"min", q.LowerLimit},
		{"max", q.UpperLimit},
	}

	var args []any
	for _, b := range bounds {
		if b.limit.IsValid {
			args = append(args, sql.Named(b.param, b.limit.Value))
		}
	}

	return mw.Query(ctx, statement, args...)
}
// buildJsonPathMssql turns a configured field path into a JSON path rooted at
// "$.". A single leading dot on field is dropped, so ".a.b" and "a.b" both
// produce "$.a.b"; an empty field produces "$.".
func buildJsonPathMssql(field string) string {
	const root = "$."
	if field != "" && field[0] == '.' {
		return root + field[1:]
	}
	return root + field
}

View File

@@ -1,396 +0,0 @@
package dbwrapper
import (
"strings"
"testing"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
)
// TestBuildExtractQueryMssql_NoJsonColumns checks the plain case: no JSON
// rules and no limits. The query must list the bracketed columns, select
// from [schema].[table] and order by the primary key.
func TestBuildExtractQueryMssql_NoJsonColumns(t *testing.T) {
q := ExtractionQuery{
Schema: "dbo",
Table: "Users",
PrimaryKey: "ID",
Columns: []models.ColumnType{
models.NewColumnType("ID", false, false, "INT", "int", "INT", false, 0, 0, 0),
models.NewColumnType("Name", true, false, "VARCHAR", "varchar", "VARCHAR", true, 255, 0, 0),
},
FromJsonColumns: []config.FromJsonItem{},
LowerLimit: ExtractorQueryLimit{IsValid: false},
UpperLimit: ExtractorQueryLimit{IsValid: false},
}
query, err := buildExtractQueryMssql(q)
if err != nil {
t.Fatalf("Expected no error, got: %v", err)
}
if !strings.Contains(query, "SELECT [ID], [Name]") {
t.Errorf("Expected columns in query, got: %s", query)
}
if !strings.Contains(query, "FROM [dbo].[Users]") {
t.Errorf("Expected FROM clause, got: %s", query)
}
if !strings.Contains(query, "ORDER BY [ID] ASC") {
t.Errorf("Expected ORDER BY clause, got: %s", query)
}
}
// TestBuildExtractQueryMssql_WithJsonColumns_ExactColumnMatch checks that two
// JSON rules on the same exactly-named column replace that column, in rule
// order, with JSON_VALUE expressions that are both aliased to the column's
// own name.
func TestBuildExtractQueryMssql_WithJsonColumns_ExactColumnMatch(t *testing.T) {
// Test that the actual column name is used as alias, not a generated one
q := ExtractionQuery{
Schema: "dbo",
Table: "Events",
PrimaryKey: "EventID",
Columns: []models.ColumnType{
models.NewColumnType("EventID", false, false, "INT", "int", "INT", false, 0, 0, 0),
models.NewColumnType("EventData", true, false, "VARCHAR", "varchar", "VARCHAR", true, 500, 0, 0),
},
FromJsonColumns: []config.FromJsonItem{
{Column: "EventData", Field: ".userId"},
{Column: "EventData", Field: ".timestamp"},
},
LowerLimit: ExtractorQueryLimit{IsValid: false},
UpperLimit: ExtractorQueryLimit{IsValid: false},
}
query, err := buildExtractQueryMssql(q)
if err != nil {
t.Fatalf("Expected no error, got: %v", err)
}
if !strings.HasPrefix(query, "SELECT [EventID], JSON_VALUE([EventData], '$.userId') AS [EventData], JSON_VALUE([EventData], '$.timestamp') AS [EventData]") {
t.Errorf("Expected JSON columns to replace EventData in-order, got: %s", query)
}
if strings.Contains(query, "SELECT [EventID], [EventData]") {
t.Errorf("Expected EventData to be replaced by JSON extraction, got: %s", query)
}
// Alias should be exactly "EventData", not "EventData_userId"
if !strings.Contains(query, "JSON_VALUE([EventData], '$.userId') AS [EventData]") {
t.Errorf("Expected JSON alias to be [EventData], got: %s", query)
}
if !strings.Contains(query, "JSON_VALUE([EventData], '$.timestamp') AS [EventData]") {
t.Errorf("Expected JSON alias to be [EventData], got: %s", query)
}
// Should have comma separating them
if !strings.Contains(query, "JSON_VALUE([EventData], '$.userId') AS [EventData], JSON_VALUE([EventData], '$.timestamp') AS [EventData]") {
t.Errorf("Expected comma-separated JSON values, got: %s", query)
}
}
// TestBuildExtractQueryMssql_WithWildcardPattern checks that a JSON rule whose
// column ends in "*" is resolved to the real column by prefix, and that the
// real column name is used both as JSON_VALUE source and as alias.
func TestBuildExtractQueryMssql_WithWildcardPattern(t *testing.T) {
// Test that wildcard pattern matching finds the correct column
q := ExtractionQuery{
Schema: "dbo",
Table: "Events",
PrimaryKey: "ID",
Columns: []models.ColumnType{
models.NewColumnType("ID", false, false, "INT", "int", "INT", false, 0, 0, 0),
models.NewColumnType("NodeMetadata", true, false, "VARCHAR", "varchar", "VARCHAR", true, 500, 0, 0),
},
FromJsonColumns: []config.FromJsonItem{
{Column: "NodeMeta*", Field: ".id"},
},
LowerLimit: ExtractorQueryLimit{IsValid: false},
UpperLimit: ExtractorQueryLimit{IsValid: false},
}
query, err := buildExtractQueryMssql(q)
if err != nil {
t.Fatalf("Expected no error, got: %v", err)
}
// Should find "NodeMetadata" from pattern "NodeMeta*" and use it as alias
if !strings.Contains(query, "JSON_VALUE([NodeMetadata], '$.id') AS [NodeMetadata]") {
t.Errorf("Expected to find and use NodeMetadata column by pattern, got: %s", query)
}
if strings.Contains(query, "SELECT [ID], [NodeMetadata]") {
t.Errorf("Expected NodeMetadata to be replaced by JSON extraction, got: %s", query)
}
}
// TestBuildExtractQueryMssql_ColumnNotFound_Error checks that a JSON rule
// naming a column absent from Columns makes the builder fail, and that the
// error text mentions the missing column.
func TestBuildExtractQueryMssql_ColumnNotFound_Error(t *testing.T) {
// Test that an error is returned when column is not found
q := ExtractionQuery{
Schema: "dbo",
Table: "Events",
PrimaryKey: "ID",
Columns: []models.ColumnType{
models.NewColumnType("ID", false, false, "INT", "int", "INT", false, 0, 0, 0),
},
FromJsonColumns: []config.FromJsonItem{
{Column: "NonExistentColumn", Field: ".id"},
},
LowerLimit: ExtractorQueryLimit{IsValid: false},
UpperLimit: ExtractorQueryLimit{IsValid: false},
}
query, err := buildExtractQueryMssql(q)
if err == nil {
t.Fatalf("Expected error for missing column, got no error. Query: %s", query)
}
if !strings.Contains(err.Error(), "NonExistentColumn") {
t.Errorf("Expected error message to contain column name, got: %v", err)
}
}
// TestBuildExtractQueryMssql_WildcardPatternNotMatched_Error checks that a
// wildcard JSON rule matching no column makes the builder fail, and that the
// error text contains the pattern, asterisk included.
func TestBuildExtractQueryMssql_WildcardPatternNotMatched_Error(t *testing.T) {
// Test that an error is returned when wildcard pattern doesn't match any column
q := ExtractionQuery{
Schema: "dbo",
Table: "Events",
PrimaryKey: "ID",
Columns: []models.ColumnType{
models.NewColumnType("ID", false, false, "INT", "int", "INT", false, 0, 0, 0),
models.NewColumnType("EventData", true, false, "VARCHAR", "varchar", "VARCHAR", true, 500, 0, 0),
},
FromJsonColumns: []config.FromJsonItem{
{Column: "NonMatching*", Field: ".id"},
},
LowerLimit: ExtractorQueryLimit{IsValid: false},
UpperLimit: ExtractorQueryLimit{IsValid: false},
}
query, err := buildExtractQueryMssql(q)
if err == nil {
t.Fatalf("Expected error for non-matching wildcard pattern, got no error. Query: %s", query)
}
if !strings.Contains(err.Error(), "NonMatching*") {
t.Errorf("Expected error message to contain pattern, got: %v", err)
}
}
// TestBuildExtractQueryMssql_NestedJsonFields checks that dotted field paths
// (".user.name", ".user.email") are carried into the JSON path unchanged
// after the "$" root, and that the raw column is no longer selected.
func TestBuildExtractQueryMssql_NestedJsonFields(t *testing.T) {
q := ExtractionQuery{
Schema: "dbo",
Table: "Data",
PrimaryKey: "ID",
Columns: []models.ColumnType{
models.NewColumnType("ID", false, false, "INT", "int", "INT", false, 0, 0, 0),
models.NewColumnType("NodeData", true, false, "VARCHAR", "varchar", "VARCHAR", true, 500, 0, 0),
},
FromJsonColumns: []config.FromJsonItem{
{Column: "NodeData", Field: ".user.name"},
{Column: "NodeData", Field: ".user.email"},
},
LowerLimit: ExtractorQueryLimit{IsValid: false},
UpperLimit: ExtractorQueryLimit{IsValid: false},
}
query, err := buildExtractQueryMssql(q)
if err != nil {
t.Fatalf("Expected no error, got: %v", err)
}
if !strings.Contains(query, "JSON_VALUE([NodeData], '$.user.name') AS [NodeData]") {
t.Errorf("Expected nested JSON path for user.name, got: %s", query)
}
if !strings.Contains(query, "JSON_VALUE([NodeData], '$.user.email') AS [NodeData]") {
t.Errorf("Expected nested JSON path for user.email, got: %s", query)
}
if strings.Contains(query, "SELECT [ID], [NodeData]") {
t.Errorf("Expected NodeData to be replaced by JSON extraction, got: %s", query)
}
}
// TestBuildExtractQueryMssql_WithRangeLimits checks the WHERE clause for an
// inclusive lower limit (>= @min) combined with an exclusive upper limit
// (< @max), together with a JSON rule on another column.
func TestBuildExtractQueryMssql_WithRangeLimits(t *testing.T) {
q := ExtractionQuery{
Schema: "dbo",
Table: "Products",
PrimaryKey: "ProductID",
Columns: []models.ColumnType{
models.NewColumnType("ProductID", false, false, "INT", "int", "INT", false, 0, 0, 0),
models.NewColumnType("Details", true, false, "VARCHAR", "varchar", "VARCHAR", true, 500, 0, 0),
},
FromJsonColumns: []config.FromJsonItem{
{Column: "Details", Field: ".price"},
},
LowerLimit: ExtractorQueryLimit{IsValid: true, IsInclusive: true, Value: 100},
UpperLimit: ExtractorQueryLimit{IsValid: true, IsInclusive: false, Value: 500},
}
query, err := buildExtractQueryMssql(q)
if err != nil {
t.Fatalf("Expected no error, got: %v", err)
}
if !strings.Contains(query, "WHERE [ProductID] >= @min") {
t.Errorf("Expected WHERE clause with >=, got: %s", query)
}
if !strings.Contains(query, "[ProductID] < @max") {
t.Errorf("Expected upper limit with <, got: %s", query)
}
if !strings.Contains(query, "JSON_VALUE([Details], '$.price') AS [Details]") {
t.Errorf("Expected JSON_VALUE for Details, got: %s", query)
}
if strings.Contains(query, "SELECT [ProductID], [Details]") {
t.Errorf("Expected Details to be replaced by JSON extraction, got: %s", query)
}
}
// TestBuildJsonPathMssql is a table test for buildJsonPathMssql: inputs with
// and without a leading dot must yield the same "$."-rooted path, and the
// empty input yields "$.".
func TestBuildJsonPathMssql(t *testing.T) {
tests := []struct {
input string
expected string
}{
{".id", "$.id"},
{"id", "$.id"},
{".user.name", "$.user.name"},
{"user.name", "$.user.name"},
{".location.coordinates.lat", "$.location.coordinates.lat"},
{"", "$."},
}
for _, tt := range tests {
result := buildJsonPathMssql(tt.input)
if result != tt.expected {
t.Errorf("buildJsonPathMssql(%q) = %q, want %q", tt.input, result, tt.expected)
}
}
}
// TestFindColumnByPattern_ExactMatch checks that a pattern without wildcard
// returns the column whose name equals it.
func TestFindColumnByPattern_ExactMatch(t *testing.T) {
columns := []models.ColumnType{
models.NewColumnType("ID", false, false, "INT", "int", "INT", false, 0, 0, 0),
models.NewColumnType("Metadata", true, false, "VARCHAR", "varchar", "VARCHAR", true, 500, 0, 0),
models.NewColumnType("EventData", true, false, "VARCHAR", "varchar", "VARCHAR", true, 500, 0, 0),
}
result, err := findColumnByPattern(columns, "Metadata")
if err != nil {
t.Fatalf("Expected no error, got: %v", err)
}
if result != "Metadata" {
t.Errorf("Expected 'Metadata', got '%s'", result)
}
}
// TestFindColumnByPattern_WildcardMatch checks that a trailing-asterisk
// pattern returns the full name of the column starting with the prefix.
func TestFindColumnByPattern_WildcardMatch(t *testing.T) {
columns := []models.ColumnType{
models.NewColumnType("ID", false, false, "INT", "int", "INT", false, 0, 0, 0),
models.NewColumnType("NodeMetadata", true, false, "VARCHAR", "varchar", "VARCHAR", true, 500, 0, 0),
models.NewColumnType("EventData", true, false, "VARCHAR", "varchar", "VARCHAR", true, 500, 0, 0),
}
result, err := findColumnByPattern(columns, "NodeMeta*")
if err != nil {
t.Fatalf("Expected no error, got: %v", err)
}
if result != "NodeMetadata" {
t.Errorf("Expected 'NodeMetadata', got '%s'", result)
}
}
// TestFindColumnByPattern_NotFound checks that an exact pattern with no
// matching column produces an error mentioning the requested name.
func TestFindColumnByPattern_NotFound(t *testing.T) {
columns := []models.ColumnType{
models.NewColumnType("ID", false, false, "INT", "int", "INT", false, 0, 0, 0),
models.NewColumnType("Metadata", true, false, "VARCHAR", "varchar", "VARCHAR", true, 500, 0, 0),
}
result, err := findColumnByPattern(columns, "NonExistent")
if err == nil {
t.Fatalf("Expected error, got no error. Result: %s", result)
}
if !strings.Contains(err.Error(), "NonExistent") {
t.Errorf("Expected error to contain column name, got: %v", err)
}
}
// TestFindColumnByPattern_WildcardNotFound checks that a wildcard pattern
// with no matching column produces an error mentioning the pattern.
func TestFindColumnByPattern_WildcardNotFound(t *testing.T) {
columns := []models.ColumnType{
models.NewColumnType("ID", false, false, "INT", "int", "INT", false, 0, 0, 0),
models.NewColumnType("Metadata", true, false, "VARCHAR", "varchar", "VARCHAR", true, 500, 0, 0),
}
result, err := findColumnByPattern(columns, "Event*")
if err == nil {
t.Fatalf("Expected error, got no error. Result: %s", result)
}
if !strings.Contains(err.Error(), "Event*") {
t.Errorf("Expected error to contain pattern, got: %v", err)
}
}
// TestBuildExtractQueryMssql_OnlyJsonColumns checks a table whose only
// non-key column is read through a JSON rule: the key stays a plain column
// and the JSON column appears solely as a JSON_VALUE expression.
func TestBuildExtractQueryMssql_OnlyJsonColumns(t *testing.T) {
// Test when every non-key column is read via JSON extraction
q := ExtractionQuery{
Schema: "dbo",
Table: "Data",
PrimaryKey: "ID",
Columns: []models.ColumnType{
models.NewColumnType("ID", false, false, "INT", "int", "INT", false, 0, 0, 0),
models.NewColumnType("JsonData", true, false, "VARCHAR", "varchar", "VARCHAR", true, 500, 0, 0),
},
FromJsonColumns: []config.FromJsonItem{
{Column: "JsonData", Field: ".field1"},
},
LowerLimit: ExtractorQueryLimit{IsValid: false},
UpperLimit: ExtractorQueryLimit{IsValid: false},
}
query, err := buildExtractQueryMssql(q)
if err != nil {
t.Fatalf("Expected no error, got: %v", err)
}
if !strings.HasPrefix(query, "SELECT [ID], JSON_VALUE([JsonData], '$.field1') AS [JsonData]") {
t.Errorf("Expected JsonData to be replaced by JSON extraction, got: %s", query)
}
if strings.Contains(query, "SELECT [ID], [JsonData]") {
t.Errorf("Expected JsonData to be excluded from raw selection, got: %s", query)
}
}
// TestBuildExtractQueryMssql_JsonColumnsReplaceInOrder pins the complete
// generated statement: plain columns keep their position and each JSON-backed
// column is replaced in place, following the order of Columns.
func TestBuildExtractQueryMssql_JsonColumnsReplaceInOrder(t *testing.T) {
q := ExtractionQuery{
Schema: "dbo",
Table: "Users",
PrimaryKey: "UserID",
Columns: []models.ColumnType{
models.NewColumnType("UserID", false, false, "INT", "int", "INT", false, 0, 0, 0),
models.NewColumnType("Name", true, false, "VARCHAR", "varchar", "VARCHAR", false, 255, 0, 0),
models.NewColumnType("Email", true, false, "VARCHAR", "varchar", "VARCHAR", false, 255, 0, 0),
models.NewColumnType("Metadata", true, false, "NVARCHAR", "nvarchar", "NVARCHAR", true, 4000, 0, 0),
models.NewColumnType("Profile", true, false, "NVARCHAR", "nvarchar", "NVARCHAR", true, 4000, 0, 0),
models.NewColumnType("Settings", true, false, "NVARCHAR", "nvarchar", "NVARCHAR", true, 4000, 0, 0),
},
FromJsonColumns: []config.FromJsonItem{
{Column: "Metadata", Field: ".id"},
{Column: "Profile", Field: ".id"},
{Column: "Settings", Field: ".id"},
},
LowerLimit: ExtractorQueryLimit{IsValid: false},
UpperLimit: ExtractorQueryLimit{IsValid: false},
}
query, err := buildExtractQueryMssql(q)
if err != nil {
t.Fatalf("Expected no error, got: %v", err)
}
expected := "SELECT [UserID], [Name], [Email], JSON_VALUE([Metadata], '$.id') AS [Metadata], JSON_VALUE([Profile], '$.id') AS [Profile], JSON_VALUE([Settings], '$.id') AS [Settings] FROM [dbo].[Users] ORDER BY [UserID] ASC"
if query != expected {
t.Errorf("Unexpected query.\nExpected: %s\nGot: %s", expected, query)
}
}

View File

@@ -3,17 +3,14 @@ package dbwrapper
import (
"context"
"errors"
"fmt"
"strings"
dbdialects "git.ksdemosapps.com/kylesoda/go-migrate/internal/app/db-wrapper/db_dialects"
"github.com/jackc/pgx/v5"
"github.com/jackc/pgx/v5/pgxpool"
)
func init() {
Register(dbdialects.Postgres, func() DbWrapper {
return &postgresDbWrapper{dialect: dbdialects.Postgres}
Register("postgres", func() DbWrapper {
return &postgresDbWrapper{dialect: "postgres"}
})
}
@@ -129,75 +126,3 @@ func (pw *postgresDbWrapper) SaveMassive(ctx context.Context, schema string, tab
return affectedRows, nil
}
// QueryFromObject renders and executes the PostgreSQL extraction statement
// for q.
//
// The projection is "*" when q.Columns is empty; otherwise each column is
// double-quoted, with GEOMETRY columns wrapped in ST_AsEWKB and aliased to
// their own name. Valid limits become comparisons of q.PrimaryKey against
// positional parameters ($1, then $2) in lower/upper order, and the rows are
// ordered by q.PrimaryKey ascending.
func (pw *postgresDbWrapper) QueryFromObject(ctx context.Context, q ExtractionQuery) (RowsResult, error) {
	projection := "*"
	if len(q.Columns) > 0 {
		exprs := make([]string, len(q.Columns))
		for i, col := range q.Columns {
			if col.Type() == "GEOMETRY" {
				exprs[i] = fmt.Sprintf(`ST_AsEWKB("%s") AS "%s"`, col.Name(), col.Name())
			} else {
				exprs[i] = fmt.Sprintf(`"%s"`, col.Name())
			}
		}
		projection = strings.Join(exprs, ", ")
	}

	// Each valid limit appends its value to args first, so len(args) is the
	// positional index of the placeholder that refers to it.
	var (
		conditions []string
		args       []any
	)
	if q.LowerLimit.IsValid {
		op := ">"
		if q.LowerLimit.IsInclusive {
			op = ">="
		}
		args = append(args, q.LowerLimit.Value)
		conditions = append(conditions, fmt.Sprintf(`"%s" %s $%d`, q.PrimaryKey, op, len(args)))
	}
	if q.UpperLimit.IsValid {
		op := "<"
		if q.UpperLimit.IsInclusive {
			op = "<="
		}
		args = append(args, q.UpperLimit.Value)
		conditions = append(conditions, fmt.Sprintf(`"%s" %s $%d`, q.PrimaryKey, op, len(args)))
	}

	var sb strings.Builder
	sb.WriteString("SELECT ")
	sb.WriteString(projection)
	fmt.Fprintf(&sb, ` FROM "%s"."%s"`, q.Schema, q.Table)
	if len(conditions) > 0 {
		sb.WriteString(" WHERE ")
		sb.WriteString(strings.Join(conditions, " AND "))
	}
	fmt.Fprintf(&sb, ` ORDER BY "%s" ASC`, q.PrimaryKey)

	return pw.Query(ctx, sb.String(), args...)
}

View File

@@ -3,9 +3,6 @@ package dbwrapper
import (
"context"
"errors"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
)
// MethodNotSupported is a package-level sentinel error value.
// NOTE(review): presumably returned by wrapper methods a given driver does
// not implement — confirm against the DbWrapper implementations.
var MethodNotSupported error = errors.New("Method not supported by driver... yet :P")
@@ -27,22 +24,6 @@ type RowResult interface {
Scan(dest ...any) error
}
// ExtractorQueryLimit is one optional bound on the primary key of an
// ExtractionQuery. The query builders ignore the bound unless IsValid is
// true; IsInclusive selects >= / <= over > / <; Value is bound as the query
// parameter.
type ExtractorQueryLimit struct {
IsValid bool
IsInclusive bool
Value int64
}
// ExtractionQuery is the dialect-neutral description of an extraction SELECT
// that QueryFromObject implementations turn into SQL.
// Schema and Table name the source object; PrimaryKey is used for the range
// filter and the ORDER BY; an empty Columns selects "*"; LowerLimit and
// UpperLimit optionally bound PrimaryKey; FromJsonColumns lists columns read
// through a JSON field extraction (used by the SQL Server builder).
type ExtractionQuery struct {
Schema string
Table string
PrimaryKey string
Columns []models.ColumnType
LowerLimit ExtractorQueryLimit
UpperLimit ExtractorQueryLimit
FromJsonColumns []config.FromJsonItem
}
type DbWrapper interface {
Close() error
Connect(ctx context.Context, dbUrl string) error
@@ -51,5 +32,4 @@ type DbWrapper interface {
Query(ctx context.Context, query string, args ...any) (RowsResult, error)
QueryRow(ctx context.Context, query string, args ...any) RowResult
SaveMassive(ctx context.Context, schema string, table string, columnNames []string, rows [][]any) (int64, error)
QueryFromObject(ctx context.Context, query ExtractionQuery) (RowsResult, error)
}

View File

@@ -1,108 +0,0 @@
package extractors
import (
"context"
"errors"
"slices"
"strings"
"sync"
"sync/atomic"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/custom_errors"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
"github.com/sirupsen/logrus"
)
// Consume is an extractor worker loop. It resolves the primary-key column
// index once (case-insensitive), then processes partitions received on
// chPartitionsIn through ProcessPartitionWithRetries until the channel is
// closed or ctx is cancelled.
//
// For every processed partition, successful or not, wgActivePartitions.Done
// is called and the number of rows read is added atomically to rowsRead.
// A failed partition increments failedPartitionsCount and is reported on
// chErrorsOut; once the counter exceeds retryConfig.MaxFailedPartitions a
// cancelling JobError is sent and the worker stops. A missing primary key is
// reported as a cancelling JobError before any partition is read.
func (ex *GenericExtractor) Consume(
	ctx context.Context,
	tableInfo config.SourceTableInfo,
	columns []models.ColumnType,
	batchSize int,
	retryConfig config.RetryConfig,
	chPartitionsIn <-chan models.Partition,
	chBatchesOut chan<- models.Batch,
	chErrorsOut chan<- custom_errors.JobError,
	wgActivePartitions *sync.WaitGroup,
	rowsRead *int64,
	failedPartitionsCount *int32,
	fromJsonColumns []config.FromJsonItem,
) {
	// emit forwards one job error; false means ctx was cancelled first.
	emit := func(jobErr custom_errors.JobError) bool {
		select {
		case <-ctx.Done():
			return false
		case chErrorsOut <- jobErr:
			return true
		}
	}

	indexPrimaryKey := slices.IndexFunc(columns, func(col models.ColumnType) bool {
		return strings.EqualFold(col.Name(), tableInfo.PrimaryKey)
	})
	if indexPrimaryKey == -1 {
		emit(custom_errors.JobError{
			ShouldCancelJob: true,
			Msg:             "Primary key not found in provided columns",
		})
		return
	}

	for {
		if ctx.Err() != nil {
			return
		}

		select {
		case <-ctx.Done():
			return

		case partition, open := <-chPartitionsIn:
			if !open {
				return
			}

			partitionRows, err := ex.ProcessPartitionWithRetries(
				ctx,
				tableInfo,
				columns,
				batchSize,
				partition,
				indexPrimaryKey,
				retryConfig,
				chBatchesOut,
				fromJsonColumns,
			)
			wgActivePartitions.Done()

			if partitionRows > 0 {
				// The logged "current" value is the total before this
				// partition's rows are added.
				current := atomic.LoadInt64(rowsRead)
				logrus.Debugf("Rows read (partition extracted): +%v [current=%v] (%s.%s)", partitionRows, current, tableInfo.Schema, tableInfo.Table)
				atomic.AddInt64(rowsRead, int64(partitionRows))
			}

			if err == nil {
				continue
			}

			atomic.AddInt32(failedPartitionsCount, 1)

			var report custom_errors.JobError
			if typed, isJobError := errors.AsType[*custom_errors.JobError](err); isJobError {
				report = *typed
			} else {
				report = custom_errors.JobError{ShouldCancelJob: false, Msg: err.Error(), Prev: err}
			}
			if !emit(report) {
				return
			}

			if atomic.LoadInt32(failedPartitionsCount) > int32(retryConfig.MaxFailedPartitions) {
				emit(custom_errors.JobError{ShouldCancelJob: true, Msg: "Max failed partitions reached"})
				return
			}
		}
	}
}

View File

@@ -1,41 +0,0 @@
package extractors
import (
"context"
dbwrapper "git.ksdemosapps.com/kylesoda/go-migrate/internal/app/db-wrapper"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
"github.com/google/uuid"
)
// GenericExtractor reads source data through a dbwrapper.DbWrapper.
type GenericExtractor struct {
db dbwrapper.DbWrapper
}
// NewExtractor returns a GenericExtractor (by value) that uses db.
func NewExtractor(db dbwrapper.DbWrapper) GenericExtractor {
return GenericExtractor{db: db}
}
// sendBatch delivers batch on chBatchesOut, blocking until the send succeeds
// or ctx is cancelled. It returns nil after a successful send and ctx.Err()
// when cancellation wins.
func sendBatch(ctx context.Context, chBatchesOut chan<- models.Batch, batch models.Batch) error {
	select {
	case <-ctx.Done():
		return ctx.Err()
	case chBatchesOut <- batch:
		return nil
	}
}
// flush wraps batchRows in a new models.Batch with a fresh id and sends it on
// chBatchesOut. It is a no-op returning nil when batchRows is empty, and it
// returns ctx.Err() when ctx is cancelled before the send completes.
//
// The sent batch keeps a reference to batchRows, so the caller must start a
// new slice afterwards instead of appending to the one it passed in. flush
// cannot do that for the caller: slices are passed by value, which is why the
// previous in-function reassignment of batchRows had no effect and was
// removed. batchSize is unused and kept only so existing call sites compile.
func flush(
	ctx context.Context,
	batchSize int,
	batchRows []models.UnknownRowValues,
	chBatchesOut chan<- models.Batch,
) error {
	if len(batchRows) == 0 {
		return nil
	}

	return sendBatch(ctx, chBatchesOut, models.Batch{Id: uuid.New(), Rows: batchRows})
}

View File

@@ -0,0 +1,277 @@
package extractors
import (
"context"
"database/sql"
"errors"
"fmt"
"slices"
"strings"
"sync"
"sync/atomic"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/convert"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/custom_errors"
dbwrapper "git.ksdemosapps.com/kylesoda/go-migrate/internal/app/db-wrapper"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/etl"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
"github.com/google/uuid"
)
// MssqlExtractor extracts partitions from a SQL Server source through a
// dbwrapper.DbWrapper.
type MssqlExtractor struct {
db dbwrapper.DbWrapper
}
// NewMssqlExtractor returns a *MssqlExtractor using db, typed as etl.Extractor.
func NewMssqlExtractor(db dbwrapper.DbWrapper) etl.Extractor {
return &MssqlExtractor{db: db}
}
// buildExtractQueryMssql renders the T-SQL SELECT used to read one partition.
//
// The projection is "*" when columns is empty, otherwise the bracketed column
// names in order (GEOMETRY columns are selected as-is; no STAsBinary()
// conversion is applied). With includeRange the primary key is filtered
// between the named parameters @min and @max: the lower comparison is >= or >
// depending on isMinInclusive, the upper one is always <=. Rows are ordered
// by the primary key ascending.
func buildExtractQueryMssql(
	tableInfo config.SourceTableInfo,
	columns []models.ColumnType,
	includeRange bool,
	isMinInclusive bool,
) string {
	projection := "*"
	if len(columns) > 0 {
		quoted := make([]string, len(columns))
		for i, col := range columns {
			quoted[i] = "[" + col.Name() + "]"
		}
		projection = strings.Join(quoted, ", ")
	}

	var sb strings.Builder
	sb.WriteString("SELECT ")
	sb.WriteString(projection)
	fmt.Fprintf(&sb, " FROM [%s].[%s]", tableInfo.Schema, tableInfo.Table)

	if includeRange {
		lowerOp := ">"
		if isMinInclusive {
			lowerOp = ">="
		}
		fmt.Fprintf(&sb, " WHERE [%s] %s @min AND [%s] <= @max", tableInfo.PrimaryKey, lowerOp, tableInfo.PrimaryKey)
	}

	fmt.Fprintf(&sb, " ORDER BY [%s] ASC", tableInfo.PrimaryKey)
	return sb.String()
}
// errorFromLastRow converts previousError into an ExtractorError that records
// how far the partition got, using the primary-key value of lastRow.
//
// When the value at indexPrimaryKey converts to int64, the error carries it
// as LastId with HasLastId set, so a retry can resume after that row.
//
// When the index is out of range or the value cannot be converted, no resume
// point exists: HasLastId is false (previously it was true with a zero
// LastId, which would make a retry resume from id 0) and the returned
// partition copy has RetryCounter forced to 3.
// NOTE(review): the literal 3 is presumably meant to reach the configured
// retry limit so the partition is not retried; that only holds while the
// configured attempts are <= 3 — confirm and consider passing the limit in.
func errorFromLastRow(
	lastRow models.UnknownRowValues,
	indexPrimaryKey int,
	partition models.Partition,
	previousError error,
) *custom_errors.ExtractorError {
	if indexPrimaryKey >= 0 && indexPrimaryKey < len(lastRow) {
		if lastId, ok := convert.ToInt64(lastRow[indexPrimaryKey]); ok {
			return &custom_errors.ExtractorError{
				Partition: partition,
				HasLastId: true,
				LastId:    lastId,
				Msg:       previousError.Error(),
			}
		}
	}

	currentPartition := partition
	currentPartition.RetryCounter = 3
	return &custom_errors.ExtractorError{
		Partition: currentPartition,
		HasLastId: false,
		Msg:       fmt.Sprintf("Couldn't cast last id value as int: %s", previousError.Error()),
	}
}
// ProcessPartition reads one partition from SQL Server and streams its rows
// to chBatchesOut in batches of at most batchSize rows.
//
// It returns the number of rows read together with:
//   - nil when the whole partition was read and every batch was sent;
//   - ctx.Err() when ctx is cancelled while sending a batch, or when the row
//     iteration error is the context error;
//   - *custom_errors.ExtractorError for query, scan and iteration failures.
//
// On a scan or iteration failure, rows already read but not yet sent are
// flushed first, and the error carries the primary key of the last row read
// (via errorFromLastRow) whenever at least one row was read. This keeps the
// resume point consistent with what was handed downstream. Previously an
// iteration error dropped the pending rows while still reporting their last
// id, and a scan error right after a full batch reported no last id at all.
func (mssqlEx *MssqlExtractor) ProcessPartition(
	ctx context.Context,
	tableInfo config.SourceTableInfo,
	columns []models.ColumnType,
	batchSize int,
	partition models.Partition,
	indexPrimaryKey int,
	chBatchesOut chan<- models.Batch,
) (int, error) {
	query := buildExtractQueryMssql(tableInfo, columns, partition.HasRange, partition.Range.IsMinInclusive)

	var queryArgs []any
	if partition.HasRange {
		queryArgs = append(queryArgs,
			sql.Named("min", partition.Range.Min),
			sql.Named("max", partition.Range.Max),
		)
	}

	rowsRead := 0

	rows, err := mssqlEx.db.Query(ctx, query, queryArgs...)
	if err != nil {
		return rowsRead, &custom_errors.ExtractorError{Partition: partition, HasLastId: false, Msg: err.Error()}
	}
	defer rows.Close()

	batchRows := make([]models.UnknownRowValues, 0, batchSize)
	// lastRow is the most recent row read, whether or not its batch has
	// already been sent.
	var lastRow models.UnknownRowValues

	// send delivers one batch or gives up when ctx is cancelled.
	send := func(pending []models.UnknownRowValues) error {
		select {
		case chBatchesOut <- models.Batch{Id: uuid.New(), PartitionId: partition.Id, Rows: pending, RetryCounter: 0}:
			return nil
		case <-ctx.Done():
			return ctx.Err()
		}
	}

	// fail flushes the pending rows and builds the extractor error for cause.
	fail := func(cause error) error {
		if len(batchRows) > 0 {
			if sendErr := send(batchRows); sendErr != nil {
				return sendErr
			}
		}
		if rowsRead == 0 {
			return &custom_errors.ExtractorError{Partition: partition, HasLastId: false, Msg: cause.Error()}
		}
		return errorFromLastRow(lastRow, indexPrimaryKey, partition, cause)
	}

	for rows.Next() {
		rowValues := make([]any, len(columns))
		scanArgs := make([]any, len(columns))
		for i := range rowValues {
			scanArgs[i] = &rowValues[i]
		}

		if err := rows.Scan(scanArgs...); err != nil {
			return rowsRead, fail(err)
		}

		rowsRead++
		batchRows = append(batchRows, rowValues)
		lastRow = batchRows[len(batchRows)-1]

		if len(batchRows) >= batchSize {
			if err := send(batchRows); err != nil {
				return rowsRead, err
			}
			// The sent batch owns the old slice; start a new one.
			batchRows = make([]models.UnknownRowValues, 0, batchSize)
		}
	}

	if err := rows.Err(); err != nil {
		if ctxErr := ctx.Err(); ctxErr != nil && errors.Is(err, ctxErr) {
			return rowsRead, ctxErr
		}
		return rowsRead, fail(err)
	}

	if len(batchRows) > 0 {
		if err := send(batchRows); err != nil {
			return rowsRead, err
		}
	}

	return rowsRead, nil
}
// Exec is the SQL Server extractor worker loop. It resolves the primary-key
// column index once (case-insensitive) and then runs ProcessPartition for
// each partition received on chPartitionsIn until the channel is closed or
// ctx is cancelled.
//
// Rows read are added atomically to rowsRead. wgActivePartitions.Done is
// called only for partitions that complete without error. A failed partition
// is handed over instead: ExtractorError values go to chErrorsOut, JobError
// values go to chJobErrorsOut, and any other error is wrapped in an
// ExtractorError for chErrorsOut. A missing primary key is reported as a
// cancelling JobError before any partition is read.
func (mssqlEx *MssqlExtractor) Exec(
	ctx context.Context,
	tableInfo config.SourceTableInfo,
	columns []models.ColumnType,
	batchSize int,
	chPartitionsIn <-chan models.Partition,
	chBatchesOut chan<- models.Batch,
	chErrorsOut chan<- custom_errors.ExtractorError,
	chJobErrorsOut chan<- custom_errors.JobError,
	wgActivePartitions *sync.WaitGroup,
	rowsRead *int64,
) {
	// Both helpers return false when ctx is cancelled before the send.
	sendExtractorErr := func(e custom_errors.ExtractorError) bool {
		select {
		case <-ctx.Done():
			return false
		case chErrorsOut <- e:
			return true
		}
	}
	sendJobErr := func(e custom_errors.JobError) bool {
		select {
		case <-ctx.Done():
			return false
		case chJobErrorsOut <- e:
			return true
		}
	}

	indexPrimaryKey := slices.IndexFunc(columns, func(col models.ColumnType) bool {
		return strings.EqualFold(col.Name(), tableInfo.PrimaryKey)
	})
	if indexPrimaryKey == -1 {
		sendJobErr(custom_errors.JobError{
			ShouldCancelJob: true,
			Msg:             "Primary key not found in provided columns",
		})
		return
	}

	for {
		if ctx.Err() != nil {
			return
		}

		select {
		case <-ctx.Done():
			return

		case partition, open := <-chPartitionsIn:
			if !open {
				return
			}

			partitionRows, err := mssqlEx.ProcessPartition(
				ctx,
				tableInfo,
				columns,
				batchSize,
				partition,
				indexPrimaryKey,
				chBatchesOut,
			)
			if partitionRows > 0 {
				atomic.AddInt64(rowsRead, int64(partitionRows))
			}

			if err == nil {
				wgActivePartitions.Done()
				continue
			}

			var (
				exError  *custom_errors.ExtractorError
				jobError *custom_errors.JobError
				sent     bool
			)
			switch {
			case errors.As(err, &exError):
				sent = sendExtractorErr(*exError)
			case errors.As(err, &jobError):
				sent = sendJobErr(*jobError)
			default:
				sent = sendExtractorErr(custom_errors.ExtractorError{Partition: partition, Msg: err.Error()})
			}
			if !sent {
				return
			}
		}
	}
}

View File

@@ -0,0 +1,125 @@
package extractors
import (
"context"
"errors"
"fmt"
"strings"
"sync"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/custom_errors"
dbwrapper "git.ksdemosapps.com/kylesoda/go-migrate/internal/app/db-wrapper"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/etl"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
"github.com/google/uuid"
)
// PostgresExtractor extracts data from a PostgreSQL source through a
// dbwrapper.DbWrapper.
type PostgresExtractor struct {
db dbwrapper.DbWrapper
}
// NewPostgresExtractor returns a *PostgresExtractor using db, typed as
// etl.Extractor.
func NewPostgresExtractor(db dbwrapper.DbWrapper) etl.Extractor {
return &PostgresExtractor{db: db}
}
// buildExtractQueryPostgres renders the full-table SELECT for a PostgreSQL
// source. The projection is "*" when columns is empty; otherwise every column
// is double-quoted, and GEOMETRY columns are wrapped as
// ST_AsEWKB("name") AS "name". Rows are ordered by the primary key ascending;
// no range filter is generated.
func buildExtractQueryPostgres(sourceDbInfo config.SourceTableInfo, columns []models.ColumnType) string {
	projection := "*"
	if len(columns) > 0 {
		exprs := make([]string, 0, len(columns))
		for _, col := range columns {
			quoted := `"` + col.Name() + `"`
			if col.Type() == "GEOMETRY" {
				quoted = `ST_AsEWKB(` + quoted + `) AS ` + quoted
			}
			exprs = append(exprs, quoted)
		}
		projection = strings.Join(exprs, ", ")
	}

	return fmt.Sprintf(
		`SELECT %s FROM "%s"."%s" ORDER BY "%s" ASC`,
		projection,
		sourceDbInfo.Schema,
		sourceDbInfo.Table,
		sourceDbInfo.PrimaryKey,
	)
}
// ProcessPartition reads the whole source table from PostgreSQL and streams
// its rows to chBatchesOut in batches of at most batchSize rows.
//
// Ranged partitions are rejected up front ("Batch config not yet supported").
// It returns the number of rows read together with:
//   - nil when every row was read and every batch was sent;
//   - ctx.Err() when ctx is cancelled while a batch is being sent, including
//     the final partial batch (previously that case returned nil, reporting
//     success for a batch that was never delivered);
//   - *custom_errors.ExtractorError when the query itself fails;
//   - an error wrapping the driver error when reading a row fails.
//
// indexPrimaryKey is currently unused by this implementation.
func (postgresEx *PostgresExtractor) ProcessPartition(
	ctx context.Context,
	tableInfo config.SourceTableInfo,
	columns []models.ColumnType,
	batchSize int,
	partition models.Partition,
	indexPrimaryKey int,
	chBatchesOut chan<- models.Batch,
) (int, error) {
	if partition.HasRange {
		return 0, errors.New("Batch config not yet supported")
	}

	query := buildExtractQueryPostgres(tableInfo, columns)
	rowsRead := 0

	rows, err := postgresEx.db.Query(ctx, query)
	if err != nil {
		return rowsRead, &custom_errors.ExtractorError{Partition: partition, HasLastId: false, Msg: err.Error()}
	}
	defer rows.Close()

	// send delivers one batch or gives up when ctx is cancelled.
	send := func(pending []models.UnknownRowValues) error {
		select {
		case chBatchesOut <- models.Batch{Id: uuid.New(), PartitionId: partition.Id, Rows: pending, RetryCounter: 0}:
			return nil
		case <-ctx.Done():
			return ctx.Err()
		}
	}

	batchRows := make([]models.UnknownRowValues, 0, batchSize)
	for rows.Next() {
		values, err := rows.Values()
		if err != nil {
			// Keep the driver error in the chain instead of discarding it.
			return rowsRead, fmt.Errorf("Unexpected error reading rows from source: %w", err)
		}

		rowsRead++
		batchRows = append(batchRows, values)

		if len(batchRows) >= batchSize {
			if err := send(batchRows); err != nil {
				return rowsRead, err
			}
			// The sent batch owns the old slice; start a new one.
			batchRows = make([]models.UnknownRowValues, 0, batchSize)
		}
	}

	if err := rows.Err(); err != nil {
		return rowsRead, fmt.Errorf("Unexpected error reading rows from source: %w", err)
	}

	if len(batchRows) > 0 {
		if err := send(batchRows); err != nil {
			return rowsRead, err
		}
	}

	return rowsRead, nil
}
// Exec currently does nothing: its body is empty, so no partition is read
// from chPartitionsIn and nothing is written to chBatchesOut, chErrorsOut or
// chJobErrorsOut, and neither wgActivePartitions nor rowsRead is touched.
// NOTE(review): presumably a placeholder for the partition-consuming loop — confirm.
func (postgresEx *PostgresExtractor) Exec(
ctx context.Context,
tableInfo config.SourceTableInfo,
columns []models.ColumnType,
batchSize int,
chPartitionsIn <-chan models.Partition,
chBatchesOut chan<- models.Batch,
chErrorsOut chan<- custom_errors.ExtractorError,
chJobErrorsOut chan<- custom_errors.JobError,
wgActivePartitions *sync.WaitGroup,
rowsRead *int64,
) {
}

View File

@@ -1,77 +0,0 @@
package extractors
import (
"context"
"errors"
"fmt"
"time"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/custom_errors"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
"github.com/google/uuid"
// "github.com/sirupsen/logrus"
)
// ProcessPartitionWithRetries runs ProcessPartition for partition and retries
// it, with a backoff delay between attempts, for as long as the failure is a
// *custom_errors.ExtractorError.
//
// Any other error is returned unchanged. Once the retry counter reaches
// retryConfig.Attempts a *custom_errors.JobError wrapping the last failure is
// returned. If ctx is cancelled while waiting for the next attempt, ctx.Err()
// is returned. The first result is the total number of rows read across all
// attempts.
func (ex *GenericExtractor) ProcessPartitionWithRetries(
	ctx context.Context,
	tableInfo config.SourceTableInfo,
	columns []models.ColumnType,
	batchSize int,
	partition models.Partition,
	indexPrimaryKey int,
	retryConfig config.RetryConfig,
	chBatchesOut chan<- models.Batch,
	fromJsonColumns []config.FromJsonItem,
) (int64, error) {
	var totalRowsRead int64
	currentPartition := partition

	for {
		rowsRead, err := ex.ProcessPartition(
			ctx,
			tableInfo,
			columns,
			batchSize,
			currentPartition,
			indexPrimaryKey,
			chBatchesOut,
			fromJsonColumns,
		)
		totalRowsRead += rowsRead
		if err == nil {
			return totalRowsRead, nil
		}

		exError, ok := errors.AsType[*custom_errors.ExtractorError](err)
		if !ok {
			return totalRowsRead, err
		}

		currentPartition.RetryCounter++
		if currentPartition.RetryCounter >= retryConfig.Attempts {
			return totalRowsRead, &custom_errors.JobError{
				Msg:  fmt.Sprintf("Partition %v reached max retries (%d)", currentPartition.Id, currentPartition.RetryCounter),
				Prev: err,
			}
		}

		if exError.HasLastId {
			// Resume right after the last row that was read. The retry is
			// tracked as a child partition of the one that failed.
			currentPartition.ParentId = exError.Partition.Id
			currentPartition.Id = uuid.New()
			// ProcessPartition only applies Range.Min when HasRange is set;
			// without this an unranged partition would be re-read from the
			// start and rows already delivered would be extracted again.
			currentPartition.HasRange = true
			currentPartition.Range.Min = exError.LastId
			currentPartition.Range.IsMinInclusive = false
		}

		delay := custom_errors.ComputeBackoffDelay(
			currentPartition.RetryCounter,
			retryConfig.BaseDelayMs,
			retryConfig.MaxDelayMs,
			retryConfig.MaxJitterMs,
		)
		// Wait for the backoff, but stop immediately if the job is cancelled.
		timer := time.NewTimer(delay)
		select {
		case <-ctx.Done():
			timer.Stop()
			return totalRowsRead, ctx.Err()
		case <-timer.C:
		}
	}
}

View File

@@ -1,127 +0,0 @@
package extractors
import (
"context"
"fmt"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/convert"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/custom_errors"
dbwrapper "git.ksdemosapps.com/kylesoda/go-migrate/internal/app/db-wrapper"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
// "github.com/sirupsen/logrus"
)
// errorFromLastPartitionRow builds the *custom_errors.ExtractorError reported
// when extraction of partition stops after lastRow was read.
//
// The value at lastRow[indexPrimaryKey] is converted with convert.ToInt64 and
// stored as LastId. When the conversion fails, LastId is left at its zero
// value, the message is prefixed accordingly, and the partition copy stored in
// the error gets RetryCounter set to 3.
// NOTE(review): the literal 3 looks like an attempt to mark the partition as
// out of retries — confirm against the retry logic.
func errorFromLastPartitionRow(
	lastRow models.UnknownRowValues,
	indexPrimaryKey int,
	partition models.Partition,
	previousError error,
) error {
	result := &custom_errors.ExtractorError{
		Partition: partition,
		HasLastId: true,
	}

	if lastId, converted := convert.ToInt64(lastRow[indexPrimaryKey]); converted {
		result.LastId = lastId
		result.Msg = previousError.Error()
		return result
	}

	result.Partition.RetryCounter = 3
	result.Msg = fmt.Sprintf("Couldn't cast last id value as int: %s", previousError.Error())
	return result
}
// ProcessPartition queries the rows of partition and forwards them to
// chBatchesOut through flush, in groups of batchSize rows.
//
// A range bound is applied only when partition.HasRange is set and the bound
// is greater than zero. The first result is the number of rows scanned
// successfully. When reading fails after at least one row was scanned, the
// error is built by errorFromLastPartitionRow from the last scanned row so the
// caller can resume after it; otherwise the underlying error is returned
// as-is.
func (ex *GenericExtractor) ProcessPartition(
	ctx context.Context,
	tableInfo config.SourceTableInfo,
	columns []models.ColumnType,
	batchSize int,
	partition models.Partition,
	indexPrimaryKey int,
	chBatchesOut chan<- models.Batch,
	fromJsonColumns []config.FromJsonItem,
) (int64, error) {
	query := dbwrapper.ExtractionQuery{
		Schema:     tableInfo.Schema,
		Table:      tableInfo.Table,
		PrimaryKey: tableInfo.PrimaryKey,
		Columns:    columns,
		LowerLimit: dbwrapper.ExtractorQueryLimit{
			IsValid:     partition.HasRange && partition.Range.Min > 0,
			IsInclusive: partition.Range.IsMinInclusive,
			Value:       partition.Range.Min,
		},
		UpperLimit: dbwrapper.ExtractorQueryLimit{
			IsValid:     partition.HasRange && partition.Range.Max > 0,
			IsInclusive: partition.Range.IsMaxInclusive,
			Value:       partition.Range.Max,
		},
		FromJsonColumns: fromJsonColumns,
	}

	rows, err := ex.db.QueryFromObject(ctx, query)
	if err != nil {
		return 0, err
	}
	defer rows.Close()

	batchRows := make([]models.UnknownRowValues, 0, batchSize)
	var rowsRead int64
	// lastRow is the most recent row scanned successfully, kept across batch
	// boundaries so a failure can always report where to resume.
	var lastRow models.UnknownRowValues

	for rows.Next() {
		rowValues := make([]any, len(columns))
		scanArgs := make([]any, len(columns))
		for i := range rowValues {
			scanArgs[i] = &rowValues[i]
		}

		if err := rows.Scan(scanArgs...); err != nil {
			// Deliver what was already read before reporting the failure.
			if len(batchRows) > 0 {
				if flushErr := flush(ctx, batchSize, batchRows, chBatchesOut); flushErr != nil {
					return rowsRead, flushErr
				}
			}
			if lastRow == nil {
				return rowsRead, err
			}
			// Same reporting as the rows.Err() path below: even when the
			// buffer is empty because a batch was just flushed, the resume
			// point is preserved.
			return rowsRead, errorFromLastPartitionRow(lastRow, indexPrimaryKey, partition, err)
		}

		rowsRead++
		lastRow = rowValues
		batchRows = append(batchRows, rowValues)

		if len(batchRows) >= batchSize {
			if err := flush(ctx, batchSize, batchRows, chBatchesOut); err != nil {
				return rowsRead, err
			}
			batchRows = make([]models.UnknownRowValues, 0, batchSize)
		}
	}

	if err := flush(ctx, batchSize, batchRows, chBatchesOut); err != nil {
		return rowsRead, err
	}

	if err := rows.Err(); err != nil {
		if lastRow != nil {
			return rowsRead, errorFromLastPartitionRow(lastRow, indexPrimaryKey, partition, err)
		}
		return rowsRead, err
	}

	return rowsRead, nil
}

View File

@@ -1,153 +0,0 @@
package loaders
import (
"context"
"errors"
"sync"
"sync/atomic"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/custom_errors"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
"github.com/google/uuid"
"github.com/sirupsen/logrus"
)
// loaderAccumulator collects the rows of several incoming batches so they can
// be loaded together.
type loaderAccumulator struct {
// batchSize is the row count at which ready reports true.
batchSize int
// rows holds the rows of every batch added so far.
rows []models.UnknownRowValues
// parents references the batches whose rows were added, in arrival order.
parents []models.BatchRef
// pendingDone counts added batches; drainPending calls Done once per count.
pendingDone int
}
// add appends the rows of batch to the accumulator, records a reference to
// the batch and increments the pending counter.
func (a *loaderAccumulator) add(batch models.Batch) {
	a.pendingDone++
	a.parents = append(a.parents, models.BatchRef{Id: batch.Id})
	a.rows = append(a.rows, batch.Rows...)
}
// ready reports whether at least batchSize rows have been accumulated.
func (a *loaderAccumulator) ready() bool {
	buffered := len(a.rows)
	return buffered >= a.batchSize
}
// drainPending calls wg.Done once for every batch still counted in
// pendingDone. The counter itself is left unchanged.
func (a *loaderAccumulator) drainPending(wg *sync.WaitGroup) {
	for remaining := a.pendingDone; remaining > 0; remaining-- {
		wg.Done()
	}
}
// sendLoadError counts a failed load and reports err on chErrorsOut.
//
// A *custom_errors.JobError found in err is forwarded as-is; any other error
// is wrapped in a JobError with ShouldCancelJob=false. When the failure count
// then exceeds retryConfig.MaxFailedBatchesLoad, a second JobError with
// ShouldCancelJob=true is sent.
//
// It returns true when the caller may keep consuming, and false when ctx was
// done before the error could be sent or when the failure limit was exceeded.
func sendLoadError(
	ctx context.Context,
	err error,
	retryConfig config.RetryConfig,
	failedBatchesCount *int32,
	chErrorsOut chan<- custom_errors.JobError,
) bool {
	atomic.AddInt32(failedBatchesCount, 1)

	var report custom_errors.JobError
	switch typed, isJobErr := errors.AsType[*custom_errors.JobError](err); {
	case isJobErr:
		report = *typed
	default:
		report = custom_errors.JobError{ShouldCancelJob: false, Msg: err.Error(), Prev: err}
	}

	select {
	case chErrorsOut <- report:
	case <-ctx.Done():
		return false
	}

	limit := int32(retryConfig.MaxFailedBatchesLoad)
	if atomic.LoadInt32(failedBatchesCount) <= limit {
		return true
	}

	// Limit exceeded: ask for the job to be cancelled, unless it already is.
	select {
	case chErrorsOut <- custom_errors.JobError{ShouldCancelJob: true, Msg: "Max failed batches (load) reached"}:
	case <-ctx.Done():
	}
	return false
}
// Consume reads batches from chBatchesIn and loads them into the target table
// until the channel is closed, ctx is done, or sendLoadError asks to stop.
//
// With batchSize <= 0 every incoming batch is loaded on its own. Otherwise
// batches are accumulated and loaded as one combined batch once at least
// batchSize rows are buffered, and once more when the channel closes.
//
// wgActiveBatches.Done is called once per consumed batch: right after its load
// attempt, or on return for batches that were accumulated but never loaded.
// Successfully loaded rows are added to rowsLoaded; failures go through
// sendLoadError.
func (gl *GenericLoader) Consume(
	ctx context.Context,
	tableInfo config.TargetTableInfo,
	columns []models.ColumnType,
	retryConfig config.RetryConfig,
	batchSize int,
	chBatchesIn <-chan models.Batch,
	chErrorsOut chan<- custom_errors.JobError,
	wgActiveBatches *sync.WaitGroup,
	rowsLoaded *int64,
	failedBatchesCount *int32,
) {
	colNames := mapSlice(columns, func(col models.ColumnType) string {
		return col.Name()
	})

	acc := &loaderAccumulator{batchSize: batchSize}
	// Release every batch that is still buffered when the loop exits early.
	defer acc.drainPending(wgActiveBatches)

	// flushAccumulated loads everything buffered in acc as one batch and
	// reports whether consumption should continue.
	flushAccumulated := func() bool {
		if len(acc.rows) == 0 {
			return true
		}

		parentCount := len(acc.parents)
		combined := models.Batch{
			Id:            uuid.New(),
			ParentBatches: acc.parents,
			Rows:          acc.rows,
		}
		processedRows, err := gl.ProcessBatchWithRetries(ctx, tableInfo, colNames, retryConfig, combined)

		// The parent batches are finished whether or not the load succeeded.
		for i := 0; i < parentCount; i++ {
			wgActiveBatches.Done()
		}
		acc.pendingDone -= parentCount
		acc.rows, acc.parents = nil, nil

		if err != nil {
			return sendLoadError(ctx, err, retryConfig, failedBatchesCount, chErrorsOut)
		}

		current := atomic.LoadInt64(rowsLoaded)
		logrus.Debugf("Rows loaded (batch loaded): +%v [current=%v] (%s.%s)", processedRows, current, tableInfo.Schema, tableInfo.Table)
		atomic.AddInt64(rowsLoaded, int64(processedRows))
		return true
	}

	// loadDirect loads a single batch without accumulation and reports
	// whether consumption should continue.
	loadDirect := func(batch models.Batch) bool {
		processedRows, err := gl.ProcessBatchWithRetries(ctx, tableInfo, colNames, retryConfig, batch)
		wgActiveBatches.Done()
		if err != nil {
			return sendLoadError(ctx, err, retryConfig, failedBatchesCount, chErrorsOut)
		}

		current := atomic.LoadInt64(rowsLoaded)
		logrus.Debugf("Rows loaded: +%v [current=%v] (%s.%s)", processedRows, current, tableInfo.Schema, tableInfo.Table)
		atomic.AddInt64(rowsLoaded, int64(processedRows))
		return true
	}

	for {
		select {
		case <-ctx.Done():
			return
		case batch, ok := <-chBatchesIn:
			switch {
			case !ok:
				flushAccumulated()
				return
			case batchSize <= 0:
				if !loadDirect(batch) {
					return
				}
			default:
				acc.add(batch)
				if acc.ready() && !flushAccumulated() {
					return
				}
			}
		}
	}
}

View File

@@ -1,603 +0,0 @@
package loaders
import (
"context"
"errors"
"sync"
"sync/atomic"
"testing"
"time"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/custom_errors"
dbwrapper "git.ksdemosapps.com/kylesoda/go-migrate/internal/app/db-wrapper"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
"github.com/google/uuid"
)
// testTimeout bounds how long a test waits on a channel before failing.
const testTimeout = 2 * time.Second
// mockResult scripts the outcome of one SaveMassive call on mockDbWrapper.
type mockResult struct {
// err, when non-nil, is returned by the corresponding call.
err error
}
// mockDbWrapper is a test double whose SaveMassive outcomes are scripted per
// call; its other methods are inert stubs.
type mockDbWrapper struct {
// mu guards callCount inside SaveMassive.
mu sync.Mutex
// callCount is the number of SaveMassive calls made so far.
callCount int
// results[i] scripts the i-th SaveMassive call; calls past the end succeed.
results []mockResult
}
// newMockDb returns a mock whose SaveMassive calls follow results in order.
func newMockDb(results ...mockResult) *mockDbWrapper {
	db := new(mockDbWrapper)
	db.results = results
	return db
}
// SaveMassive records the call and returns the scripted error for it, if any.
// Calls without a scripted error report len(rows) rows saved.
func (m *mockDbWrapper) SaveMassive(_ context.Context, _ string, _ string, _ []string, rows [][]any) (int64, error) {
	m.mu.Lock()
	defer m.mu.Unlock()

	call := m.callCount
	m.callCount = call + 1

	if call >= len(m.results) {
		return int64(len(rows)), nil
	}
	if scripted := m.results[call].err; scripted != nil {
		return 0, scripted
	}
	return int64(len(rows)), nil
}
// Close is a stub that always returns nil.
func (m *mockDbWrapper) Close() error { return nil }
// Connect is a stub that ignores its arguments and always returns nil.
func (m *mockDbWrapper) Connect(_ context.Context, _ string) error { return nil }
// Exec is a stub that returns a zero ExecResult and no error.
func (m *mockDbWrapper) Exec(_ context.Context, _ string, _ ...any) (dbwrapper.ExecResult, error) {
return dbwrapper.ExecResult{}, nil
}
// GetDialect is a stub that returns the empty string.
func (m *mockDbWrapper) GetDialect() string { return "" }
// Query is a stub that returns a nil result and no error.
func (m *mockDbWrapper) Query(_ context.Context, _ string, _ ...any) (dbwrapper.RowsResult, error) {
return nil, nil
}
// QueryRow is a stub that returns nil.
func (m *mockDbWrapper) QueryRow(_ context.Context, _ string, _ ...any) dbwrapper.RowResult {
return nil
}
// QueryFromObject is a stub that returns a nil result and no error.
func (m *mockDbWrapper) QueryFromObject(_ context.Context, _ dbwrapper.ExtractionQuery) (dbwrapper.RowsResult, error) {
return nil, nil
}
// makeBatch builds a batch with a fresh id and numRows single-column rows,
// where row i holds the value i.
func makeBatch(numRows int) models.Batch {
	rows := make([]models.UnknownRowValues, 0, numRows)
	for i := 0; i < numRows; i++ {
		rows = append(rows, models.UnknownRowValues{i})
	}
	return models.Batch{Id: uuid.New(), Rows: rows}
}
// newLoader returns a GenericLoader backed by the given mock database.
func newLoader(db *mockDbWrapper) GenericLoader {
	var loader GenericLoader
	loader.db = db
	return loader
}
// rc returns a retry configuration with a single attempt and the given limit
// of failed batch loads.
func rc(maxFailed int) config.RetryConfig {
	cfg := config.RetryConfig{Attempts: 1}
	cfg.MaxFailedBatchesLoad = maxFailed
	return cfg
}
// sendBatch registers one unit on wg and then sends batch on chIn.
func sendBatch(chIn chan<- models.Batch, batch models.Batch, wg *sync.WaitGroup) {
// Add before sending so the counter is already raised when the receiver gets the batch.
wg.Add(1)
chIn <- batch
}
// runConsume starts gl.Consume in a goroutine, with an empty target table and
// no columns, and returns a channel that is closed once Consume returns.
func runConsume(
	ctx context.Context,
	gl GenericLoader,
	retryConfig config.RetryConfig,
	batchSize int,
	chIn <-chan models.Batch,
	chErr chan<- custom_errors.JobError,
	wg *sync.WaitGroup,
	rowsLoaded *int64,
	failedCount *int32,
) <-chan struct{} {
	finished := make(chan struct{})
	go func() {
		var target config.TargetTableInfo
		gl.Consume(
			ctx, target, nil, retryConfig, batchSize,
			chIn, chErr, wg, rowsLoaded, failedCount,
		)
		close(finished)
	}()
	return finished
}
// waitWg returns a channel that is closed once wg.Wait returns, so a wait can
// be combined with a timeout in a select.
func waitWg(wg *sync.WaitGroup) <-chan struct{} {
	released := make(chan struct{})
	go func() {
		wg.Wait()
		close(released)
	}()
	return released
}
// dbError returns the error the tests use to simulate a failing database call.
func dbError() error {
	const msg = "connection reset by peer"
	return errors.New(msg)
}
func TestLoaderAccumulator_Add(t *testing.T) {
acc := &loaderAccumulator{batchSize: 5}
b1 := makeBatch(2)
b2 := makeBatch(3)
acc.add(b1)
acc.add(b2)
if len(acc.rows) != 5 {
t.Errorf("expected 5 rows, got %d", len(acc.rows))
}
if len(acc.parents) != 2 {
t.Fatalf("expected 2 parents, got %d", len(acc.parents))
}
if acc.parents[0].Id != b1.Id || acc.parents[1].Id != b2.Id {
t.Error("parent IDs do not match source batch IDs in order")
}
if acc.pendingDone != 2 {
t.Errorf("expected pendingDone=2, got %d", acc.pendingDone)
}
}
// TestLoaderAccumulator_Ready checks that ready flips exactly when the
// buffered row count reaches batchSize.
func TestLoaderAccumulator_Ready(t *testing.T) {
	acc := &loaderAccumulator{batchSize: 3}

	acc.add(makeBatch(2))
	if isReady := acc.ready(); isReady {
		t.Error("should not be ready with 2 rows and batchSize=3")
	}

	acc.add(makeBatch(1))
	if isReady := acc.ready(); !isReady {
		t.Error("should be ready with 3 rows and batchSize=3")
	}
}
// TestLoaderAccumulator_DrainPending_ReleasesWg checks that drainPending
// calls Done once per pending batch.
func TestLoaderAccumulator_DrainPending_ReleasesWg(t *testing.T) {
	var wg sync.WaitGroup
	wg.Add(3)

	acc := &loaderAccumulator{batchSize: 5, pendingDone: 3}
	acc.drainPending(&wg)

	released := waitWg(&wg)
	select {
	case <-time.After(testTimeout):
		t.Fatal("wg.Wait() timed out: drainPending did not call Done() enough times")
	case <-released:
	}
}
// TestLoaderAccumulator_DrainPending_ZeroPending checks that draining with
// nothing pending leaves an unused WaitGroup waitable.
func TestLoaderAccumulator_DrainPending_ZeroPending(t *testing.T) {
	var wg sync.WaitGroup

	acc := &loaderAccumulator{batchSize: 5, pendingDone: 0}
	acc.drainPending(&wg)

	released := waitWg(&wg)
	select {
	case <-time.After(testTimeout):
		t.Fatal("wg.Wait() timed out")
	case <-released:
	}
}
// TestSendLoadError_PlainError_WrappedAsNonFatal checks that a plain error is
// counted and forwarded as a non-fatal JobError.
func TestSendLoadError_PlainError_WrappedAsNonFatal(t *testing.T) {
	errs := make(chan custom_errors.JobError, 2)
	var failedCount int32

	keepGoing := sendLoadError(context.Background(), errors.New("db error"), rc(10), &failedCount, errs)
	if !keepGoing {
		t.Error("expected true (below threshold)")
	}
	if atomic.LoadInt32(&failedCount) != 1 {
		t.Errorf("expected failedCount=1, got %d", failedCount)
	}

	select {
	case reported := <-errs:
		if reported.ShouldCancelJob {
			t.Error("plain error should be wrapped as ShouldCancelJob=false")
		}
	default:
		t.Error("expected an error in the channel")
	}
}
// TestSendLoadError_JobError_PassesThrough checks that a JobError is
// forwarded without being rewrapped.
func TestSendLoadError_JobError_PassesThrough(t *testing.T) {
	errs := make(chan custom_errors.JobError, 2)
	var failedCount int32

	original := &custom_errors.JobError{ShouldCancelJob: false, Msg: "custom msg"}
	sendLoadError(context.Background(), original, rc(10), &failedCount, errs)

	select {
	case reported := <-errs:
		unchanged := reported.Msg == "custom msg" && !reported.ShouldCancelJob
		if !unchanged {
			t.Errorf("JobError should pass through unchanged, got %+v", reported)
		}
	default:
		t.Error("expected an error in the channel")
	}
}
// TestSendLoadError_FatalJobError_BelowThreshold_ReturnsTrue checks that a
// fatal JobError is forwarded as fatal while the call still returns true
// below the failure limit.
func TestSendLoadError_FatalJobError_BelowThreshold_ReturnsTrue(t *testing.T) {
	errs := make(chan custom_errors.JobError, 2)
	var failedCount int32

	fatal := &custom_errors.JobError{ShouldCancelJob: true, Msg: "unique constraint"}
	if keepGoing := sendLoadError(context.Background(), fatal, rc(10), &failedCount, errs); !keepGoing {
		t.Error("below-threshold fatal error should return true (external cancel expected from JobErrorHandler)")
	}

	select {
	case reported := <-errs:
		if !reported.ShouldCancelJob {
			t.Error("fatal JobError should be forwarded with ShouldCancelJob=true")
		}
	default:
		t.Error("expected the fatal error in the channel")
	}
}
// TestSendLoadError_ThresholdExceeded_ReturnsFalse checks that exceeding the
// failure limit returns false and emits the batch error followed by a fatal
// error.
func TestSendLoadError_ThresholdExceeded_ReturnsFalse(t *testing.T) {
	errs := make(chan custom_errors.JobError, 2)
	var failedCount int32

	if keepGoing := sendLoadError(context.Background(), errors.New("db error"), rc(0), &failedCount, errs); keepGoing {
		t.Error("expected false when threshold exceeded")
	}

	if queued := len(errs); queued != 2 {
		t.Fatalf("expected 2 errors (batch error + fatal threshold error), got %d", queued)
	}

	<-errs // the batch error comes first
	if second := <-errs; !second.ShouldCancelJob {
		t.Error("second error should be the fatal threshold error (ShouldCancelJob=true)")
	}
}
// TestSendLoadError_AtThresholdBoundary checks that with a limit of 2 only
// the third failure makes sendLoadError return false.
func TestSendLoadError_AtThresholdBoundary(t *testing.T) {
	errs := make(chan custom_errors.JobError, 6)
	var failedCount int32

	steps := []struct {
		wantContinue bool
		failure      string
	}{
		{true, "first failure: expected true (below threshold)"},
		{true, "second failure: expected true (at threshold, not exceeded)"},
		{false, "third failure: expected false (threshold exceeded)"},
	}

	for _, step := range steps {
		got := sendLoadError(context.Background(), errors.New("err"), rc(2), &failedCount, errs)
		if got != step.wantContinue {
			t.Error(step.failure)
		}
	}
}
// TestSendLoadError_ContextCancelled_ReturnsFalse checks that nothing is sent
// and false is returned when the context is already cancelled.
func TestSendLoadError_ContextCancelled_ReturnsFalse(t *testing.T) {
	// Unbuffered and never received from: a send can never succeed.
	errs := make(chan custom_errors.JobError)
	var failedCount int32

	ctx, cancel := context.WithCancel(context.Background())
	cancel()

	if keepGoing := sendLoadError(ctx, errors.New("db error"), rc(10), &failedCount, errs); keepGoing {
		t.Error("expected false when context is cancelled")
	}
	if queued := len(errs); queued != 0 {
		t.Error("no error should be sent when context is cancelled")
	}
}
// TestConsume_Passthrough_RowsLoaded checks that with batchSize 0 a single
// batch is loaded with one SaveMassive call and its rows are counted.
func TestConsume_Passthrough_RowsLoaded(t *testing.T) {
	var (
		wg          sync.WaitGroup
		rowsLoaded  int64
		failedCount int32
	)
	db := newMockDb()
	loader := newLoader(db)
	batches := make(chan models.Batch, 1)
	jobErrs := make(chan custom_errors.JobError, 1)

	sendBatch(batches, makeBatch(5), &wg)
	close(batches)

	finished := runConsume(context.Background(), loader, rc(0), 0, batches, jobErrs, &wg, &rowsLoaded, &failedCount)
	<-finished
	wg.Wait()

	if rowsLoaded != 5 {
		t.Errorf("expected rowsLoaded=5, got %d", rowsLoaded)
	}
	if db.callCount != 1 {
		t.Errorf("expected 1 SaveMassive call, got %d", db.callCount)
	}
}
func TestConsume_Passthrough_MultipleBatches_RowsAccumulate(t *testing.T) {
db := newMockDb()
gl := newLoader(db)
chIn := make(chan models.Batch, 3)
chErr := make(chan custom_errors.JobError, 1)
var wg sync.WaitGroup
var rowsLoaded int64
var failedCount int32
sendBatch(chIn, makeBatch(3), &wg)
sendBatch(chIn, makeBatch(2), &wg)
sendBatch(chIn, makeBatch(4), &wg)
close(chIn)
<-runConsume(context.Background(), gl, rc(10), 0, chIn, chErr, &wg, &rowsLoaded, &failedCount)
wg.Wait()
if rowsLoaded != 9 {
t.Errorf("expected rowsLoaded=9, got %d", rowsLoaded)
}
}
func TestConsume_Passthrough_WgDoneBeforeErrorHandling(t *testing.T) {
db := newMockDb(mockResult{err: dbError()})
gl := newLoader(db)
chIn := make(chan models.Batch, 1)
chErr := make(chan custom_errors.JobError, 2)
var wg sync.WaitGroup
var rowsLoaded int64
var failedCount int32
sendBatch(chIn, makeBatch(2), &wg)
close(chIn)
<-runConsume(context.Background(), gl, rc(10), 0, chIn, chErr, &wg, &rowsLoaded, &failedCount)
select {
case <-waitWg(&wg):
case <-time.After(testTimeout):
t.Fatal("wg.Wait() timed out: Done() was not called even though processing failed")
}
}
func TestConsume_Passthrough_NonFatalError_Continues(t *testing.T) {
db := newMockDb(mockResult{err: dbError()})
gl := newLoader(db)
chIn := make(chan models.Batch, 2)
chErr := make(chan custom_errors.JobError, 3)
var wg sync.WaitGroup
var rowsLoaded int64
var failedCount int32
sendBatch(chIn, makeBatch(2), &wg)
sendBatch(chIn, makeBatch(3), &wg)
close(chIn)
<-runConsume(context.Background(), gl, rc(10), 0, chIn, chErr, &wg, &rowsLoaded, &failedCount)
wg.Wait()
if rowsLoaded != 3 {
t.Errorf("expected rowsLoaded=3 (only second batch succeeded), got %d", rowsLoaded)
}
if atomic.LoadInt32(&failedCount) != 1 {
t.Errorf("expected failedCount=1, got %d", failedCount)
}
if len(chErr) == 0 {
t.Error("expected at least one error in chErr for the failed batch")
}
}
func TestConsume_Passthrough_ThresholdExceeded_Exits(t *testing.T) {
db := newMockDb(mockResult{err: dbError()})
gl := newLoader(db)
chIn := make(chan models.Batch, 1)
chErr := make(chan custom_errors.JobError, 3)
var wg sync.WaitGroup
var rowsLoaded int64
var failedCount int32
sendBatch(chIn, makeBatch(1), &wg)
done := runConsume(context.Background(), gl, rc(0), 0, chIn, chErr, &wg, &rowsLoaded, &failedCount)
select {
case <-done:
case <-time.After(testTimeout):
t.Fatal("Consume did not exit after threshold exceeded")
}
select {
case <-waitWg(&wg):
case <-time.After(testTimeout):
t.Fatal("wg.Wait() timed out after threshold exit")
}
}
// TestConsume_Accumulation_FlushOnThreshold checks that three one-row batches
// with batchSize 3 are loaded through a single SaveMassive call.
func TestConsume_Accumulation_FlushOnThreshold(t *testing.T) {
	var (
		wg          sync.WaitGroup
		rowsLoaded  int64
		failedCount int32
	)
	db := newMockDb()
	loader := newLoader(db)
	batches := make(chan models.Batch, 3)
	jobErrs := make(chan custom_errors.JobError, 1)

	for i := 0; i < 3; i++ {
		sendBatch(batches, makeBatch(1), &wg)
	}
	close(batches)

	finished := runConsume(context.Background(), loader, rc(0), 3, batches, jobErrs, &wg, &rowsLoaded, &failedCount)
	<-finished
	wg.Wait()

	if rowsLoaded != 3 {
		t.Errorf("expected rowsLoaded=3, got %d", rowsLoaded)
	}
	if db.callCount != 1 {
		t.Errorf("expected 1 SaveMassive call, got %d", db.callCount)
	}
}
// TestConsume_Accumulation_FlushOnClose checks that rows below the batchSize
// threshold are still loaded, in one call, when the input channel closes.
func TestConsume_Accumulation_FlushOnClose(t *testing.T) {
	var (
		wg          sync.WaitGroup
		rowsLoaded  int64
		failedCount int32
	)
	db := newMockDb()
	loader := newLoader(db)
	batches := make(chan models.Batch, 2)
	jobErrs := make(chan custom_errors.JobError, 1)

	sendBatch(batches, makeBatch(2), &wg)
	sendBatch(batches, makeBatch(3), &wg)
	close(batches)

	finished := runConsume(context.Background(), loader, rc(0), 10, batches, jobErrs, &wg, &rowsLoaded, &failedCount)
	<-finished
	wg.Wait()

	if rowsLoaded != 5 {
		t.Errorf("expected rowsLoaded=5, got %d", rowsLoaded)
	}
	if db.callCount != 1 {
		t.Errorf("expected exactly 1 SaveMassive call (single flush on close), got %d", db.callCount)
	}
}
// TestConsume_Accumulation_RowsLoadedCorrect checks row counting and the
// number of flushes when five two-row batches meet batchSize 4.
func TestConsume_Accumulation_RowsLoadedCorrect(t *testing.T) {
	var (
		wg          sync.WaitGroup
		rowsLoaded  int64
		failedCount int32
	)
	db := newMockDb()
	loader := newLoader(db)
	batches := make(chan models.Batch, 5)
	jobErrs := make(chan custom_errors.JobError, 1)

	for i := 0; i < 5; i++ {
		sendBatch(batches, makeBatch(2), &wg)
	}
	close(batches)

	finished := runConsume(context.Background(), loader, rc(0), 4, batches, jobErrs, &wg, &rowsLoaded, &failedCount)
	<-finished
	wg.Wait()

	if rowsLoaded != 10 {
		t.Errorf("expected rowsLoaded=10, got %d", rowsLoaded)
	}
	if db.callCount != 3 {
		t.Errorf("expected 3 SaveMassive calls (2 threshold flushes + 1 on close), got %d", db.callCount)
	}
}
func TestConsume_Accumulation_WgBalanced_OnContextCancel(t *testing.T) {
db := newMockDb()
gl := newLoader(db)
chIn := make(chan models.Batch)
chErr := make(chan custom_errors.JobError, 1)
var wg sync.WaitGroup
var rowsLoaded int64
var failedCount int32
ctx, cancel := context.WithCancel(context.Background())
done := runConsume(ctx, gl, rc(0), 10, chIn, chErr, &wg, &rowsLoaded, &failedCount)
sendBatch(chIn, makeBatch(1), &wg)
sendBatch(chIn, makeBatch(1), &wg)
cancel()
select {
case <-done:
case <-time.After(testTimeout):
t.Fatal("Consume did not exit after context cancellation")
}
select {
case <-waitWg(&wg):
case <-time.After(testTimeout):
t.Fatal("wg.Wait() timed out: drainPending did not release accumulated batches on cancel")
}
}
func TestConsume_Accumulation_ErrorInFlush_WgStillBalanced(t *testing.T) {
db := newMockDb(mockResult{err: dbError()})
gl := newLoader(db)
chIn := make(chan models.Batch, 2)
chErr := make(chan custom_errors.JobError, 3)
var wg sync.WaitGroup
var rowsLoaded int64
var failedCount int32
sendBatch(chIn, makeBatch(1), &wg)
sendBatch(chIn, makeBatch(1), &wg)
close(chIn)
<-runConsume(context.Background(), gl, rc(10), 2, chIn, chErr, &wg, &rowsLoaded, &failedCount)
select {
case <-waitWg(&wg):
case <-time.After(testTimeout):
t.Fatal("wg.Wait() timed out: wg.Done() not called after flush error")
}
}
func TestConsume_Accumulation_MultipleFlushes_NonFatalErrors(t *testing.T) {
db := newMockDb(mockResult{err: dbError()}, mockResult{err: dbError()})
gl := newLoader(db)
chIn := make(chan models.Batch, 4)
chErr := make(chan custom_errors.JobError, 6)
var wg sync.WaitGroup
var rowsLoaded int64
var failedCount int32
for range 4 {
sendBatch(chIn, makeBatch(1), &wg)
}
close(chIn)
<-runConsume(context.Background(), gl, rc(10), 2, chIn, chErr, &wg, &rowsLoaded, &failedCount)
select {
case <-waitWg(&wg):
case <-time.After(testTimeout):
t.Fatal("wg.Wait() timed out")
}
if atomic.LoadInt32(&failedCount) != 2 {
t.Errorf("expected failedCount=2, got %d", failedCount)
}
if rowsLoaded != 0 {
t.Errorf("expected rowsLoaded=0 (all batches failed), got %d", rowsLoaded)
}
}
// TestConsume_EmptyInput_NoProcessing checks that closing an empty input
// channel makes Consume return without touching the database.
func TestConsume_EmptyInput_NoProcessing(t *testing.T) {
	var (
		wg          sync.WaitGroup
		rowsLoaded  int64
		failedCount int32
	)
	db := newMockDb()
	loader := newLoader(db)
	batches := make(chan models.Batch)
	jobErrs := make(chan custom_errors.JobError, 1)

	close(batches)

	finished := runConsume(context.Background(), loader, rc(0), 5, batches, jobErrs, &wg, &rowsLoaded, &failedCount)
	select {
	case <-time.After(testTimeout):
		t.Fatal("Consume did not exit after empty input channel was closed")
	case <-finished:
	}

	if db.callCount != 0 {
		t.Errorf("expected no SaveMassive calls, got %d", db.callCount)
	}
	if rowsLoaded != 0 {
		t.Errorf("expected rowsLoaded=0, got %d", rowsLoaded)
	}
	wg.Wait()
}
func TestConsume_ContextCancellation_Exits(t *testing.T) {
db := newMockDb()
gl := newLoader(db)
chIn := make(chan models.Batch)
chErr := make(chan custom_errors.JobError, 1)
var wg sync.WaitGroup
var rowsLoaded int64
var failedCount int32
ctx, cancel := context.WithCancel(context.Background())
done := runConsume(ctx, gl, rc(0), 0, chIn, chErr, &wg, &rowsLoaded, &failedCount)
cancel()
select {
case <-done:
case <-time.After(testTimeout):
t.Fatal("Consume did not exit after context cancellation")
}
wg.Wait()
}

View File

@@ -1,13 +1,117 @@
package loaders
import (
"context"
"errors"
"fmt"
"sync"
"sync/atomic"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/custom_errors"
dbwrapper "git.ksdemosapps.com/kylesoda/go-migrate/internal/app/db-wrapper"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/etl"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
"github.com/jackc/pgx/v5/pgconn"
)
// GenericLoader writes batches to a target table through a DbWrapper.
type GenericLoader struct {
// db is the connection wrapper used for the bulk writes.
db dbwrapper.DbWrapper
}
func NewGenericLoader(db dbwrapper.DbWrapper) GenericLoader {
return GenericLoader{db: db}
func NewGenericLoader(db dbwrapper.DbWrapper) etl.Loader {
return &GenericLoader{db: db}
}
// ProcessBatch writes the rows of batch to the target table with SaveMassive
// and returns how many rows the batch contained.
//
// A PostgreSQL error with code 23505 (unique_violation) becomes a
// *custom_errors.JobError that asks for the job to be cancelled; every other
// failure becomes a *custom_errors.LoaderError carrying the batch.
func (gl *GenericLoader) ProcessBatch(
	ctx context.Context,
	tableInfo config.TargetTableInfo,
	colNames []string,
	batch models.Batch,
) (int, error) {
	if _, err := gl.db.SaveMassive(ctx, tableInfo.Schema, tableInfo.Table, colNames, batch.Rows); err != nil {
		var pgErr *pgconn.PgError
		isUniqueViolation := errors.As(err, &pgErr) && pgErr.Code == "23505"
		if !isUniqueViolation {
			return 0, &custom_errors.LoaderError{Batch: batch, Msg: err.Error()}
		}
		return 0, &custom_errors.JobError{
			ShouldCancelJob: true,
			Msg:             fmt.Sprintf("Fatal error in table %s.%s", tableInfo.Schema, tableInfo.Table),
			Prev:            err,
		}
	}
	return len(batch.Rows), nil
}
// Exec loads every batch received on chBatchesIn until the channel is closed
// or ctx is done.
//
// On success the batch is marked done on wgActiveBatches and its row count is
// added to rowsLoaded. On failure a LoaderError goes to chErrorsOut and a
// JobError to chJobErrorsOut; any other error is wrapped in a LoaderError.
// wgActiveBatches.Done is not called for a failed batch.
// NOTE(review): presumably the receiver of the error retries or releases the
// batch — confirm.
func (gl *GenericLoader) Exec(
	ctx context.Context,
	tableInfo config.TargetTableInfo,
	columns []models.ColumnType,
	chBatchesIn <-chan models.Batch,
	chErrorsOut chan<- custom_errors.LoaderError,
	chJobErrorsOut chan<- custom_errors.JobError,
	wgActiveBatches *sync.WaitGroup,
	rowsLoaded *int64,
) {
	colNames := mapSlice(columns, func(col models.ColumnType) string {
		return col.Name()
	})

	for ctx.Err() == nil {
		select {
		case <-ctx.Done():
			return

		case batch, ok := <-chBatchesIn:
			if !ok {
				return
			}

			processedRows, err := gl.ProcessBatch(ctx, tableInfo, colNames, batch)
			if err == nil {
				wgActiveBatches.Done()
				atomic.AddInt64(rowsLoaded, int64(processedRows))
				continue
			}

			var ldError *custom_errors.LoaderError
			var jobError *custom_errors.JobError
			switch {
			case errors.As(err, &ldError):
				select {
				case <-ctx.Done():
					return
				case chErrorsOut <- *ldError:
				}
			case errors.As(err, &jobError):
				select {
				case <-ctx.Done():
					return
				case chJobErrorsOut <- *jobError:
				}
			default:
				select {
				case <-ctx.Done():
					return
				case chErrorsOut <- custom_errors.LoaderError{Batch: batch, Msg: err.Error()}:
				}
			}
		}
	}
}

View File

@@ -1,49 +0,0 @@
package loaders
import (
"context"
"errors"
"fmt"
"time"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/custom_errors"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
)
// ProcessBatchWithRetries runs ProcessBatch for batch and retries it, with a
// backoff delay between attempts, for as long as the failure is a
// *custom_errors.LoaderError.
//
// Any other error is returned unchanged. Once the retry counter reaches
// retryConfig.Attempts a *custom_errors.JobError wrapping the last failure is
// returned. If ctx is cancelled while waiting for the next attempt, ctx.Err()
// is returned.
func (gl *GenericLoader) ProcessBatchWithRetries(
	ctx context.Context,
	tableInfo config.TargetTableInfo,
	colNames []string,
	retryConfig config.RetryConfig,
	batch models.Batch,
) (int64, error) {
	for {
		rowsLoaded, err := gl.ProcessBatch(ctx, tableInfo, colNames, batch)
		if err == nil {
			return rowsLoaded, nil
		}

		btError, ok := errors.AsType[*custom_errors.LoaderError](err)
		if !ok {
			return rowsLoaded, err
		}

		batch.RetryCounter++
		if batch.RetryCounter >= retryConfig.Attempts {
			return rowsLoaded, &custom_errors.JobError{
				Msg:  fmt.Sprintf("Batch %v reached max retries (%d)", batch.Id, batch.RetryCounter),
				Prev: btError,
			}
		}

		delay := custom_errors.ComputeBackoffDelay(
			batch.RetryCounter,
			retryConfig.BaseDelayMs,
			retryConfig.MaxDelayMs,
			retryConfig.MaxJitterMs,
		)
		// Wait for the backoff, but stop immediately if the job is cancelled.
		timer := time.NewTimer(delay)
		select {
		case <-ctx.Done():
			timer.Stop()
			return rowsLoaded, ctx.Err()
		case <-timer.C:
		}
	}
}

View File

@@ -1,43 +0,0 @@
package loaders
import (
"context"
"errors"
"fmt"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/custom_errors"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
"github.com/jackc/pgx/v5/pgconn"
)
// ProcessBatch writes the rows of batch to the target table with SaveMassive
// and returns how many rows the batch contained.
//
// A PostgreSQL error with code 23505 (unique_violation) becomes a
// *custom_errors.JobError that asks for the job to be cancelled; every other
// failure becomes a *custom_errors.LoaderError carrying the batch.
func (gl *GenericLoader) ProcessBatch(
	ctx context.Context,
	tableInfo config.TargetTableInfo,
	colNames []string,
	batch models.Batch,
) (int64, error) {
	_, saveErr := gl.db.SaveMassive(ctx, tableInfo.Schema, tableInfo.Table, colNames, batch.Rows)
	if saveErr == nil {
		return int64(len(batch.Rows)), nil
	}

	pgErr, isPgErr := errors.AsType[*pgconn.PgError](saveErr)
	if isPgErr && pgErr.Code == "23505" {
		return 0, &custom_errors.JobError{
			ShouldCancelJob: true,
			Msg:             fmt.Sprintf("Fatal error in table %s.%s", tableInfo.Schema, tableInfo.Table),
			Prev:            saveErr,
		}
	}
	return 0, &custom_errors.LoaderError{Batch: batch, Msg: saveErr.Error()}
}

View File

@@ -2,13 +2,11 @@ package table_analyzers
import (
"context"
"math"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/etl"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
"github.com/google/uuid"
"github.com/sirupsen/logrus"
)
func PartitionRangeGenerator(
@@ -16,118 +14,27 @@ func PartitionRangeGenerator(
tableAnalyzer etl.TableAnalyzer,
tableInfo config.TableInfo,
partitionColumn string,
partitionCalculationStrategy string,
rowsPerPartition int64,
jobRange config.RangeConfig,
) ([]models.Partition, error) {
rowsCount, err := tableAnalyzer.EstimateTotalRows(ctx, tableInfo)
logrus.Infof("Estimated rows in source: %v (%s.%s)", rowsCount, tableInfo.Schema, tableInfo.Table)
if err != nil {
return nil, err
}
if rowsCount <= rowsPerPartition {
hasRange := jobRange.Min != nil || jobRange.Max != nil
partition := models.Partition{Id: uuid.New(), HasRange: hasRange, RetryCounter: 0}
if hasRange {
var min, max int64
if jobRange.Min != nil {
min = *jobRange.Min
}
if jobRange.Max != nil {
max = *jobRange.Max
}
partition.Range = models.PartitionRange{
Min: min,
Max: max,
IsMinInclusive: jobRange.IsMinInclusive,
IsMaxInclusive: jobRange.IsMaxInclusive,
}
}
return []models.Partition{partition}, nil
return []models.Partition{{
Id: uuid.New(),
HasRange: false,
RetryCounter: 0,
}}, nil
}
partitionsCount := rowsCount / rowsPerPartition
if partitionCalculationStrategy == "ESTIMATION" {
return calculatePartitionsEstimation(ctx, tableAnalyzer, tableInfo, partitionColumn, partitionsCount, jobRange)
}
partitions, err := tableAnalyzer.CalculatePartitionRanges(ctx, tableInfo, partitionColumn, partitionsCount, jobRange)
partitions, err := tableAnalyzer.CalculatePartitionRanges(ctx, tableInfo, partitionColumn, partitionsCount)
if err != nil {
return nil, err
}
logrus.Debugf("Partitions count: %v (%s.%s)", len(partitions), tableInfo.Schema, tableInfo.Table)
return partitions, nil
}
// calculatePartitionsEstimation splits the value range of partitionColumn
// into partitionsCount consecutive partitions of equal step.
//
// Bounds given in rangeConstraint are used as-is; a missing bound is taken
// from tableAnalyzer.QueryMaxMinFromColumn, which is not called when both
// bounds are provided. The step is the range size divided by partitionsCount,
// rounded up. Only the first partition includes its minimum, every partition
// includes its maximum, and the last partition always ends at the overall
// maximum.
func calculatePartitionsEstimation(
	ctx context.Context,
	tableAnalyzer etl.TableAnalyzer,
	tableInfo config.TableInfo,
	partitionColumn string,
	partitionsCount int64,
	rangeConstraint config.RangeConfig,
) ([]models.Partition, error) {
	userMin, userMax := rangeConstraint.Min, rangeConstraint.Max

	var minValue, maxValue int64
	switch {
	case userMin != nil && userMax != nil:
		minValue, maxValue = *userMin, *userMax
		logrus.Infof("Column range for %s.%s.%s: [%d, %d] (user-defined)", tableInfo.Schema, tableInfo.Table, partitionColumn, minValue, maxValue)

	case userMin != nil:
		result, err := tableAnalyzer.QueryMaxMinFromColumn(ctx, tableInfo, partitionColumn)
		if err != nil {
			return nil, err
		}
		minValue, maxValue = *userMin, result.Max
		logrus.Infof("Column range for %s.%s.%s: [%d, %d] (min user-defined)", tableInfo.Schema, tableInfo.Table, partitionColumn, minValue, maxValue)

	case userMax != nil:
		result, err := tableAnalyzer.QueryMaxMinFromColumn(ctx, tableInfo, partitionColumn)
		if err != nil {
			return nil, err
		}
		minValue, maxValue = result.Min, *userMax
		logrus.Infof("Column range for %s.%s.%s: [%d, %d] (max user-defined)", tableInfo.Schema, tableInfo.Table, partitionColumn, minValue, maxValue)

	default:
		result, err := tableAnalyzer.QueryMaxMinFromColumn(ctx, tableInfo, partitionColumn)
		if err != nil {
			return nil, err
		}
		logrus.Infof("Column range for %s.%s.%s: [%d, %d]", tableInfo.Schema, tableInfo.Table, partitionColumn, result.Min, result.Max)
		minValue, maxValue = result.Min, result.Max
	}

	stepSize := int64(math.Ceil(float64(maxValue-minValue) / float64(partitionsCount)))
	lastIndex := partitionsCount - 1

	partitions := make([]models.Partition, 0, partitionsCount)
	for i := int64(0); i < partitionsCount; i++ {
		lower := minValue + i*stepSize
		// The last partition absorbs whatever the rounded step left over.
		upper := maxValue
		if i != lastIndex {
			upper = minValue + (i+1)*stepSize
		}

		partitions = append(partitions, models.Partition{
			Id:           uuid.New(),
			HasRange:     true,
			RetryCounter: 0,
			Range: models.PartitionRange{
				Min:            lower,
				Max:            upper,
				IsMinInclusive: i == 0,
				IsMaxInclusive: true,
			},
		})
	}

	return partitions, nil
}

View File

@@ -1,332 +0,0 @@
package table_analyzers
import (
"context"
"testing"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/etl"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
)
// MockTableAnalyzer is a test double passed wherever an etl.TableAnalyzer is
// expected. It answers from preset fields and records the range constraint
// given to CalculatePartitionRanges.
type MockTableAnalyzer struct {
	// minValue and maxValue are returned by QueryMaxMinFromColumn.
	minValue int64
	maxValue int64
	// totalRows is returned by EstimateTotalRows.
	totalRows int64
	// capturedRangeConstraint holds the last constraint seen by
	// CalculatePartitionRanges.
	capturedRangeConstraint config.RangeConfig
}

// QueryColumnTypes reports no columns and no error.
func (mock *MockTableAnalyzer) QueryColumnTypes(_ context.Context, _ config.TableInfo) ([]models.ColumnType, error) {
	return nil, nil
}

// EstimateTotalRows returns the preset totalRows.
func (mock *MockTableAnalyzer) EstimateTotalRows(_ context.Context, _ config.TableInfo) (int64, error) {
	return mock.totalRows, nil
}

// QueryMaxMinFromColumn returns the preset minValue/maxValue pair.
func (mock *MockTableAnalyzer) QueryMaxMinFromColumn(_ context.Context, _ config.TableInfo, _ string) (etl.MaxMinColumnResult, error) {
	bounds := etl.MaxMinColumnResult{
		Min: mock.minValue,
		Max: mock.maxValue,
	}
	return bounds, nil
}

// CalculatePartitionRanges stores rangeConstraint for later inspection and
// returns an empty, non-nil partition list.
func (mock *MockTableAnalyzer) CalculatePartitionRanges(_ context.Context, _ config.TableInfo, _ string, _ int64, rangeConstraint config.RangeConfig) ([]models.Partition, error) {
	mock.capturedRangeConstraint = rangeConstraint
	none := []models.Partition{}
	return none, nil
}
//go:fix inline
func ptr64(v int64) *int64 { return new(v) }
// testTableInfo is the table identity shared by the tests in this file.
var testTableInfo = config.TableInfo{Schema: "dbo", Table: "test"}
// TestCalculatePartitionsEstimation_NoOverlap verifies that four partitions
// over [0, 100] never share a boundary value that both neighbours include.
func TestCalculatePartitionsEstimation_NoOverlap(t *testing.T) {
	analyzer := &MockTableAnalyzer{minValue: 0, maxValue: 100}

	got, err := calculatePartitionsEstimation(context.Background(), analyzer, testTableInfo, "id", 4, config.RangeConfig{})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if count := len(got); count != 4 {
		t.Errorf("expected 4 partitions, got %d", count)
	}

	for idx := 1; idx < len(got); idx++ {
		prev, cur := got[idx-1].Range, got[idx].Range
		bothIncludeBoundary := prev.Max == cur.Min && prev.IsMaxInclusive && cur.IsMinInclusive
		if bothIncludeBoundary {
			t.Errorf("partition %d and %d overlap at value %d (both inclusive)", idx-1, idx, prev.Max)
		}
	}
}
// TestCalculatePartitionsEstimation_CoverageComplete verifies that the first
// partition starts (inclusively) at the column minimum and the last partition
// ends at the column maximum.
func TestCalculatePartitionsEstimation_CoverageComplete(t *testing.T) {
	analyzer := &MockTableAnalyzer{minValue: 1000, maxValue: 2000}

	got, err := calculatePartitionsEstimation(context.Background(), analyzer, testTableInfo, "id", 5, config.RangeConfig{})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	first := got[0].Range
	if first.Min != 1000 || !first.IsMinInclusive {
		t.Errorf("first partition should start at 1000 (inclusive), got %d (inclusive=%v)",
			first.Min, first.IsMinInclusive)
	}

	last := got[len(got)-1].Range
	if last.Max != 2000 {
		t.Errorf("last partition should end at 2000, got %d", last.Max)
	}
}
// TestCalculatePartitionsEstimation_FirstPartitionInclusive verifies that only
// the first partition includes its lower bound.
func TestCalculatePartitionsEstimation_FirstPartitionInclusive(t *testing.T) {
	analyzer := &MockTableAnalyzer{minValue: 50, maxValue: 70}

	got, err := calculatePartitionsEstimation(context.Background(), analyzer, testTableInfo, "id", 3, config.RangeConfig{})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	head := got[0].Range
	if !head.IsMinInclusive {
		t.Errorf("first partition should have IsMinInclusive=true")
	}
	if head.Min != 50 {
		t.Errorf("first partition should start at 50, got %d", head.Min)
	}

	for offset := range got[1:] {
		idx := offset + 1
		if got[idx].Range.IsMinInclusive {
			t.Errorf("partition %d should have IsMinInclusive=false to avoid overlap", idx)
		}
	}
}
// TestPartitionRangeGenerator_Exact_NoRange_PassesEmptyConstraint verifies
// that the EXACT strategy hands an unbounded constraint to the analyzer when
// the job defines no range.
func TestPartitionRangeGenerator_Exact_NoRange_PassesEmptyConstraint(t *testing.T) {
	analyzer := &MockTableAnalyzer{totalRows: 1000}

	if _, err := PartitionRangeGenerator(context.Background(), analyzer, testTableInfo, "id", "EXACT", 100, config.RangeConfig{}); err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	captured := analyzer.capturedRangeConstraint
	if captured.Min != nil || captured.Max != nil {
		t.Errorf("expected empty range constraint, got min=%v max=%v",
			captured.Min, captured.Max)
	}
}
// TestPartitionRangeGenerator_Exact_BothBounds_PassesBothToAnalyzer verifies
// that both user bounds and their inclusivity flags reach the analyzer.
func TestPartitionRangeGenerator_Exact_BothBounds_PassesBothToAnalyzer(t *testing.T) {
	analyzer := &MockTableAnalyzer{totalRows: 1000}
	bounds := config.RangeConfig{
		Min:            ptr64(200),
		Max:            ptr64(800),
		IsMinInclusive: true,
		IsMaxInclusive: true,
	}

	if _, err := PartitionRangeGenerator(context.Background(), analyzer, testTableInfo, "id", "EXACT", 100, bounds); err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	captured := analyzer.capturedRangeConstraint
	if captured.Min == nil || *captured.Min != 200 {
		t.Errorf("expected Min=200, got %v", captured.Min)
	}
	if captured.Max == nil || *captured.Max != 800 {
		t.Errorf("expected Max=800, got %v", captured.Max)
	}
	if !(captured.IsMinInclusive && captured.IsMaxInclusive) {
		t.Errorf("expected both bounds inclusive, got minInc=%v maxInc=%v", captured.IsMinInclusive, captured.IsMaxInclusive)
	}
}
// TestPartitionRangeGenerator_Exact_MinOnly_PassesMinNilMax verifies that a
// lower-bound-only job range reaches the analyzer with Max left unset.
func TestPartitionRangeGenerator_Exact_MinOnly_PassesMinNilMax(t *testing.T) {
	analyzer := &MockTableAnalyzer{totalRows: 1000}
	bounds := config.RangeConfig{Min: ptr64(500)}

	if _, err := PartitionRangeGenerator(context.Background(), analyzer, testTableInfo, "id", "EXACT", 100, bounds); err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	captured := analyzer.capturedRangeConstraint
	if captured.Min == nil || *captured.Min != 500 {
		t.Errorf("expected Min=500, got %v", captured.Min)
	}
	if captured.Max != nil {
		t.Errorf("expected Max=nil (no upper bound), got %v", captured.Max)
	}
}
// TestPartitionRangeGenerator_Exact_MaxOnly_PassesMaxNilMin verifies that an
// upper-bound-only job range reaches the analyzer with Min left unset.
func TestPartitionRangeGenerator_Exact_MaxOnly_PassesMaxNilMin(t *testing.T) {
	analyzer := &MockTableAnalyzer{totalRows: 1000}
	bounds := config.RangeConfig{Max: ptr64(300)}

	if _, err := PartitionRangeGenerator(context.Background(), analyzer, testTableInfo, "id", "EXACT", 100, bounds); err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	captured := analyzer.capturedRangeConstraint
	if captured.Min != nil {
		t.Errorf("expected Min=nil (no lower bound), got %v", captured.Min)
	}
	if captured.Max == nil || *captured.Max != 300 {
		t.Errorf("expected Max=300, got %v", captured.Max)
	}
}
// TestPartitionRangeGenerator_Estimation_BothBounds_UsesUserRange verifies
// that the ESTIMATION strategy spans exactly the user-supplied bounds.
func TestPartitionRangeGenerator_Estimation_BothBounds_UsesUserRange(t *testing.T) {
	// The mocked column bounds deliberately differ from the user bounds,
	// which are expected to win.
	analyzer := &MockTableAnalyzer{totalRows: 1000, minValue: 0, maxValue: 999}
	bounds := config.RangeConfig{Min: ptr64(200), Max: ptr64(700), IsMinInclusive: true}

	got, err := PartitionRangeGenerator(context.Background(), analyzer, testTableInfo, "id", "ESTIMATION", 100, bounds)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if len(got) == 0 {
		t.Fatal("expected at least one partition")
	}

	if start := got[0].Range.Min; start != 200 {
		t.Errorf("first partition should start at user min=200, got %d", start)
	}
	if end := got[len(got)-1].Range.Max; end != 700 {
		t.Errorf("last partition should end at user max=700, got %d", end)
	}
}
// TestPartitionRangeGenerator_Estimation_MinOnly_QueriesDBForMax verifies that
// a user lower bound is combined with the upper bound reported by the
// analyzer.
func TestPartitionRangeGenerator_Estimation_MinOnly_QueriesDBForMax(t *testing.T) {
	analyzer := &MockTableAnalyzer{totalRows: 1000, minValue: 0, maxValue: 999}
	bounds := config.RangeConfig{Min: ptr64(500), IsMinInclusive: true}

	got, err := PartitionRangeGenerator(context.Background(), analyzer, testTableInfo, "id", "ESTIMATION", 100, bounds)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if len(got) == 0 {
		t.Fatal("expected at least one partition")
	}

	if start := got[0].Range.Min; start != 500 {
		t.Errorf("first partition should start at user min=500, got %d", start)
	}
	if end := got[len(got)-1].Range.Max; end != 999 {
		t.Errorf("last partition should end at DB max=999, got %d", end)
	}
}
// TestPartitionRangeGenerator_Estimation_MaxOnly_QueriesDBForMin verifies that
// a user upper bound is combined with the lower bound reported by the
// analyzer.
func TestPartitionRangeGenerator_Estimation_MaxOnly_QueriesDBForMin(t *testing.T) {
	analyzer := &MockTableAnalyzer{totalRows: 1000, minValue: 100, maxValue: 999}
	bounds := config.RangeConfig{Max: ptr64(600), IsMaxInclusive: true}

	got, err := PartitionRangeGenerator(context.Background(), analyzer, testTableInfo, "id", "ESTIMATION", 100, bounds)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if len(got) == 0 {
		t.Fatal("expected at least one partition")
	}

	if start := got[0].Range.Min; start != 100 {
		t.Errorf("first partition should start at DB min=100, got %d", start)
	}
	if end := got[len(got)-1].Range.Max; end != 600 {
		t.Errorf("last partition should end at user max=600, got %d", end)
	}
}
// TestPartitionRangeGenerator_SinglePartition_NoRange verifies that a table
// with fewer rows than one partition holds yields a single range-less
// partition.
func TestPartitionRangeGenerator_SinglePartition_NoRange(t *testing.T) {
	analyzer := &MockTableAnalyzer{totalRows: 50}

	got, err := PartitionRangeGenerator(context.Background(), analyzer, testTableInfo, "id", "EXACT", 100, config.RangeConfig{})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if count := len(got); count != 1 {
		t.Fatalf("expected 1 partition, got %d", count)
	}

	if only := got[0]; only.HasRange {
		t.Error("single partition with no range should have HasRange=false")
	}
}
// TestPartitionRangeGenerator_SinglePartition_BothBounds verifies that a
// single-partition result carries the job's bounds and inclusivity flags.
func TestPartitionRangeGenerator_SinglePartition_BothBounds(t *testing.T) {
	analyzer := &MockTableAnalyzer{totalRows: 50}
	bounds := config.RangeConfig{
		Min:            ptr64(100),
		Max:            ptr64(200),
		IsMinInclusive: true,
		IsMaxInclusive: true,
	}

	got, err := PartitionRangeGenerator(context.Background(), analyzer, testTableInfo, "id", "EXACT", 100, bounds)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if count := len(got); count != 1 {
		t.Fatalf("expected 1 partition, got %d", count)
	}

	only := got[0]
	if !only.HasRange {
		t.Error("expected HasRange=true")
	}

	rng := only.Range
	if rng.Min != 100 || rng.Max != 200 {
		t.Errorf("expected [100, 200], got [%d, %d]", rng.Min, rng.Max)
	}
	if !(rng.IsMinInclusive && rng.IsMaxInclusive) {
		t.Errorf("expected both inclusive, got minInc=%v maxInc=%v", rng.IsMinInclusive, rng.IsMaxInclusive)
	}
}
// TestPartitionRangeGenerator_SinglePartition_MinOnly verifies that a
// lower-bound-only job range produces a ranged partition whose Max stays at
// the zero value.
func TestPartitionRangeGenerator_SinglePartition_MinOnly(t *testing.T) {
	analyzer := &MockTableAnalyzer{totalRows: 50}
	bounds := config.RangeConfig{Min: ptr64(100), IsMinInclusive: true}

	got, err := PartitionRangeGenerator(context.Background(), analyzer, testTableInfo, "id", "EXACT", 100, bounds)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	only := got[0]
	if !only.HasRange {
		t.Error("expected HasRange=true")
	}
	if lower := only.Range.Min; lower != 100 {
		t.Errorf("expected Min=100, got %d", lower)
	}
	if upper := only.Range.Max; upper != 0 {
		t.Errorf("expected Max=0 (no upper bound), got %d", upper)
	}
}
// TestPartitionRangeGenerator_SinglePartition_MaxOnly verifies that an
// upper-bound-only job range produces a ranged partition whose Min stays at
// the zero value.
func TestPartitionRangeGenerator_SinglePartition_MaxOnly(t *testing.T) {
	analyzer := &MockTableAnalyzer{totalRows: 50}
	bounds := config.RangeConfig{Max: ptr64(200), IsMaxInclusive: true}

	got, err := PartitionRangeGenerator(context.Background(), analyzer, testTableInfo, "id", "EXACT", 100, bounds)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	only := got[0]
	if !only.HasRange {
		t.Error("expected HasRange=true")
	}
	if lower := only.Range.Min; lower != 0 {
		t.Errorf("expected Min=0 (no lower bound), got %d", lower)
	}
	if upper := only.Range.Max; upper != 200 {
		t.Errorf("expected Max=200, got %d", upper)
	}
}

View File

@@ -39,6 +39,8 @@ JOIN sys.schemas s ON st.schema_id = s.schema_id
WHERE s.name = @schema AND st.name = @table AND (c.is_hidden = 0 OR (c.graph_type IS NOT NULL AND c.name LIKE '$%'))
ORDER BY c.column_id;`
// AND c.name NOT LIKE '$%'
type rawColumnMssql struct {
name string
userType string
@@ -196,65 +198,17 @@ GROUP BY t.name`
return rowsCount, nil
}
// QueryMaxMinFromColumn returns the smallest and largest value stored in
// columnName of the given table, using one aggregate query bounded by a
// one-minute timeout.
//
// Schema, table and column names are bracket-quoted with any closing bracket
// doubled, so a name containing "]" cannot terminate the quoting early.
//
// NOTE(review): MIN/MAX yield NULL for an empty table, which presumably makes
// the Scan into the result fields fail — confirm the field types and whether
// callers rule out empty tables first.
func (ta *MssqlTableAnalyzer) QueryMaxMinFromColumn(
	ctx context.Context,
	tableInfo config.TableInfo,
	columnName string,
) (etl.MaxMinColumnResult, error) {
	quote := func(identifier string) string {
		return "[" + strings.ReplaceAll(identifier, "]", "]]") + "]"
	}

	column := quote(columnName)
	query := fmt.Sprintf(`
SELECT
MIN(%s) AS min_value,
MAX(%s) AS max_value
FROM %s.%s`, column, column, quote(tableInfo.Schema), quote(tableInfo.Table))

	ctxTimeout, cancel := context.WithTimeout(ctx, 1*time.Minute)
	defer cancel()

	result := etl.MaxMinColumnResult{}
	if err := ta.db.QueryRow(ctxTimeout, query).Scan(&result.Min, &result.Max); err != nil {
		return etl.MaxMinColumnResult{}, fmt.Errorf("querying min/max of %s.%s.%s: %w", tableInfo.Schema, tableInfo.Table, columnName, err)
	}

	return result, nil
}
func (ta *MssqlTableAnalyzer) CalculatePartitionRanges(
ctx context.Context,
tableInfo config.TableInfo,
partitionColumn string,
maxPartitions int64,
rangeConstraint config.RangeConfig,
) ([]models.Partition, error) {
whereClause := ""
args := []any{sql.Named("maxPartitions", maxPartitions)}
if rangeConstraint.Min != nil || rangeConstraint.Max != nil {
var conditions []string
if rangeConstraint.Min != nil {
minOp := ">"
if rangeConstraint.IsMinInclusive {
minOp = ">="
}
conditions = append(conditions, fmt.Sprintf("[%s] %s @rangeMin", partitionColumn, minOp))
args = append(args, sql.Named("rangeMin", *rangeConstraint.Min))
}
if rangeConstraint.Max != nil {
maxOp := "<"
if rangeConstraint.IsMaxInclusive {
maxOp = "<="
}
conditions = append(conditions, fmt.Sprintf("[%s] %s @rangeMax", partitionColumn, maxOp))
args = append(args, sql.Named("rangeMax", *rangeConstraint.Max))
}
whereClause = "WHERE " + strings.Join(conditions, " AND ")
}
query := fmt.Sprintf(`
SELECT
MIN([%s]) AS lower_limit,
MAX([%s]) AS upper_limit
FROM (SELECT [%s], NTILE(@maxPartitions) OVER (ORDER BY [%s]) AS batch_id FROM [%s].[%s] %s) AS T
FROM (SELECT [%s], NTILE(@maxPartitions) OVER (ORDER BY [%s]) AS batch_id FROM [%s].[%s]) AS T
GROUP BY batch_id
ORDER BY batch_id`,
partitionColumn,
@@ -262,13 +216,12 @@ ORDER BY batch_id`,
partitionColumn,
partitionColumn,
tableInfo.Schema,
tableInfo.Table,
whereClause)
tableInfo.Table)
ctxTimeout, cancel := context.WithTimeout(ctx, 1*time.Minute)
defer cancel()
rows, err := ta.db.Query(ctxTimeout, query, args...)
rows, err := ta.db.Query(ctxTimeout, query, sql.Named("maxPartitions", maxPartitions))
if err != nil {
return nil, err
}
@@ -283,7 +236,6 @@ ORDER BY batch_id`,
RetryCounter: 0,
Range: models.PartitionRange{
IsMinInclusive: true,
IsMaxInclusive: true,
},
}

View File

@@ -164,20 +164,11 @@ func (ta *PostgresTableAnalyzer) EstimateTotalRows(
return 0, nil
}
// QueryMaxMinFromColumn is a placeholder: it runs no query and always returns
// a zero-valued result with a nil error.
func (ta *PostgresTableAnalyzer) QueryMaxMinFromColumn(
	ctx context.Context,
	tableInfo config.TableInfo,
	columnName string,
) (etl.MaxMinColumnResult, error) {
	var empty etl.MaxMinColumnResult
	return empty, nil
}
// CalculatePartitionRanges is a placeholder: it ignores its arguments and
// always returns an empty, non-nil partition list with a nil error.
func (ta *PostgresTableAnalyzer) CalculatePartitionRanges(
	ctx context.Context,
	tableInfo config.TableInfo,
	partitionColumn string,
	maxPartitions int64,
	rangeConstraint config.RangeConfig,
) ([]models.Partition, error) {
	none := []models.Partition{}
	return none, nil
}

View File

@@ -1,119 +0,0 @@
package transformers
import (
"context"
"errors"
"sync"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/custom_errors"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
"github.com/google/uuid"
)
// batchAccumulator gathers the rows of several incoming batches so they can be
// re-emitted as one larger batch that remembers which batches it came from.
type batchAccumulator struct {
	// batchSize is the row count at which ready starts reporting true.
	batchSize int
	// rows holds the rows collected since the last successful flush.
	rows []models.UnknownRowValues
	// parents references every batch whose rows are currently held.
	parents []models.BatchRef
}

// add appends the rows of batch and records the batch as a parent.
func (acc *batchAccumulator) add(batch models.Batch) {
	acc.parents = append(acc.parents, models.BatchRef{Id: batch.Id})
	acc.rows = append(acc.rows, batch.Rows...)
}

// ready reports whether at least batchSize rows have been collected.
func (acc *batchAccumulator) ready() bool {
	return !(len(acc.rows) < acc.batchSize)
}

// flush sends the collected rows to chOut as a single new batch and resets the
// accumulator. It returns true when nothing was pending or the batch was
// sent, and false when ctx ended before the send completed.
//
// wg is incremented before the send attempt and decremented again if the send
// is abandoned.
func (acc *batchAccumulator) flush(ctx context.Context, chOut chan<- models.Batch, wg *sync.WaitGroup) bool {
	if len(acc.rows) == 0 {
		return true
	}

	merged := models.Batch{
		Id:            uuid.New(),
		ParentBatches: acc.parents,
		Rows:          acc.rows,
	}

	wg.Add(1)
	select {
	case <-ctx.Done():
		wg.Done()
		return false
	case chOut <- merged:
		acc.rows, acc.parents = nil, nil
		return true
	}
}
// sendTransformError reports err on ch as a custom_errors.JobError.
//
// Context cancellation and deadline errors are dropped silently. An error that
// already is a *custom_errors.JobError is forwarded as is; any other error is
// wrapped in a job-cancelling "Transformation failed" error. The send is
// abandoned if ctx ends first.
func sendTransformError(ctx context.Context, err error, ch chan<- custom_errors.JobError) {
	switch {
	case errors.Is(err, context.Canceled), errors.Is(err, context.DeadlineExceeded):
		return
	}

	report := custom_errors.JobError{ShouldCancelJob: true, Msg: "Transformation failed", Prev: err}
	if typed, ok := errors.AsType[*custom_errors.JobError](err); ok {
		report = *typed
	}

	select {
	case <-ctx.Done():
	case ch <- report:
	}
}
// Consume reads batches from chBatchesIn, applies the column and storage
// transformation plans, and forwards the result on chBatchesOut until the
// input channel closes or ctx ends.
//
// With batchSize <= 0 every batch is forwarded unchanged in shape. Otherwise
// rows are accumulated and emitted as merged batches once batchSize rows are
// available, with a final flush when the input closes. wgActiveBatches is
// incremented for every batch handed to chBatchesOut. A transformation failure
// is reported on chJobErrorsOut and stops consumption.
func (mssqlTr *MssqlTransformer) Consume(
	ctx context.Context,
	columns []models.ColumnType,
	retryConfig config.RetryConfig,
	batchSize int,
	chBatchesIn <-chan models.Batch,
	chBatchesOut chan<- models.Batch,
	chJobErrorsOut chan<- custom_errors.JobError,
	wgActiveBatches *sync.WaitGroup,
) {
	plan := computeTransformationPlan(columns)
	plan = append(plan, computeStorageTransformationPlan(ctx, mssqlTr.azureClient, mssqlTr.toStorage, columns, mssqlTr.sourceTable)...)

	passthrough := batchSize <= 0
	pending := &batchAccumulator{batchSize: batchSize}

	for {
		var (
			batch models.Batch
			open  bool
		)
		select {
		case <-ctx.Done():
			return
		case batch, open = <-chBatchesIn:
		}

		if !open {
			// Input exhausted: emit whatever is still buffered.
			pending.flush(ctx, chBatchesOut, wgActiveBatches)
			return
		}

		if len(plan) > 0 {
			if err := ProcessBatchWithRetries(ctx, &batch, plan, retryConfig); err != nil {
				sendTransformError(ctx, err, chJobErrorsOut)
				return
			}
		}

		if passthrough {
			wgActiveBatches.Add(1)
			select {
			case <-ctx.Done():
				wgActiveBatches.Done()
				return
			case chBatchesOut <- batch:
			}
			continue
		}

		pending.add(batch)
		if pending.ready() && !pending.flush(ctx, chBatchesOut, wgActiveBatches) {
			return
		}
	}
}

View File

@@ -1,545 +0,0 @@
package transformers
import (
"context"
"errors"
"sync"
"testing"
"time"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/custom_errors"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
"github.com/google/uuid"
)
// testTimeout bounds how long a test waits on a channel before failing.
const testTimeout = 2 * time.Second
// makeBatch builds a batch with a fresh ID and numRows single-value rows, row
// i holding the value i.
func makeBatch(numRows int) models.Batch {
	batch := models.Batch{
		Id:   uuid.New(),
		Rows: make([]models.UnknownRowValues, numRows),
	}
	for idx := range batch.Rows {
		batch.Rows[idx] = models.UnknownRowValues{idx}
	}
	return batch
}
// noRetry returns a retry configuration with Attempts set to 1.
func noRetry() config.RetryConfig {
	var cfg config.RetryConfig
	cfg.Attempts = 1
	return cfg
}
// newTransformer returns a zero-valued MssqlTransformer for tests.
func newTransformer() *MssqlTransformer {
	return new(MssqlTransformer)
}
// uuidColumn describes a "uniqueidentifier" column named col_uuid, used by the
// tests that need a column with a transformation attached.
func uuidColumn() models.ColumnType {
	column := models.NewColumnType("col_uuid", false, false, "uniqueidentifier", "uniqueidentifier", "string", false, 0, 0, 0)
	return column
}
// runConsume starts tr.Consume in its own goroutine with the noRetry
// configuration and returns a channel that is closed once Consume returns.
func runConsume(
	ctx context.Context,
	tr *MssqlTransformer,
	columns []models.ColumnType,
	batchSize int,
	chIn <-chan models.Batch,
	chOut chan<- models.Batch,
	chErr chan<- custom_errors.JobError,
	wg *sync.WaitGroup,
) <-chan struct{} {
	finished := make(chan struct{})
	go func() {
		defer close(finished)
		tr.Consume(ctx, columns, noRetry(), batchSize, chIn, chOut, chErr, wg)
	}()
	return finished
}
// drainOut collects every batch that is immediately available on chOut without
// blocking, calling wg.Done once per batch received.
func drainOut(chOut <-chan models.Batch, wg *sync.WaitGroup) []models.Batch {
	var drained []models.Batch
	for {
		select {
		case next := <-chOut:
			drained = append(drained, next)
			wg.Done()
		default:
			// Nothing buffered any more.
			return drained
		}
	}
}
func TestBatchAccumulator_Add(t *testing.T) {
acc := &batchAccumulator{batchSize: 5}
b1 := makeBatch(2)
b2 := makeBatch(3)
acc.add(b1)
acc.add(b2)
if len(acc.rows) != 5 {
t.Errorf("expected 5 rows, got %d", len(acc.rows))
}
if len(acc.parents) != 2 {
t.Fatalf("expected 2 parents, got %d", len(acc.parents))
}
if acc.parents[0].Id != b1.Id || acc.parents[1].Id != b2.Id {
t.Error("parent IDs do not match source batch IDs")
}
}
// TestBatchAccumulator_Ready verifies that ready flips to true exactly when
// the collected row count reaches batchSize.
func TestBatchAccumulator_Ready(t *testing.T) {
	acc := &batchAccumulator{batchSize: 3}

	acc.add(makeBatch(2))
	if isReady := acc.ready(); isReady {
		t.Error("should not be ready with 2 rows and batchSize=3")
	}

	acc.add(makeBatch(1))
	if isReady := acc.ready(); !isReady {
		t.Error("should be ready with 3 rows and batchSize=3")
	}
}
func TestBatchAccumulator_Flush_Empty(t *testing.T) {
acc := &batchAccumulator{batchSize: 5}
chOut := make(chan models.Batch, 1)
var wg sync.WaitGroup
if !acc.flush(context.Background(), chOut, &wg) {
t.Error("flush on empty accumulator should return true")
}
if len(chOut) != 0 {
t.Error("flush on empty accumulator should send nothing")
}
}
func TestBatchAccumulator_Flush_Success(t *testing.T) {
acc := &batchAccumulator{batchSize: 2}
b := makeBatch(2)
acc.add(b)
chOut := make(chan models.Batch, 1)
var wg sync.WaitGroup
if !acc.flush(context.Background(), chOut, &wg) {
t.Fatal("flush should return true on success")
}
select {
case out := <-chOut:
wg.Done()
if len(out.Rows) != 2 {
t.Errorf("expected 2 rows in flushed batch, got %d", len(out.Rows))
}
if len(out.ParentBatches) != 1 || out.ParentBatches[0].Id != b.Id {
t.Error("flushed batch should reference the source batch as parent")
}
default:
t.Error("expected a batch in chOut after flush")
}
if len(acc.rows) != 0 || len(acc.parents) != 0 {
t.Error("accumulator state should be reset after flush")
}
wg.Wait()
}
// TestBatchAccumulator_Flush_ContextCancelled verifies that flush gives up and
// leaves the wait group balanced when the context is already cancelled and
// nobody receives from the output channel.
func TestBatchAccumulator_Flush_ContextCancelled(t *testing.T) {
	acc := &batchAccumulator{batchSize: 2}
	acc.add(makeBatch(2))

	var wg sync.WaitGroup
	unbuffered := make(chan models.Batch)

	cancelled, cancel := context.WithCancel(context.Background())
	cancel()

	if flushed := acc.flush(cancelled, unbuffered, &wg); flushed {
		t.Error("flush should return false when context is cancelled")
	}

	wg.Wait()
}
// TestSendTransformError_PlainError verifies that an ordinary error is
// reported as a job-cancelling JobError.
func TestSendTransformError_PlainError(t *testing.T) {
	reports := make(chan custom_errors.JobError, 1)

	sendTransformError(context.Background(), errors.New("something broke"), reports)

	select {
	case report := <-reports:
		if !report.ShouldCancelJob {
			t.Error("plain error should produce ShouldCancelJob=true")
		}
	default:
		t.Error("expected a job error in the channel")
	}
}
// TestSendTransformError_JobError_Passthrough verifies that an error that is
// already a *JobError is forwarded with its own fields.
func TestSendTransformError_JobError_Passthrough(t *testing.T) {
	reports := make(chan custom_errors.JobError, 1)
	custom := &custom_errors.JobError{ShouldCancelJob: false, Msg: "custom msg"}

	sendTransformError(context.Background(), custom, reports)

	select {
	case report := <-reports:
		if report.ShouldCancelJob || report.Msg != "custom msg" {
			t.Errorf("JobError should pass through unchanged, got %+v", report)
		}
	default:
		t.Error("expected a job error in the channel")
	}
}
// TestSendTransformError_ContextCancelled_Silent verifies that
// context.Canceled is not reported.
func TestSendTransformError_ContextCancelled_Silent(t *testing.T) {
	reports := make(chan custom_errors.JobError, 1)
	cancelled, cancel := context.WithCancel(context.Background())
	cancel()

	sendTransformError(cancelled, context.Canceled, reports)

	if queued := len(reports); queued != 0 {
		t.Error("context.Canceled should be silently dropped")
	}
}
// TestSendTransformError_DeadlineExceeded_Silent verifies that
// context.DeadlineExceeded is not reported.
func TestSendTransformError_DeadlineExceeded_Silent(t *testing.T) {
	reports := make(chan custom_errors.JobError, 1)
	cancelled, cancel := context.WithCancel(context.Background())
	cancel()

	sendTransformError(cancelled, context.DeadlineExceeded, reports)

	if queued := len(reports); queued != 0 {
		t.Error("context.DeadlineExceeded should be silently dropped")
	}
}
func TestConsume_Passthrough_PreservesOriginalBatch(t *testing.T) {
tr := newTransformer()
chIn := make(chan models.Batch, 1)
chOut := make(chan models.Batch, 1)
chErr := make(chan custom_errors.JobError, 1)
var wg sync.WaitGroup
batch := makeBatch(3)
chIn <- batch
close(chIn)
done := runConsume(context.Background(), tr, nil, 0, chIn, chOut, chErr, &wg)
select {
case got := <-chOut:
wg.Done()
if got.Id != batch.Id {
t.Error("passthrough should preserve the original batch ID")
}
if len(got.Rows) != 3 {
t.Errorf("expected 3 rows, got %d", len(got.Rows))
}
case <-time.After(testTimeout):
t.Fatal("timeout waiting for output batch")
}
<-done
wg.Wait()
}
func TestConsume_Passthrough_WaitGroupBalanced(t *testing.T) {
tr := newTransformer()
chIn := make(chan models.Batch, 3)
chOut := make(chan models.Batch, 3)
chErr := make(chan custom_errors.JobError, 1)
var wg sync.WaitGroup
for range 3 {
chIn <- makeBatch(1)
}
close(chIn)
done := runConsume(context.Background(), tr, nil, 0, chIn, chOut, chErr, &wg)
<-done
batches := drainOut(chOut, &wg)
if len(batches) != 3 {
t.Errorf("expected 3 output batches, got %d", len(batches))
}
wg.Wait()
}
func TestConsume_Accumulation_FlushOnThreshold(t *testing.T) {
tr := newTransformer()
chIn := make(chan models.Batch, 3)
chOut := make(chan models.Batch, 2)
chErr := make(chan custom_errors.JobError, 1)
var wg sync.WaitGroup
for range 3 {
chIn <- makeBatch(1)
}
close(chIn)
done := runConsume(context.Background(), tr, nil, 3, chIn, chOut, chErr, &wg)
<-done
batches := drainOut(chOut, &wg)
if len(batches) != 1 {
t.Fatalf("expected 1 accumulated batch, got %d", len(batches))
}
if len(batches[0].Rows) != 3 {
t.Errorf("expected 3 rows in accumulated batch, got %d", len(batches[0].Rows))
}
wg.Wait()
}
func TestConsume_Accumulation_FlushOnClose(t *testing.T) {
tr := newTransformer()
chIn := make(chan models.Batch, 2)
chOut := make(chan models.Batch, 2)
chErr := make(chan custom_errors.JobError, 1)
var wg sync.WaitGroup
chIn <- makeBatch(1)
chIn <- makeBatch(1)
close(chIn)
done := runConsume(context.Background(), tr, nil, 10, chIn, chOut, chErr, &wg)
<-done
batches := drainOut(chOut, &wg)
if len(batches) != 1 {
t.Fatalf("expected 1 batch flushed on close, got %d", len(batches))
}
if len(batches[0].Rows) != 2 {
t.Errorf("expected 2 rows, got %d", len(batches[0].Rows))
}
wg.Wait()
}
func TestConsume_Accumulation_TracksAllParentBatches(t *testing.T) {
tr := newTransformer()
chIn := make(chan models.Batch, 2)
chOut := make(chan models.Batch, 2)
chErr := make(chan custom_errors.JobError, 1)
var wg sync.WaitGroup
b1 := makeBatch(1)
b2 := makeBatch(1)
chIn <- b1
chIn <- b2
close(chIn)
done := runConsume(context.Background(), tr, nil, 10, chIn, chOut, chErr, &wg)
<-done
batches := drainOut(chOut, &wg)
if len(batches) != 1 {
t.Fatalf("expected 1 output batch, got %d", len(batches))
}
parents := batches[0].ParentBatches
if len(parents) != 2 {
t.Fatalf("expected 2 parent refs, got %d", len(parents))
}
if parents[0].Id != b1.Id || parents[1].Id != b2.Id {
t.Error("parent IDs should match source batch IDs in order")
}
wg.Wait()
}
func TestConsume_Accumulation_MultipleFlushes(t *testing.T) {
tr := newTransformer()
chIn := make(chan models.Batch, 5)
chOut := make(chan models.Batch, 5)
chErr := make(chan custom_errors.JobError, 1)
var wg sync.WaitGroup
for range 5 {
chIn <- makeBatch(1)
}
close(chIn)
done := runConsume(context.Background(), tr, nil, 2, chIn, chOut, chErr, &wg)
<-done
batches := drainOut(chOut, &wg)
if len(batches) != 3 {
t.Fatalf("expected 3 output batches (2+2+1 rows), got %d", len(batches))
}
totalRows := 0
for _, b := range batches {
totalRows += len(b.Rows)
}
if totalRows != 5 {
t.Errorf("expected 5 total rows across all batches, got %d", totalRows)
}
wg.Wait()
}
func TestConsume_EmptyInput_NoOutput(t *testing.T) {
tr := newTransformer()
chIn := make(chan models.Batch)
chOut := make(chan models.Batch, 1)
chErr := make(chan custom_errors.JobError, 1)
var wg sync.WaitGroup
close(chIn)
done := runConsume(context.Background(), tr, nil, 5, chIn, chOut, chErr, &wg)
select {
case <-done:
case <-time.After(testTimeout):
t.Fatal("timeout: Consume did not exit after empty input channel was closed")
}
if len(chOut) != 0 {
t.Error("expected no output for empty input")
}
wg.Wait()
}
func TestConsume_TransformError_SendsJobError(t *testing.T) {
tr := newTransformer()
col := uuidColumn()
chIn := make(chan models.Batch, 1)
chOut := make(chan models.Batch, 1)
chErr := make(chan custom_errors.JobError, 1)
var wg sync.WaitGroup
batch := models.Batch{
Id: uuid.New(),
Rows: []models.UnknownRowValues{{[]byte{1, 2, 3}}},
}
chIn <- batch
done := runConsume(context.Background(), tr, []models.ColumnType{col}, 0, chIn, chOut, chErr, &wg)
select {
case err := <-chErr:
if !err.ShouldCancelJob {
t.Error("transform error should set ShouldCancelJob=true")
}
case <-time.After(testTimeout):
t.Fatal("timeout: expected a job error from transform failure")
}
<-done
wg.Wait()
}
func TestConsume_TransformError_NoOutputForwarded(t *testing.T) {
tr := newTransformer()
col := uuidColumn()
chIn := make(chan models.Batch, 1)
chOut := make(chan models.Batch, 1)
chErr := make(chan custom_errors.JobError, 1)
var wg sync.WaitGroup
batch := models.Batch{
Id: uuid.New(),
Rows: []models.UnknownRowValues{{[]byte{1, 2, 3}}},
}
chIn <- batch
done := runConsume(context.Background(), tr, []models.ColumnType{col}, 0, chIn, chOut, chErr, &wg)
<-done
if len(chOut) != 0 {
t.Error("no batch should be forwarded when transformation fails")
}
wg.Wait()
}
func TestConsume_ContextCancellation_Exits(t *testing.T) {
tr := newTransformer()
chIn := make(chan models.Batch)
chOut := make(chan models.Batch, 1)
chErr := make(chan custom_errors.JobError, 1)
var wg sync.WaitGroup
ctx, cancel := context.WithCancel(context.Background())
done := runConsume(ctx, tr, nil, 0, chIn, chOut, chErr, &wg)
cancel()
select {
case <-done:
case <-time.After(testTimeout):
t.Fatal("timeout: Consume did not exit after context cancellation")
}
wg.Wait()
}
func TestConsume_Transform_DatetimeConvertedToUTC(t *testing.T) {
tr := newTransformer()
col := models.NewColumnType("col_dt", false, false, "datetime", "datetime", "timestamp", false, 0, 0, 0)
chIn := make(chan models.Batch, 1)
chOut := make(chan models.Batch, 1)
chErr := make(chan custom_errors.JobError, 1)
var wg sync.WaitGroup
nonUTC := time.Date(2024, 1, 15, 12, 0, 0, 0, time.FixedZone("EST", -5*3600))
batch := models.Batch{
Id: uuid.New(),
Rows: []models.UnknownRowValues{{nonUTC}},
}
chIn <- batch
close(chIn)
done := runConsume(context.Background(), tr, []models.ColumnType{col}, 0, chIn, chOut, chErr, &wg)
<-done
select {
case got := <-chOut:
wg.Done()
result, ok := got.Rows[0][0].(time.Time)
if !ok {
t.Fatal("expected time.Time in output row")
}
if result.Location() != time.UTC {
t.Errorf("expected UTC location after transform, got %v", result.Location())
}
default:
t.Error("expected an output batch")
}
wg.Wait()
}
func TestConsume_Transform_NilValueSkipped(t *testing.T) {
tr := newTransformer()
col := uuidColumn()
chIn := make(chan models.Batch, 1)
chOut := make(chan models.Batch, 1)
chErr := make(chan custom_errors.JobError, 1)
var wg sync.WaitGroup
batch := models.Batch{
Id: uuid.New(),
Rows: []models.UnknownRowValues{{nil}},
}
chIn <- batch
close(chIn)
done := runConsume(context.Background(), tr, []models.ColumnType{col}, 0, chIn, chOut, chErr, &wg)
<-done
select {
case got := <-chOut:
wg.Done()
if got.Rows[0][0] != nil {
t.Error("nil value should pass through unchanged")
}
default:
t.Error("expected an output batch even when value is nil")
}
if len(chErr) != 0 {
t.Error("nil value should not produce an error")
}
wg.Wait()
}

View File

@@ -1,21 +1,110 @@
package transformers
import (
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/azure"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
"context"
"errors"
"sync"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/custom_errors"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/etl"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
)
type MssqlTransformer struct {
toStorage config.ToStorageConfig
sourceTable config.SourceTableInfo
azureClient *azure.Client
type MssqlTransformer struct{}
func NewMssqlTransformer() etl.Transformer {
return &MssqlTransformer{}
}
func NewMssqlTransformer(toStorage config.ToStorageConfig, sourceTable config.SourceTableInfo, azureClient *azure.Client) etl.Transformer {
return &MssqlTransformer{
toStorage: toStorage,
sourceTable: sourceTable,
azureClient: azureClient,
// computeTransformationPlan returns the per-column transformation plan for
// columns. As written it ignores its argument and always returns an empty,
// non-nil plan.
func computeTransformationPlan(columns []models.ColumnType) []etl.ColumnTransformPlan {
	return []etl.ColumnTransformPlan{}
}
const processBatchCtxCheck = 4096
// ProcessBatch runs every step of transformationPlan over each row of batch,
// overwriting the targeted cells in place.
//
// Nil cells are left untouched. The context is polled once every
// processBatchCtxCheck rows (including row 0); if it has been cancelled its
// error is returned. The first error returned by a step's Fn aborts the batch
// and is returned unchanged; rows and cells processed before that point keep
// their transformed values.
func (mssqlTr *MssqlTransformer) ProcessBatch(
	ctx context.Context,
	batch *models.Batch,
	transformationPlan []etl.ColumnTransformPlan,
) error {
	for rowIdx := range batch.Rows {
		if rowIdx%processBatchCtxCheck == 0 {
			if cancelErr := ctx.Err(); cancelErr != nil {
				return cancelErr
			}
		}

		row := batch.Rows[rowIdx]
		for stepIdx := range transformationPlan {
			step := transformationPlan[stepIdx]
			if cell := row[step.Index]; cell != nil {
				converted, err := step.Fn(cell)
				if err != nil {
					return err
				}
				row[step.Index] = converted
			}
		}
	}
	return nil
}
// Exec is the transformer worker loop. It reads batches from chBatchesIn,
// applies the transformation plan computed from columns, and forwards each
// batch on chBatchesOut.
//
// The loop ends when ctx is cancelled, when chBatchesIn is closed, or after a
// transformation error. A transformation error that is not the context's own
// error is reported once on chJobErrorsOut with ShouldCancelJob set.
//
// Every forwarded batch is counted on wgActiveBatches. The counter is
// incremented BEFORE the batch is handed over: whoever receives the batch may
// call Done immediately, and incrementing afterwards could drive the counter
// negative (which panics) or let a concurrent Wait return too early. If the
// hand-over is abandoned because of cancellation the increment is undone.
func (mssqlTr *MssqlTransformer) Exec(
	ctx context.Context,
	columns []models.ColumnType,
	chBatchesIn <-chan models.Batch,
	chBatchesOut chan<- models.Batch,
	chJobErrorsOut chan<- custom_errors.JobError,
	wgActiveBatches *sync.WaitGroup,
) {
	transformationPlan := computeTransformationPlan(columns)

	for {
		// select picks randomly among ready cases, so check cancellation
		// first to avoid pulling more work after the job was cancelled.
		if ctx.Err() != nil {
			return
		}

		select {
		case <-ctx.Done():
			return

		case batch, ok := <-chBatchesIn:
			if !ok {
				return
			}

			// With an empty plan the batch is forwarded untouched.
			if len(transformationPlan) > 0 {
				if err := mssqlTr.ProcessBatch(ctx, &batch, transformationPlan); err != nil {
					// errors.Is(err, nil) is false for a non-nil err, so this
					// only matches when the context itself was cancelled.
					if errors.Is(err, ctx.Err()) {
						return
					}

					select {
					case chJobErrorsOut <- custom_errors.JobError{ShouldCancelJob: true, Msg: "Transformation failed", Prev: err}:
					case <-ctx.Done():
					}
					return
				}
			}

			wgActiveBatches.Add(1)
			select {
			case chBatchesOut <- batch:
			case <-ctx.Done():
				// The batch was never delivered; nobody will call Done for it.
				wgActiveBatches.Done()
				return
			}
		}
	}
}

View File

@@ -1,122 +0,0 @@
package transformers
import (
"context"
"fmt"
"path"
"strings"
"time"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/azure"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/custom_errors"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/etl"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
"github.com/google/uuid"
"github.com/sirupsen/logrus"
)
// computeTransformationPlan inspects the system type of every source column
// and returns one plan entry for each column that needs a value conversion:
//
//   - "uniqueidentifier": []byte values go through mssqlUuidToBigEndian
//   - "geometry", "geography": []byte values go through wkbToEwkbWithSrid
//     with the fixed SRID 4326
//   - "datetime", "datetime2": time.Time values go through ensureUTC
//
// Values of any other dynamic type are passed through unchanged. Columns of
// other system types get no entry, so the result is nil when nothing matches.
func computeTransformationPlan(columns []models.ColumnType) []etl.ColumnTransformPlan {
	convertUUID := func(value any) (any, error) {
		raw, isBytes := value.([]byte)
		if !isBytes || raw == nil {
			return value, nil
		}
		return mssqlUuidToBigEndian(raw)
	}

	convertSpatial := func(value any) (any, error) {
		raw, isBytes := value.([]byte)
		if !isBytes || raw == nil {
			return value, nil
		}
		return wkbToEwkbWithSrid(raw, 4326)
	}

	convertTimestamp := func(value any) (any, error) {
		stamp, isTime := value.(time.Time)
		if !isTime {
			return value, nil
		}
		return ensureUTC(stamp), nil
	}

	var steps []etl.ColumnTransformPlan
	for position, column := range columns {
		var convert func(any) (any, error)

		switch column.SystemType() {
		case "uniqueidentifier":
			convert = convertUUID
		case "geometry", "geography":
			convert = convertSpatial
		case "datetime", "datetime2":
			convert = convertTimestamp
		default:
			continue
		}

		steps = append(steps, etl.ColumnTransformPlan{Index: position, Fn: convert})
	}
	return steps
}
// computeStorageTransformationPlan builds the plan entries that replace
// binary column values with the URL of an uploaded blob.
//
// It returns nil when azureClient is nil or toStorage lists no columns.
// Configured columns whose mode is not "REFERENCE_ONLY", or whose source name
// is not present in sourceColumns (compared case-insensitively), are skipped
// with a warning.
//
// Each resulting Fn passes nil through as nil, passes non-[]byte values
// through with a warning, and otherwise uploads the bytes under
// <prefix>/<random UUID> using the ctx given here, returning the blob URL.
// Upload failures are returned as *custom_errors.JobError.
func computeStorageTransformationPlan(
	ctx context.Context,
	azureClient *azure.Client,
	toStorage config.ToStorageConfig,
	sourceColumns []models.ColumnType,
	sourceTable config.SourceTableInfo,
) []etl.ColumnTransformPlan {
	if len(toStorage.Columns) == 0 || azureClient == nil {
		return nil
	}

	// Upper-cased column name -> position in the source row.
	positionByName := make(map[string]int, len(sourceColumns))
	for position, column := range sourceColumns {
		positionByName[strings.ToUpper(column.Name())] = position
	}

	var steps []etl.ColumnTransformPlan
	for _, target := range toStorage.Columns {
		if target.Mode != "REFERENCE_ONLY" {
			logrus.Warnf("to_storage: unsupported mode %q for column %s — skipping", target.Mode, target.Source)
			continue
		}

		position, found := positionByName[strings.ToUpper(target.Source)]
		if !found {
			logrus.Warnf("to_storage: source column %q not found in source schema — skipping", target.Source)
			continue
		}

		columnName, schemaName, tableName := target.Source, sourceTable.Schema, sourceTable.Table

		uploadValue := func(value any) (any, error) {
			if value == nil {
				return nil, nil
			}

			payload, isBytes := value.([]byte)
			if !isBytes {
				logrus.Warnf("to_storage: expected []byte for %s.%s.%s, got %T — passing through", schemaName, tableName, columnName, value)
				return value, nil
			}

			// A fresh UUID per value keeps blob names unique under the prefix.
			blobURL, err := azureClient.UploadAndGetURL(ctx, path.Join(target.Prefix, uuid.New().String()), payload)
			if err != nil {
				return nil, &custom_errors.JobError{
					Msg:  fmt.Sprintf("Error uploading %s.%s.%s", schemaName, tableName, columnName),
					Prev: err,
				}
			}
			return blobURL, nil
		}

		steps = append(steps, etl.ColumnTransformPlan{Index: position, Fn: uploadValue})
	}
	return steps
}

View File

@@ -1,73 +0,0 @@
package transformers
import (
"context"
"errors"
"time"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/custom_errors"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/etl"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
)
const processBatchCtxCheck = 4096
// ProcessBatchWithRetries applies every step of transformationPlan to each
// row of batch, overwriting the targeted cells in place, and retries a failing
// step according to retryConfig.
//
// Behaviour:
//   - Nil cells are skipped.
//   - The context is polled once every processBatchCtxCheck rows and while
//     waiting between retries; cancellation returns the context's error.
//   - A step is tried at least once. A non-positive retryConfig.Attempts is
//     treated as 1; previously it skipped the step entirely and made the
//     function return nil with the batch left untransformed.
//   - A *custom_errors.JobError with ShouldCancelJob set is returned
//     immediately without further retries.
//   - After the last failed attempt the last error is returned.
func ProcessBatchWithRetries(
	ctx context.Context,
	batch *models.Batch,
	transformationPlan []etl.ColumnTransformPlan,
	retryConfig config.RetryConfig,
) error {
	attempts := retryConfig.Attempts
	if attempts < 1 {
		attempts = 1
	}

	for i, rowValues := range batch.Rows {
		if i%processBatchCtxCheck == 0 {
			if err := ctx.Err(); err != nil {
				return err
			}
		}

		for _, task := range transformationPlan {
			val := rowValues[task.Index]
			if val == nil {
				continue
			}

			var lastErr error
			for attempt := 0; attempt < attempts; attempt++ {
				transformed, err := task.Fn(val)
				if err == nil {
					rowValues[task.Index] = transformed
					lastErr = nil
					break
				}
				lastErr = err

				if jobError, ok := errors.AsType[*custom_errors.JobError](err); ok && jobError.ShouldCancelJob {
					return jobError
				}

				// No point in backing off after the final attempt.
				if attempt == attempts-1 {
					break
				}

				delay := custom_errors.ComputeBackoffDelay(
					attempt,
					retryConfig.BaseDelayMs,
					retryConfig.MaxDelayMs,
					retryConfig.MaxJitterMs,
				)

				// Wait out the backoff, but give up as soon as the job is
				// cancelled instead of sleeping through the cancellation.
				timer := time.NewTimer(delay)
				select {
				case <-ctx.Done():
					timer.Stop()
					return ctx.Err()
				case <-timer.C:
				}
			}

			if lastErr != nil {
				return lastErr
			}
		}
	}
	return nil
}

View File

@@ -0,0 +1 @@
package transformers

View File

@@ -9,6 +9,31 @@ import (
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
)
// Extractor is the contract for the extraction stage of the pipeline.
//
// NOTE(review): only the signatures are visible here; the per-method notes
// below are read off parameter names and types and should be confirmed
// against an implementation.
type Extractor interface {
	// ProcessPartition handles one partition of the table described by
	// tableInfo and has chBatchesOut available to emit batches. The int
	// result is presumably a row count — TODO confirm.
	ProcessPartition(
		ctx context.Context,
		tableInfo config.SourceTableInfo,
		columns []models.ColumnType,
		batchSize int,
		partition models.Partition,
		indexPrimaryKey int,
		chBatchesOut chan<- models.Batch,
	) (int, error)
	// Exec is the channel-driven entry point: partitions arrive on
	// chPartitionsIn, batches leave on chBatchesOut, and failures are
	// reported on chErrorsOut / chJobErrorsOut. wgActivePartitions and
	// rowsRead are shared with the caller; presumably they track in-flight
	// partitions and the running row total — TODO confirm.
	Exec(
		ctx context.Context,
		tableInfo config.SourceTableInfo,
		columns []models.ColumnType,
		batchSize int,
		chPartitionsIn <-chan models.Partition,
		chBatchesOut chan<- models.Batch,
		chErrorsOut chan<- custom_errors.ExtractorError,
		chJobErrorsOut chan<- custom_errors.JobError,
		wgActivePartitions *sync.WaitGroup,
		rowsRead *int64,
	)
}
type TransformerFunc func(any) (any, error)
type ColumnTransformPlan struct {
@@ -17,21 +42,40 @@ type ColumnTransformPlan struct {
}
type Transformer interface {
Consume(
ProcessBatch(
ctx context.Context,
batch *models.Batch,
transformationPlan []ColumnTransformPlan,
) error
Exec(
ctx context.Context,
columns []models.ColumnType,
retryConfig config.RetryConfig,
batchSize int,
chBatchesIn <-chan models.Batch,
chBatchesOut chan<- models.Batch,
chBactchesOut chan<- models.Batch,
chJobErrorsOut chan<- custom_errors.JobError,
wgActiveBatches *sync.WaitGroup,
)
}
type MaxMinColumnResult struct {
Max int64
Min int64
// Loader is the contract for the load stage of the pipeline.
//
// NOTE(review): only the signatures are visible here; the per-method notes
// below are read off parameter names and types and should be confirmed
// against an implementation.
type Loader interface {
	// ProcessBatch handles a single batch for the target table described by
	// tableInfo, using colNames as the column list. The int result is
	// presumably the number of rows written — TODO confirm.
	ProcessBatch(
		ctx context.Context,
		tableInfo config.TargetTableInfo,
		colNames []string,
		batch models.Batch,
	) (int, error)
	// Exec is the channel-driven entry point: batches arrive on chBatchesIn
	// and failures are reported on chErrorsOut / chJobErrorsOut.
	// wgActiveBatches and rowsLoaded are shared with the caller; presumably
	// they track in-flight batches and the running row total — TODO confirm.
	Exec(
		ctx context.Context,
		tableInfo config.TargetTableInfo,
		columns []models.ColumnType,
		chBatchesIn <-chan models.Batch,
		chErrorsOut chan<- custom_errors.LoaderError,
		chJobErrorsOut chan<- custom_errors.JobError,
		wgActiveBatches *sync.WaitGroup,
		rowsLoaded *int64,
	)
}
type TableAnalyzer interface {
@@ -45,17 +89,10 @@ type TableAnalyzer interface {
tableInfo config.TableInfo,
) (int64, error)
QueryMaxMinFromColumn(
ctx context.Context,
tableInfo config.TableInfo,
columnName string,
) (MaxMinColumnResult, error)
CalculatePartitionRanges(
ctx context.Context,
tableInfo config.TableInfo,
partitionColumn string,
maxPartitions int64,
rangeConstraint config.RangeConfig,
) ([]models.Partition, error)
}

View File

@@ -8,14 +8,9 @@ import (
type UnknownRowValues = []any
type BatchRef struct {
Id uuid.UUID
PartitionId uuid.UUID
}
type Batch struct {
Id uuid.UUID
ParentBatches []BatchRef
PartitionId uuid.UUID
Rows []UnknownRowValues
RetryCounter int
}

View File

@@ -1,44 +0,0 @@
package main
import (
"context"
"fmt"
"log"
"math/rand"
"sync"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/azure"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
)
// main is a manual smoke test for the Azure storage client: it uploads ten
// small text blobs concurrently to the configured container and prints the
// outcome of each upload. It exits immediately if the client cannot be built.
func main() {
	storageCfg := config.App.AzureStorage
	container := storageCfg.Container

	client, err := azure.NewClient(storageCfg)
	if err != nil {
		log.Fatalf("Error creando cliente: %v", err)
	}

	ctx := context.Background()
	var pending sync.WaitGroup

	// uploadOne pushes a single blob named after id and reports the result.
	uploadOne := func(id int) {
		defer pending.Done()

		name := fmt.Sprintf("%sarchivo-%d.txt", storageCfg.Prefix, id)
		body := fmt.Sprintf("Contenido aleatorio: %d", rand.Intn(100000))

		if uploadErr := client.UploadBuffer(ctx, container, name, []byte(body)); uploadErr != nil {
			log.Printf("Fallo al subir %s: %v", name, uploadErr)
			return
		}
		fmt.Printf("Subido exitosamente: %s\n", name)
	}

	for id := 1; id <= 10; id++ {
		pending.Add(1)
		go uploadOne(id)
	}
	pending.Wait()
}

View File

@@ -1,8 +1,6 @@
package main
import (
"flag"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
log "github.com/sirupsen/logrus"
)
@@ -10,18 +8,7 @@ import (
func main() {
log.SetLevel(log.DebugLevel)
configPath := flag.String("config", "", "path to migration config file")
flag.Parse()
if flag.NArg() > 1 {
log.Fatalf("only one config file path is allowed")
}
if *configPath == "" && flag.NArg() == 1 {
*configPath = flag.Arg(0)
}
migrationConfig, err := config.ReadMigrationConfig(*configPath)
migrationConfig, err := config.ReadMigrationConfig()
if err != nil {
log.Fatalf("error leyendo configuracion: %v", err)
}

View File

@@ -12,9 +12,8 @@ import (
)
const (
// totalRows int = 1_000_000
totalRows int = 1000
chunkSize int = 200
totalRows int = 1_000_000
chunkSize int = 50_000
queueSize int = 4
)
@@ -41,14 +40,6 @@ func main() {
seedManzanas(ctx, db)
})
wgSeed.Go(func() {
seedPuertos(ctx, db)
})
wgSeed.Go(func() {
seedSiteHolderAttach(ctx, db)
})
wgSeed.Wait()
}

View File

@@ -1,227 +0,0 @@
package main
import (
"bytes"
"context"
"database/sql"
"fmt"
"math/rand"
"sync"
"time"
"github.com/google/uuid"
log "github.com/sirupsen/logrus"
)
// siteHolderAttachJob names the schema and table that the SITE_HOLDER__ATTACH
// seeding functions in this file query and load into.
var siteHolderAttachJob = MigrationJob{
	Schema: "Infraestructura",
	Table:  "SITE_HOLDER__ATTACH",
}
// seedSiteHolderAttach generates totalRows synthetic SITE_HOLDER__ATTACH rows
// and loads them into the database, continuing after the current maximum
// GDB_ARCHIVE_OID.
//
// Rows are produced by a background generator goroutine and streamed to the
// loader through a channel of capacity queueSize. The generator runs on a
// context derived from ctx so that it can be stopped if loading fails;
// without that, a failed load would leave the generator blocked forever on a
// channel nobody reads.
//
// Failing to read the current maximum OID terminates the process via
// log.Fatal. A load failure is returned wrapped.
func seedSiteHolderAttach(ctx context.Context, db *sql.DB) error {
	maxOid, err := getMaxGDBArchiveOidForAttach(ctx, db)
	if err != nil {
		log.Fatal("Error getting max GDB_ARCHIVE_OID: ", err)
	}

	log.Infof("Starting SITE_HOLDER__ATTACH data generation from GDB_ARCHIVE_OID: %d", maxOid+1)

	genCtx, stopGenerator := context.WithCancel(ctx)
	defer stopGenerator()

	rowsChan := make(chan []UnknownRowValues, queueSize)

	var wgRowGenerator sync.WaitGroup
	wgRowGenerator.Go(func() {
		generateSiteHolderAttachRows(genCtx, maxOid, totalRows, chunkSize, rowsChan)
	})

	columns := []string{
		"GDB_ARCHIVE_OID",
		"REL_GLOBALID",
		"CONTENT_TYPE",
		"ATT_NAME",
		"DATA_SIZE",
		"DATA",
		"GLOBALID",
		"GDB_FROM_DATE",
		"GDB_TO_DATE",
		"ATTACHMENTID",
	}

	if err := loadRowsMssql(ctx, siteHolderAttachJob, columns, db, rowsChan); err != nil {
		// Unblock the generator and wait for it so no goroutine outlives
		// this call.
		stopGenerator()
		wgRowGenerator.Wait()
		return fmt.Errorf("Error loading rows (SITE_HOLDER__ATTACH): %w", err)
	}

	log.Info("Data generation and loading completed successfully (SITE_HOLDER__ATTACH)")
	wgRowGenerator.Wait()

	return nil
}
// getMaxGDBArchiveOidForAttach returns the largest GDB_ARCHIVE_OID currently
// stored in the table named by siteHolderAttachJob.
//
// It returns 0 when the scan yields sql.ErrNoRows or a NULL value. Any other
// query or scan error is returned as-is.
func getMaxGDBArchiveOidForAttach(ctx context.Context, db *sql.DB) (int, error) {
	query := fmt.Sprintf(`
SELECT ISNULL(MAX(GDB_ARCHIVE_OID), 0)
FROM [%s].[%s]
`, siteHolderAttachJob.Schema, siteHolderAttachJob.Table)

	var highest sql.NullInt64
	switch err := db.QueryRowContext(ctx, query).Scan(&highest); {
	case err != nil && err != sql.ErrNoRows:
		return 0, err
	case !highest.Valid:
		return 0, nil
	default:
		return int(highest.Int64), nil
	}
}
// generateSiteHolderAttachRows produces totalRows rows with consecutive
// GDB_ARCHIVE_OID values starting at startOid+1 and sends them on out in
// chunks of chunkSize rows; a final shorter chunk carries any remainder.
//
// out is always closed on return. If ctx is cancelled while a full chunk is
// waiting to be sent, generation stops immediately. A sample row is logged
// every 100,000 generated rows.
func generateSiteHolderAttachRows(
	ctx context.Context,
	startOid int,
	totalRows int,
	chunkSize int,
	out chan<- []UnknownRowValues,
) {
	defer close(out)

	// emit hands one chunk to the consumer and logs it with sentFormat, or
	// reports false when ctx is cancelled before the chunk could be sent.
	emit := func(chunk []UnknownRowValues, sentFormat string) bool {
		select {
		case out <- chunk:
			log.Debugf(sentFormat, len(chunk))
			return true
		case <-ctx.Done():
			log.Info("Context cancelled, stopping SITE_HOLDER__ATTACH row generation")
			return false
		}
	}

	generated := 0
	pending := make([]UnknownRowValues, 0, chunkSize)

	for offset := 1; offset <= totalRows; offset++ {
		row := generateSiteHolderAttachRow(startOid + offset)
		pending = append(pending, row)
		generated = offset

		if len(pending) == chunkSize {
			if !emit(pending, "Sent SITE_HOLDER__ATTACH chunk with %d rows") {
				return
			}
			// The sent slice now belongs to the consumer; start a new one.
			pending = make([]UnknownRowValues, 0, chunkSize)
		}

		if generated%100_000 == 0 {
			logSiteHolderAttachSampleRow(generated, row)
		}
	}

	if len(pending) > 0 {
		emit(pending, "Sent final SITE_HOLDER__ATTACH chunk with %d rows")
	}

	log.Infof("Finished generating %d SITE_HOLDER__ATTACH rows", generated)
}
// generateSiteHolderAttachRow builds one synthetic SITE_HOLDER__ATTACH row for
// the given GDB_ARCHIVE_OID.
//
// The values are returned in this order: GDB_ARCHIVE_OID, REL_GLOBALID,
// CONTENT_TYPE, ATT_NAME, DATA_SIZE, DATA, GLOBALID, GDB_FROM_DATE,
// GDB_TO_DATE, ATTACHMENTID. Both IDs are freshly generated binary UUIDs,
// DATA_SIZE is the length of DATA, GDB_FROM_DATE is drawn between the end of
// 2020 and the end of 2025, GDB_TO_DATE is fixed at 9999-12-31T23:59:59Z and
// ATTACHMENTID is a random integer in [1, 10000].
func generateSiteHolderAttachRow(gdbArchiveOid int) UnknownRowValues {
	// The inputs are well-formed constants, so parse errors are ignored.
	parseInstant := func(value string) time.Time {
		parsed, _ := time.Parse(time.RFC3339, value)
		return parsed
	}
	// MarshalBinary's error is discarded, as in the rest of the seed code.
	newBinaryUUID := func() []byte {
		raw, _ := uuid.New().MarshalBinary()
		return raw
	}

	fromWindowStart := parseInstant("2020-12-31T23:59:59Z")
	fromWindowEnd := parseInstant("2025-12-31T23:59:59Z")
	openEndedTo := parseInstant("9999-12-31T23:59:59Z")

	relGlobalID := newBinaryUUID()
	contentType := generateRandomContentType()
	attName := generateRandomAttachmentName()
	payload := generateRandomBinaryContent()
	globalID := newBinaryUUID()
	validFrom := generateRandomTimestamp(fromWindowStart, fromWindowEnd)
	attachmentID := rand.Intn(10000) + 1

	return UnknownRowValues{
		gdbArchiveOid,
		relGlobalID,
		contentType,
		attName,
		len(payload),
		payload,
		globalID,
		validFrom,
		openEndedTo,
		attachmentID,
	}
}
// generateRandomContentType returns one MIME type picked uniformly at random
// from a fixed list of eight text, document and image types.
func generateRandomContentType() string {
	choices := [...]string{
		"text/plain",
		"application/pdf",
		"image/jpeg",
		"image/png",
		"application/msword",
		"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
		"text/csv",
		"application/json",
	}
	pick := rand.Intn(len(choices))
	return choices[pick]
}
// generateRandomAttachmentName returns a file name made of a 20-character
// random stem (from generateRandomString) followed by one extension picked
// at random from a fixed list.
func generateRandomAttachmentName() string {
	suffixes := [...]string{".txt", ".pdf", ".jpg", ".png", ".doc", ".docx", ".csv", ".json"}
	stem := generateRandomString(20)
	return stem + suffixes[rand.Intn(len(suffixes))]
}
func generateRandomBinaryContent() []byte {
sizeOptions := []int{100, 500, 1000, 5000, 10000, 50000, 100000}
size := sizeOptions[rand.Intn(len(sizeOptions))]
var buf bytes.Buffer
lineCount := rand.Intn(size/50) + 1
for range lineCount {
line := generateRandomString(rand.Intn(80) + 20)
buf.WriteString(line)
buf.WriteString("\n")
}
for buf.Len() < size {
randomText := generateRandomString(rand.Intn(100) + 50)
buf.WriteString(randomText)
buf.WriteString("\n")
}
result := buf.Bytes()
if len(result) > size {
result = result[:size]
}
return result
}
// logSiteHolderAttachSampleRow logs a readable summary of one generated row
// at info level. id is the running row number shown in the header.
//
// The binary columns are not dumped: the two UUID columns are shown as
// placeholders and DATA (index 5) only by its byte length. The function
// panics if rowValues[5] is not a []byte or the row has fewer than ten values.
func logSiteHolderAttachSampleRow(id int, rowValues UnknownRowValues) {
	const sampleFormat = `
Sample SITE_HOLDER__ATTACH row #%d:
GDB_ARCHIVE_OID: %v
REL_GLOBALID: [binary UUID]
CONTENT_TYPE: %v
ATT_NAME: %v
DATA_SIZE: %v
DATA: [%d bytes of binary content]
GLOBALID: [binary UUID]
GDB_FROM_DATE: %v
GDB_TO_DATE: %v
ATTACHMENTID: %v
`

	payload := rowValues[5].([]byte)

	var (
		archiveOid   = rowValues[0]
		contentType  = rowValues[2]
		attName      = rowValues[3]
		dataSize     = rowValues[4]
		fromDate     = rowValues[7]
		toDate       = rowValues[8]
		attachmentID = rowValues[9]
	)

	log.Infof(sampleFormat,
		id,
		archiveOid,
		contentType,
		attName,
		dataSize,
		len(payload),
		fromDate,
		toDate,
		attachmentID,
	)
}